From 792001f4f7aa036b1f1c1ed7bce44bb49126208a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 18 Dec 2020 15:56:12 -0800 Subject: libbpf: Add user-space variants of BPF_CORE_READ() family of macros Add BPF_CORE_READ_USER(), BPF_CORE_READ_USER_STR() and their _INTO() variations to allow reading CO-RE-relocatable kernel data structures from the user-space. One of such cases is reading input arguments of syscalls, while reaping the benefits of CO-RE relocations w.r.t. handling 32/64 bit conversions and handling missing/new fields in UAPI data structs. Suggested-by: Gilad Reti Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20201218235614.2284956-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf_core_read.h | 98 ++++++++++++++++++++++++++----------------- 1 file changed, 59 insertions(+), 39 deletions(-) diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h index bbcefb3ff5a5..db0c735ceb53 100644 --- a/tools/lib/bpf/bpf_core_read.h +++ b/tools/lib/bpf/bpf_core_read.h @@ -195,17 +195,20 @@ enum bpf_enum_value_kind { * (local) BTF, used to record relocation. */ #define bpf_core_read(dst, sz, src) \ - bpf_probe_read_kernel(dst, sz, \ - (const void *)__builtin_preserve_access_index(src)) + bpf_probe_read_kernel(dst, sz, (const void *)__builtin_preserve_access_index(src)) +#define bpf_core_read_user(dst, sz, src) \ + bpf_probe_read_user(dst, sz, (const void *)__builtin_preserve_access_index(src)) /* * bpf_core_read_str() is a thin wrapper around bpf_probe_read_str() * additionally emitting BPF CO-RE field relocation for specified source * argument. */ #define bpf_core_read_str(dst, sz, src) \ - bpf_probe_read_kernel_str(dst, sz, \ - (const void *)__builtin_preserve_access_index(src)) + bpf_probe_read_kernel_str(dst, sz, (const void *)__builtin_preserve_access_index(src)) + +#define bpf_core_read_user_str(dst, sz, src) \ + bpf_probe_read_user_str(dst, sz, (const void *)__builtin_preserve_access_index(src)) #define ___concat(a, b) a ## b #define ___apply(fn, n) ___concat(fn, n) @@ -264,30 +267,29 @@ enum bpf_enum_value_kind { read_fn((void *)(dst), sizeof(*(dst)), &((src_type)(src))->accessor) /* "recursively" read a sequence of inner pointers using local __t var */ -#define ___rd_first(src, a) ___read(bpf_core_read, &__t, ___type(src), src, a); -#define ___rd_last(...) \ - ___read(bpf_core_read, &__t, \ - ___type(___nolast(__VA_ARGS__)), __t, ___last(__VA_ARGS__)); -#define ___rd_p1(...) const void *__t; ___rd_first(__VA_ARGS__) -#define ___rd_p2(...) ___rd_p1(___nolast(__VA_ARGS__)) ___rd_last(__VA_ARGS__) -#define ___rd_p3(...) ___rd_p2(___nolast(__VA_ARGS__)) ___rd_last(__VA_ARGS__) -#define ___rd_p4(...) ___rd_p3(___nolast(__VA_ARGS__)) ___rd_last(__VA_ARGS__) -#define ___rd_p5(...) ___rd_p4(___nolast(__VA_ARGS__)) ___rd_last(__VA_ARGS__) -#define ___rd_p6(...) ___rd_p5(___nolast(__VA_ARGS__)) ___rd_last(__VA_ARGS__) -#define ___rd_p7(...) ___rd_p6(___nolast(__VA_ARGS__)) ___rd_last(__VA_ARGS__) -#define ___rd_p8(...) ___rd_p7(___nolast(__VA_ARGS__)) ___rd_last(__VA_ARGS__) -#define ___rd_p9(...) ___rd_p8(___nolast(__VA_ARGS__)) ___rd_last(__VA_ARGS__) -#define ___read_ptrs(src, ...) \ - ___apply(___rd_p, ___narg(__VA_ARGS__))(src, __VA_ARGS__) - -#define ___core_read0(fn, dst, src, a) \ +#define ___rd_first(fn, src, a) ___read(fn, &__t, ___type(src), src, a); +#define ___rd_last(fn, ...) \ + ___read(fn, &__t, ___type(___nolast(__VA_ARGS__)), __t, ___last(__VA_ARGS__)); +#define ___rd_p1(fn, ...) const void *__t; ___rd_first(fn, __VA_ARGS__) +#define ___rd_p2(fn, ...) ___rd_p1(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p3(fn, ...) ___rd_p2(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p4(fn, ...) ___rd_p3(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p5(fn, ...) ___rd_p4(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p6(fn, ...) ___rd_p5(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p7(fn, ...) ___rd_p6(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p8(fn, ...) ___rd_p7(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p9(fn, ...) ___rd_p8(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___read_ptrs(fn, src, ...) \ + ___apply(___rd_p, ___narg(__VA_ARGS__))(fn, src, __VA_ARGS__) + +#define ___core_read0(fn, fn_ptr, dst, src, a) \ ___read(fn, dst, ___type(src), src, a); -#define ___core_readN(fn, dst, src, ...) \ - ___read_ptrs(src, ___nolast(__VA_ARGS__)) \ +#define ___core_readN(fn, fn_ptr, dst, src, ...) \ + ___read_ptrs(fn_ptr, src, ___nolast(__VA_ARGS__)) \ ___read(fn, dst, ___type(src, ___nolast(__VA_ARGS__)), __t, \ ___last(__VA_ARGS__)); -#define ___core_read(fn, dst, src, a, ...) \ - ___apply(___core_read, ___empty(__VA_ARGS__))(fn, dst, \ +#define ___core_read(fn, fn_ptr, dst, src, a, ...) \ + ___apply(___core_read, ___empty(__VA_ARGS__))(fn, fn_ptr, dst, \ src, a, ##__VA_ARGS__) /* @@ -295,20 +297,32 @@ enum bpf_enum_value_kind { * BPF_CORE_READ(), in which final field is read into user-provided storage. * See BPF_CORE_READ() below for more details on general usage. */ -#define BPF_CORE_READ_INTO(dst, src, a, ...) \ - ({ \ - ___core_read(bpf_core_read, dst, (src), a, ##__VA_ARGS__) \ - }) +#define BPF_CORE_READ_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_core_read, bpf_core_read, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* Variant of BPF_CORE_READ_INTO() for reading from user-space memory */ +#define BPF_CORE_READ_USER_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_core_read_user, bpf_core_read_user, \ + dst, (src), a, ##__VA_ARGS__) \ +}) /* * BPF_CORE_READ_STR_INTO() does same "pointer chasing" as * BPF_CORE_READ() for intermediate pointers, but then executes (and returns * corresponding error code) bpf_core_read_str() for final string read. */ -#define BPF_CORE_READ_STR_INTO(dst, src, a, ...) \ - ({ \ - ___core_read(bpf_core_read_str, dst, (src), a, ##__VA_ARGS__)\ - }) +#define BPF_CORE_READ_STR_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_core_read_str, bpf_core_read, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* Variant of BPF_CORE_READ_STR_INTO() for reading from user-space memory */ +#define BPF_CORE_READ_USER_STR_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_core_read_user_str, bpf_core_read_user, \ + dst, (src), a, ##__VA_ARGS__) \ +}) /* * BPF_CORE_READ() is used to simplify BPF CO-RE relocatable read, especially @@ -334,12 +348,18 @@ enum bpf_enum_value_kind { * N.B. Only up to 9 "field accessors" are supported, which should be more * than enough for any practical purpose. */ -#define BPF_CORE_READ(src, a, ...) \ - ({ \ - ___type((src), a, ##__VA_ARGS__) __r; \ - BPF_CORE_READ_INTO(&__r, (src), a, ##__VA_ARGS__); \ - __r; \ - }) +#define BPF_CORE_READ(src, a, ...) ({ \ + ___type((src), a, ##__VA_ARGS__) __r; \ + BPF_CORE_READ_INTO(&__r, (src), a, ##__VA_ARGS__); \ + __r; \ +}) + +/* Variant of BPF_CORE_READ() for reading from user-space memory */ +#define BPF_CORE_READ_USER(src, a, ...) ({ \ + ___type((src), a, ##__VA_ARGS__) __r; \ + BPF_CORE_READ_USER_INTO(&__r, (src), a, ##__VA_ARGS__); \ + __r; \ +}) #endif -- cgit v1.2.3 From a4b09a9ef9451b09d87550720f8db3ae280b3eea Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 18 Dec 2020 15:56:13 -0800 Subject: libbpf: Add non-CO-RE variants of BPF_CORE_READ() macro family BPF_CORE_READ(), in addition to handling CO-RE relocations, also allows much nicer way to read data structures with nested pointers. Instead of writing a sequence of bpf_probe_read() calls to follow links, one can just write BPF_CORE_READ(a, b, c, d) to effectively do a->b->c->d read. This is a welcome ability when porting BCC code, which (in most cases) allows exactly the intuitive a->b->c->d variant. This patch adds non-CO-RE variants of BPF_CORE_READ() family of macros for cases where CO-RE is not supported (e.g., old kernels). In such cases, the property of shortening a sequence of bpf_probe_read()s to a simple BPF_PROBE_READ(a, b, c, d) invocation is still desirable, especially when porting BCC code to libbpf. Yet, no CO-RE relocation is going to be emitted. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20201218235614.2284956-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf_core_read.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h index db0c735ceb53..9456aabcb03a 100644 --- a/tools/lib/bpf/bpf_core_read.h +++ b/tools/lib/bpf/bpf_core_read.h @@ -308,6 +308,18 @@ enum bpf_enum_value_kind { dst, (src), a, ##__VA_ARGS__) \ }) +/* Non-CO-RE variant of BPF_CORE_READ_INTO() */ +#define BPF_PROBE_READ_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_probe_read, bpf_probe_read, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* Non-CO-RE variant of BPF_CORE_READ_USER_INTO() */ +#define BPF_PROBE_READ_USER_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_probe_read_user, bpf_probe_read_user, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + /* * BPF_CORE_READ_STR_INTO() does same "pointer chasing" as * BPF_CORE_READ() for intermediate pointers, but then executes (and returns @@ -324,6 +336,18 @@ enum bpf_enum_value_kind { dst, (src), a, ##__VA_ARGS__) \ }) +/* Non-CO-RE variant of BPF_CORE_READ_STR_INTO() */ +#define BPF_PROBE_READ_STR_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_probe_read_str, bpf_probe_read, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* Non-CO-RE variant of BPF_CORE_READ_USER_STR_INTO() */ +#define BPF_PROBE_READ_USER_STR_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_probe_read_user_str, bpf_probe_read_user, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + /* * BPF_CORE_READ() is used to simplify BPF CO-RE relocatable read, especially * when there are few pointer chasing steps. @@ -361,5 +385,19 @@ enum bpf_enum_value_kind { __r; \ }) +/* Non-CO-RE variant of BPF_CORE_READ() */ +#define BPF_PROBE_READ(src, a, ...) ({ \ + ___type((src), a, ##__VA_ARGS__) __r; \ + BPF_PROBE_READ_INTO(&__r, (src), a, ##__VA_ARGS__); \ + __r; \ +}) + +/* Non-CO-RE variant of BPF_CORE_READ_USER() */ +#define BPF_PROBE_READ_USER(src, a, ...) ({ \ + ___type((src), a, ##__VA_ARGS__) __r; \ + BPF_PROBE_READ_USER_INTO(&__r, (src), a, ##__VA_ARGS__); \ + __r; \ +}) + #endif -- cgit v1.2.3 From 9e80114b1a271803767d4c5baa11ea9e8678aac3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 18 Dec 2020 15:56:14 -0800 Subject: selftests/bpf: Add tests for user- and non-CO-RE BPF_CORE_READ() variants Add selftests validating that newly added variations of BPF_CORE_READ(), for use with user-space addresses and for non-CO-RE reads, work as expected. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20201218235614.2284956-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/core_read_macros.c | 64 ++++++++++++++++++++++ .../selftests/bpf/progs/test_core_read_macros.c | 50 +++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/core_read_macros.c create mode 100644 tools/testing/selftests/bpf/progs/test_core_read_macros.c diff --git a/tools/testing/selftests/bpf/prog_tests/core_read_macros.c b/tools/testing/selftests/bpf/prog_tests/core_read_macros.c new file mode 100644 index 000000000000..96f5cf3c6fa2 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/core_read_macros.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include + +struct callback_head { + struct callback_head *next; + void (*func)(struct callback_head *head); +}; + +/* ___shuffled flavor is just an illusion for BPF code, it doesn't really + * exist and user-space needs to provide data in the memory layout that + * matches callback_head. We just defined ___shuffled flavor to make it easier + * to work with the skeleton + */ +struct callback_head___shuffled { + struct callback_head___shuffled *next; + void (*func)(struct callback_head *head); +}; + +#include "test_core_read_macros.skel.h" + +void test_core_read_macros(void) +{ + int duration = 0, err; + struct test_core_read_macros* skel; + struct test_core_read_macros__bss *bss; + struct callback_head u_probe_in; + struct callback_head___shuffled u_core_in; + + skel = test_core_read_macros__open_and_load(); + if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) + return; + bss = skel->bss; + bss->my_pid = getpid(); + + /* next pointers have to be set from the kernel side */ + bss->k_probe_in.func = (void *)(long)0x1234; + bss->k_core_in.func = (void *)(long)0xabcd; + + u_probe_in.next = &u_probe_in; + u_probe_in.func = (void *)(long)0x5678; + bss->u_probe_in = &u_probe_in; + + u_core_in.next = &u_core_in; + u_core_in.func = (void *)(long)0xdbca; + bss->u_core_in = &u_core_in; + + err = test_core_read_macros__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + goto cleanup; + + /* trigger tracepoint */ + usleep(1); + + ASSERT_EQ(bss->k_probe_out, 0x1234, "k_probe_out"); + ASSERT_EQ(bss->k_core_out, 0xabcd, "k_core_out"); + + ASSERT_EQ(bss->u_probe_out, 0x5678, "u_probe_out"); + ASSERT_EQ(bss->u_core_out, 0xdbca, "u_core_out"); + +cleanup: + test_core_read_macros__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_core_read_macros.c b/tools/testing/selftests/bpf/progs/test_core_read_macros.c new file mode 100644 index 000000000000..fd54caa17319 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_core_read_macros.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +/* shuffled layout for relocatable (CO-RE) reads */ +struct callback_head___shuffled { + void (*func)(struct callback_head___shuffled *head); + struct callback_head___shuffled *next; +}; + +struct callback_head k_probe_in = {}; +struct callback_head___shuffled k_core_in = {}; + +struct callback_head *u_probe_in = 0; +struct callback_head___shuffled *u_core_in = 0; + +long k_probe_out = 0; +long u_probe_out = 0; + +long k_core_out = 0; +long u_core_out = 0; + +int my_pid = 0; + +SEC("raw_tracepoint/sys_enter") +int handler(void *ctx) +{ + int pid = bpf_get_current_pid_tgid() >> 32; + + if (my_pid != pid) + return 0; + + /* next pointers for kernel address space have to be initialized from + * BPF side, user-space mmaped addresses are stil user-space addresses + */ + k_probe_in.next = &k_probe_in; + __builtin_preserve_access_index(({k_core_in.next = &k_core_in;})); + + k_probe_out = (long)BPF_PROBE_READ(&k_probe_in, next, next, func); + k_core_out = (long)BPF_CORE_READ(&k_core_in, next, next, func); + u_probe_out = (long)BPF_PROBE_READ_USER(u_probe_in, next, next, func); + u_core_out = (long)BPF_CORE_READ_USER(u_core_in, next, next, func); + + return 0; +} -- cgit v1.2.3 From 619775c3cfd2bc8559abc4395bf7d85b72bd723f Mon Sep 17 00:00:00 2001 From: Leah Neukirchen Date: Wed, 16 Dec 2020 11:03:06 +0100 Subject: bpf: Remove unnecessary include from preload/iterators This program does not use argp (which is a glibcism). Instead include directly, which was pulled in by . Signed-off-by: Leah Neukirchen Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201216100306.30942-1-leah@vuxu.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/preload/iterators/iterators.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c index b7ff87939172..5d872a705470 100644 --- a/kernel/bpf/preload/iterators/iterators.c +++ b/kernel/bpf/preload/iterators/iterators.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ -#include +#include #include #include #include -- cgit v1.2.3 From ec24e11e0817404ef9e04b50170e1a68793cd9f5 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Tue, 29 Dec 2020 21:48:34 +0800 Subject: bpf: Replace fput with sockfd_put in sock map The function sockfd_lookup uses fget on the value that is stored in the file field of the returned structure, so fput should ultimately be applied to this value. This can be done directly, but it seems better to use the specific macro sockfd_put, which does the same thing. The cleanup was done using the following semantic patch: (http://www.emn.fr/x-info/coccinelle/) // @@ expression s; @@ s = sockfd_lookup(...) ... + sockfd_put(s); ?- fput(s->file); // Signed-off-by: Zheng Yongjun Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20201229134834.22962-1-zhengyongjun3@huawei.com Signed-off-by: Alexei Starovoitov --- net/core/sock_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 64b5ec14ff50..d758fb83c884 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -602,7 +602,7 @@ int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, ret = sock_hash_update_common(map, key, sk, flags); sock_map_sk_release(sk); out: - fput(sock->file); + sockfd_put(sock); return ret; } -- cgit v1.2.3 From 43b5169d8355ccf26d726fbc75f083b2429113e4 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 22 Dec 2020 22:09:28 +0100 Subject: net, xdp: Introduce xdp_init_buff utility routine Introduce xdp_init_buff utility routine to initialize xdp_buff fields const over NAPI iterations (e.g. frame_sz or rxq pointer). Rely on xdp_init_buff in all XDP capable drivers. Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Reviewed-by: Alexander Duyck Acked-by: Jesper Dangaard Brouer Acked-by: John Fastabend Acked-by: Shay Agroskin Acked-by: Martin Habets Acked-by: Camelia Groza Acked-by: Marcin Wojtas Link: https://lore.kernel.org/bpf/7f8329b6da1434dc2b05a77f2e800b29628a8913.1608670965.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 3 +-- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 4 ++-- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 4 ++-- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 4 ++-- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 8 ++++---- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 6 +++--- drivers/net/ethernet/intel/ice/ice_txrx.c | 6 +++--- drivers/net/ethernet/intel/igb/igb_main.c | 6 +++--- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 7 +++---- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 7 +++---- drivers/net/ethernet/marvell/mvneta.c | 3 +-- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 8 +++++--- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 3 +-- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 3 +-- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 4 ++-- drivers/net/ethernet/qlogic/qede/qede_fp.c | 3 +-- drivers/net/ethernet/sfc/rx.c | 3 +-- drivers/net/ethernet/socionext/netsec.c | 3 +-- drivers/net/ethernet/ti/cpsw.c | 4 ++-- drivers/net/ethernet/ti/cpsw_new.c | 4 ++-- drivers/net/hyperv/netvsc_bpf.c | 3 +-- drivers/net/tun.c | 7 +++---- drivers/net/veth.c | 8 ++++---- drivers/net/virtio_net.c | 6 ++---- drivers/net/xen-netfront.c | 4 ++-- include/net/xdp.h | 7 +++++++ net/bpf/test_run.c | 4 ++-- net/core/dev.c | 8 ++++---- 28 files changed, 68 insertions(+), 72 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 06596fa1f9fe..43331f6967c9 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -1634,8 +1634,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, "%s qid %d\n", __func__, rx_ring->qid); res_budget = budget; - xdp.rxq = &rx_ring->xdp_rxq; - xdp.frame_sz = ENA_PAGE_SIZE; + xdp_init_buff(&xdp, ENA_PAGE_SIZE, &rx_ring->xdp_rxq); do { xdp_verdict = XDP_PASS; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index fcc262064766..ab805d6750e5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -133,12 +133,12 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, dma_sync_single_for_cpu(&pdev->dev, mapping + offset, *len, bp->rx_dir); txr = rxr->bnapi->tx_ring; + /* BNXT_RX_PAGE_MODE(bp) when XDP enabled */ + xdp_init_buff(&xdp, PAGE_SIZE, &rxr->xdp_rxq); xdp.data_hard_start = *data_ptr - offset; xdp.data = *data_ptr; xdp_set_data_meta_invalid(&xdp); xdp.data_end = *data_ptr + *len; - xdp.rxq = &rxr->xdp_rxq; - xdp.frame_sz = PAGE_SIZE; /* BNXT_RX_PAGE_MODE(bp) when XDP enabled */ orig_data = xdp.data; rcu_read_lock(); diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index f3b7b443f964..9fc672f075f2 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -547,12 +547,12 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog, cpu_addr = (u64)phys_to_virt(cpu_addr); page = virt_to_page((void *)cpu_addr); + xdp_init_buff(&xdp, RCV_FRAG_LEN + XDP_PACKET_HEADROOM, + &rq->xdp_rxq); xdp.data_hard_start = page_address(page); xdp.data = (void *)cpu_addr; xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; - xdp.rxq = &rq->xdp_rxq; - xdp.frame_sz = RCV_FRAG_LEN + XDP_PACKET_HEADROOM; orig_data = xdp.data; rcu_read_lock(); diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 4360ce4d3fb6..26e20b96fd96 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2532,12 +2532,12 @@ static u32 dpaa_run_xdp(struct dpaa_priv *priv, struct qm_fd *fd, void *vaddr, return XDP_PASS; } + xdp_init_buff(&xdp, DPAA_BP_RAW_SIZE - DPAA_TX_PRIV_DATA_SIZE, + &dpaa_fq->xdp_rxq); xdp.data = vaddr + fd_off; xdp.data_meta = xdp.data; xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM; xdp.data_end = xdp.data + qm_fd_get_length(fd); - xdp.frame_sz = DPAA_BP_RAW_SIZE - DPAA_TX_PRIV_DATA_SIZE; - xdp.rxq = &dpaa_fq->xdp_rxq; /* We reserve a fixed headroom of 256 bytes under the erratum and we * offer it all to XDP programs to use. If no room is left for the diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index fb0bcd18ec0c..3a7892a66fe5 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -358,14 +358,14 @@ static u32 dpaa2_eth_run_xdp(struct dpaa2_eth_priv *priv, if (!xdp_prog) goto out; + xdp_init_buff(&xdp, + DPAA2_ETH_RX_BUF_RAW_SIZE - + (dpaa2_fd_get_offset(fd) - XDP_PACKET_HEADROOM), + &ch->xdp_rxq); xdp.data = vaddr + dpaa2_fd_get_offset(fd); xdp.data_end = xdp.data + dpaa2_fd_get_len(fd); xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM; xdp_set_data_meta_invalid(&xdp); - xdp.rxq = &ch->xdp_rxq; - - xdp.frame_sz = DPAA2_ETH_RX_BUF_RAW_SIZE - - (dpaa2_fd_get_offset(fd) - XDP_PACKET_HEADROOM); xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp); diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 4aca637d4a23..a87fb8264d0c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2344,7 +2344,7 @@ static void i40e_inc_ntc(struct i40e_ring *rx_ring) **/ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) { - unsigned int total_rx_bytes = 0, total_rx_packets = 0; + unsigned int total_rx_bytes = 0, total_rx_packets = 0, frame_sz = 0; struct sk_buff *skb = rx_ring->skb; u16 cleaned_count = I40E_DESC_UNUSED(rx_ring); unsigned int xdp_xmit = 0; @@ -2352,9 +2352,9 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) struct xdp_buff xdp; #if (PAGE_SIZE < 8192) - xdp.frame_sz = i40e_rx_frame_truesize(rx_ring, 0); + frame_sz = i40e_rx_frame_truesize(rx_ring, 0); #endif - xdp.rxq = &rx_ring->xdp_rxq; + xdp_init_buff(&xdp, frame_sz, &rx_ring->xdp_rxq); while (likely(total_rx_packets < (unsigned int)budget)) { struct i40e_rx_buffer *rx_buffer; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index a2d0aad8cfdd..500e93bf6238 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1089,18 +1089,18 @@ ice_is_non_eop(struct ice_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc, */ int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) { - unsigned int total_rx_bytes = 0, total_rx_pkts = 0; + unsigned int total_rx_bytes = 0, total_rx_pkts = 0, frame_sz = 0; u16 cleaned_count = ICE_DESC_UNUSED(rx_ring); unsigned int xdp_res, xdp_xmit = 0; struct bpf_prog *xdp_prog = NULL; struct xdp_buff xdp; bool failure; - xdp.rxq = &rx_ring->xdp_rxq; /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ #if (PAGE_SIZE < 8192) - xdp.frame_sz = ice_rx_frame_truesize(rx_ring, 0); + frame_sz = ice_rx_frame_truesize(rx_ring, 0); #endif + xdp_init_buff(&xdp, frame_sz, &rx_ring->xdp_rxq); /* start the loop to process Rx packets bounded by 'budget' */ while (likely(total_rx_pkts < (unsigned int)budget)) { diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 03f78fdb0dcd..cf9e8b7d2c70 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -8681,13 +8681,13 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) u16 cleaned_count = igb_desc_unused(rx_ring); unsigned int xdp_xmit = 0; struct xdp_buff xdp; - - xdp.rxq = &rx_ring->xdp_rxq; + u32 frame_sz = 0; /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ #if (PAGE_SIZE < 8192) - xdp.frame_sz = igb_rx_frame_truesize(rx_ring, 0); + frame_sz = igb_rx_frame_truesize(rx_ring, 0); #endif + xdp_init_buff(&xdp, frame_sz, &rx_ring->xdp_rxq); while (likely(total_packets < budget)) { union e1000_adv_rx_desc *rx_desc; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 6cbbe09ce8a0..7a7c86835a87 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2291,7 +2291,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, struct ixgbe_ring *rx_ring, const int budget) { - unsigned int total_rx_bytes = 0, total_rx_packets = 0; + unsigned int total_rx_bytes = 0, total_rx_packets = 0, frame_sz = 0; struct ixgbe_adapter *adapter = q_vector->adapter; #ifdef IXGBE_FCOE int ddp_bytes; @@ -2301,12 +2301,11 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, unsigned int xdp_xmit = 0; struct xdp_buff xdp; - xdp.rxq = &rx_ring->xdp_rxq; - /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ #if (PAGE_SIZE < 8192) - xdp.frame_sz = ixgbe_rx_frame_truesize(rx_ring, 0); + frame_sz = ixgbe_rx_frame_truesize(rx_ring, 0); #endif + xdp_init_buff(&xdp, frame_sz, &rx_ring->xdp_rxq); while (likely(total_rx_packets < budget)) { union ixgbe_adv_rx_desc *rx_desc; diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 4061cd7db5dd..624efcd71569 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -1121,19 +1121,18 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, struct ixgbevf_ring *rx_ring, int budget) { - unsigned int total_rx_bytes = 0, total_rx_packets = 0; + unsigned int total_rx_bytes = 0, total_rx_packets = 0, frame_sz = 0; struct ixgbevf_adapter *adapter = q_vector->adapter; u16 cleaned_count = ixgbevf_desc_unused(rx_ring); struct sk_buff *skb = rx_ring->skb; bool xdp_xmit = false; struct xdp_buff xdp; - xdp.rxq = &rx_ring->xdp_rxq; - /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ #if (PAGE_SIZE < 8192) - xdp.frame_sz = ixgbevf_rx_frame_truesize(rx_ring, 0); + frame_sz = ixgbevf_rx_frame_truesize(rx_ring, 0); #endif + xdp_init_buff(&xdp, frame_sz, &rx_ring->xdp_rxq); while (likely(total_rx_packets < budget)) { struct ixgbevf_rx_buffer *rx_buffer; diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index bc4d8d144401..038c6b436cba 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2363,9 +2363,8 @@ static int mvneta_rx_swbm(struct napi_struct *napi, u32 desc_status, frame_sz; struct xdp_buff xdp_buf; + xdp_init_buff(&xdp_buf, PAGE_SIZE, &rxq->xdp_rxq); xdp_buf.data_hard_start = NULL; - xdp_buf.frame_sz = PAGE_SIZE; - xdp_buf.rxq = &rxq->xdp_rxq; sinfo.nr_frags = 0; diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 4b1808acef58..5872cb011ae1 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -3563,16 +3563,18 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi, frag_size = bm_pool->frag_size; if (xdp_prog) { + struct xdp_rxq_info *xdp_rxq; + xdp.data_hard_start = data; xdp.data = data + MVPP2_MH_SIZE + MVPP2_SKB_HEADROOM; xdp.data_end = xdp.data + rx_bytes; - xdp.frame_sz = PAGE_SIZE; if (bm_pool->pkt_size == MVPP2_BM_SHORT_PKT_SIZE) - xdp.rxq = &rxq->xdp_rxq_short; + xdp_rxq = &rxq->xdp_rxq_short; else - xdp.rxq = &rxq->xdp_rxq_long; + xdp_rxq = &rxq->xdp_rxq_long; + xdp_init_buff(&xdp, PAGE_SIZE, xdp_rxq); xdp_set_data_meta_invalid(&xdp); ret = mvpp2_run_xdp(port, rxq, xdp_prog, &xdp, pp, &ps); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index c1c9118a66c9..93da9c2d50c0 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -682,8 +682,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud /* Protect accesses to: ring->xdp_prog, priv->mac_hash list */ rcu_read_lock(); xdp_prog = rcu_dereference(ring->xdp_prog); - xdp.rxq = &ring->xdp_rxq; - xdp.frame_sz = priv->frag_info[0].frag_stride; + xdp_init_buff(&xdp, priv->frag_info[0].frag_stride, &ring->xdp_rxq); doorbell_pending = false; /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 7f5851c61218..bc7c81f2b036 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1126,12 +1126,11 @@ struct sk_buff *mlx5e_build_linear_skb(struct mlx5e_rq *rq, void *va, static void mlx5e_fill_xdp_buff(struct mlx5e_rq *rq, void *va, u16 headroom, u32 len, struct xdp_buff *xdp) { + xdp_init_buff(xdp, rq->buff.frame0_sz, &rq->xdp_rxq); xdp->data_hard_start = va; xdp->data = va + headroom; xdp_set_data_meta_invalid(xdp); xdp->data_end = xdp->data + len; - xdp->rxq = &rq->xdp_rxq; - xdp->frame_sz = rq->buff.frame0_sz; } static struct sk_buff * diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 7ba8f4c7f26d..513bc60bcd09 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1822,8 +1822,8 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) rcu_read_lock(); xdp_prog = READ_ONCE(dp->xdp_prog); true_bufsz = xdp_prog ? PAGE_SIZE : dp->fl_bufsz; - xdp.frame_sz = PAGE_SIZE - NFP_NET_RX_BUF_HEADROOM; - xdp.rxq = &rx_ring->xdp_rxq; + xdp_init_buff(&xdp, PAGE_SIZE - NFP_NET_RX_BUF_HEADROOM, + &rx_ring->xdp_rxq); tx_ring = r_vec->xdp_ring; while (pkts_polled < budget) { diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index ca0ee29a57b5..8d0c6d62022a 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -1090,12 +1090,11 @@ static bool qede_rx_xdp(struct qede_dev *edev, struct xdp_buff xdp; enum xdp_action act; + xdp_init_buff(&xdp, rxq->rx_buf_seg_size, &rxq->xdp_rxq); xdp.data_hard_start = page_address(bd->data); xdp.data = xdp.data_hard_start + *data_offset; xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + *len; - xdp.rxq = &rxq->xdp_rxq; - xdp.frame_sz = rxq->rx_buf_seg_size; /* PAGE_SIZE when XDP enabled */ /* Queues always have a full reset currently, so for the time * being until there's atomic program replace just mark read diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index aaa112877561..eaa6650955d1 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -293,14 +293,13 @@ static bool efx_do_xdp(struct efx_nic *efx, struct efx_channel *channel, memcpy(rx_prefix, *ehp - efx->rx_prefix_size, efx->rx_prefix_size); + xdp_init_buff(&xdp, efx->rx_page_buf_step, &rx_queue->xdp_rxq_info); xdp.data = *ehp; xdp.data_hard_start = xdp.data - EFX_XDP_HEADROOM; /* No support yet for XDP metadata */ xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + rx_buf->len; - xdp.rxq = &rx_queue->xdp_rxq_info; - xdp.frame_sz = efx->rx_page_buf_step; xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp); rcu_read_unlock(); diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index 19d20a6d0d44..945ca9517bf9 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -956,8 +956,7 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) u32 xdp_act = 0; int done = 0; - xdp.rxq = &dring->xdp_rxq; - xdp.frame_sz = PAGE_SIZE; + xdp_init_buff(&xdp, PAGE_SIZE, &dring->xdp_rxq); rcu_read_lock(); xdp_prog = READ_ONCE(priv->xdp_prog); diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index b0f00b4edd94..78a923391828 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -392,6 +392,8 @@ static void cpsw_rx_handler(void *token, int len, int status) } if (priv->xdp_prog) { + xdp_init_buff(&xdp, PAGE_SIZE, &priv->xdp_rxq[ch]); + if (status & CPDMA_RX_VLAN_ENCAP) { xdp.data = pa + CPSW_HEADROOM + CPSW_RX_VLAN_ENCAP_HDR_SIZE; @@ -405,8 +407,6 @@ static void cpsw_rx_handler(void *token, int len, int status) xdp_set_data_meta_invalid(&xdp); xdp.data_hard_start = pa; - xdp.rxq = &priv->xdp_rxq[ch]; - xdp.frame_sz = PAGE_SIZE; port = priv->emac_port + cpsw->data.dual_emac; ret = cpsw_run_xdp(priv, ch, &xdp, page, port); diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index 2f5e0ad23ad7..1b3385ec9645 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -335,6 +335,8 @@ static void cpsw_rx_handler(void *token, int len, int status) } if (priv->xdp_prog) { + xdp_init_buff(&xdp, PAGE_SIZE, &priv->xdp_rxq[ch]); + if (status & CPDMA_RX_VLAN_ENCAP) { xdp.data = pa + CPSW_HEADROOM + CPSW_RX_VLAN_ENCAP_HDR_SIZE; @@ -348,8 +350,6 @@ static void cpsw_rx_handler(void *token, int len, int status) xdp_set_data_meta_invalid(&xdp); xdp.data_hard_start = pa; - xdp.rxq = &priv->xdp_rxq[ch]; - xdp.frame_sz = PAGE_SIZE; ret = cpsw_run_xdp(priv, ch, &xdp, page, priv->emac_port); if (ret != CPSW_XDP_PASS) diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c index 440486d9c999..14a7ee4c6899 100644 --- a/drivers/net/hyperv/netvsc_bpf.c +++ b/drivers/net/hyperv/netvsc_bpf.c @@ -44,12 +44,11 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan, goto out; } + xdp_init_buff(xdp, PAGE_SIZE, &nvchan->xdp_rxq); xdp->data_hard_start = page_address(page); xdp->data = xdp->data_hard_start + NETVSC_XDP_HDRM; xdp_set_data_meta_invalid(xdp); xdp->data_end = xdp->data + len; - xdp->rxq = &nvchan->xdp_rxq; - xdp->frame_sz = PAGE_SIZE; memcpy(xdp->data, data, len); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 02a93cfdb6b1..3b7728433a86 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1599,12 +1599,11 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, struct xdp_buff xdp; u32 act; + xdp_init_buff(&xdp, buflen, &tfile->xdp_rxq); xdp.data_hard_start = buf; xdp.data = buf + pad; xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; - xdp.rxq = &tfile->xdp_rxq; - xdp.frame_sz = buflen; act = bpf_prog_run_xdp(xdp_prog, &xdp); if (act == XDP_REDIRECT || act == XDP_TX) { @@ -2342,9 +2341,9 @@ static int tun_xdp_one(struct tun_struct *tun, skb_xdp = true; goto build; } + + xdp_init_buff(xdp, buflen, &tfile->xdp_rxq); xdp_set_data_meta_invalid(xdp); - xdp->rxq = &tfile->xdp_rxq; - xdp->frame_sz = buflen; act = bpf_prog_run_xdp(xdp_prog, xdp); err = tun_xdp_act(tun, xdp_prog, xdp, act); diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 02bfcdf50a7a..25f3601fb6dd 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -654,7 +654,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct veth_xdp_tx_bq *bq, struct veth_stats *stats) { - u32 pktlen, headroom, act, metalen; + u32 pktlen, headroom, act, metalen, frame_sz; void *orig_data, *orig_data_end; struct bpf_prog *xdp_prog; int mac_len, delta, off; @@ -714,11 +714,11 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, xdp.data = skb_mac_header(skb); xdp.data_end = xdp.data + pktlen; xdp.data_meta = xdp.data; - xdp.rxq = &rq->xdp_rxq; /* SKB "head" area always have tailroom for skb_shared_info */ - xdp.frame_sz = (void *)skb_end_pointer(skb) - xdp.data_hard_start; - xdp.frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + frame_sz = (void *)skb_end_pointer(skb) - xdp.data_hard_start; + frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + xdp_init_buff(&xdp, frame_sz, &rq->xdp_rxq); orig_data = xdp.data; orig_data_end = xdp.data_end; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 508408fbe78f..36b0f81bcd7a 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -689,12 +689,11 @@ static struct sk_buff *receive_small(struct net_device *dev, page = xdp_page; } + xdp_init_buff(&xdp, buflen, &rq->xdp_rxq); xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len; xdp.data = xdp.data_hard_start + xdp_headroom; xdp.data_end = xdp.data + len; xdp.data_meta = xdp.data; - xdp.rxq = &rq->xdp_rxq; - xdp.frame_sz = buflen; orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); stats->xdp_packets++; @@ -859,12 +858,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, * the descriptor on if we get an XDP_TX return code. */ data = page_address(xdp_page) + offset; + xdp_init_buff(&xdp, frame_sz - vi->hdr_len, &rq->xdp_rxq); xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len; xdp.data = data + vi->hdr_len; xdp.data_end = xdp.data + (len - vi->hdr_len); xdp.data_meta = xdp.data; - xdp.rxq = &rq->xdp_rxq; - xdp.frame_sz = frame_sz - vi->hdr_len; act = bpf_prog_run_xdp(xdp_prog, &xdp); stats->xdp_packets++; diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index b01848ef4649..329397c60d84 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -864,12 +864,12 @@ static u32 xennet_run_xdp(struct netfront_queue *queue, struct page *pdata, u32 act; int err; + xdp_init_buff(xdp, XEN_PAGE_SIZE - XDP_PACKET_HEADROOM, + &queue->xdp_rxq); xdp->data_hard_start = page_address(pdata); xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; xdp_set_data_meta_invalid(xdp); xdp->data_end = xdp->data + len; - xdp->rxq = &queue->xdp_rxq; - xdp->frame_sz = XEN_PAGE_SIZE - XDP_PACKET_HEADROOM; act = bpf_prog_run_xdp(prog, xdp); switch (act) { diff --git a/include/net/xdp.h b/include/net/xdp.h index 600acb307db6..8a589f7e08e0 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -76,6 +76,13 @@ struct xdp_buff { u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ }; +static __always_inline void +xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) +{ + xdp->frame_sz = frame_sz; + xdp->rxq = rxq; +} + /* Reserve memory area at end-of data area. * * This macro reserves tailroom in the XDP buffer by limiting the diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index c1c30a9f76f3..a8fa5a9e4137 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -640,10 +640,10 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, xdp.data = data + headroom; xdp.data_meta = xdp.data; xdp.data_end = xdp.data + size; - xdp.frame_sz = headroom + max_data_sz + tailroom; rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); - xdp.rxq = &rxqueue->xdp_rxq; + xdp_init_buff(&xdp, headroom + max_data_sz + tailroom, + &rxqueue->xdp_rxq); bpf_prog_change_xdp(NULL, prog); ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); if (ret) diff --git a/net/core/dev.c b/net/core/dev.c index 7afbb642e203..e6d758a3c2a9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4606,11 +4606,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct netdev_rx_queue *rxqueue; void *orig_data, *orig_data_end; u32 metalen, act = XDP_DROP; + u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; bool orig_bcast; int hlen, off; - u32 mac_len; /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. @@ -4649,8 +4649,8 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, xdp->data_hard_start = skb->data - skb_headroom(skb); /* SKB "head" area always have tailroom for skb_shared_info */ - xdp->frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start; - xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start; + frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); orig_data_end = xdp->data_end; orig_data = xdp->data; @@ -4659,7 +4659,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, orig_eth_type = eth->h_proto; rxqueue = netif_get_rxqueue(skb); - xdp->rxq = &rxqueue->xdp_rxq; + xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); act = bpf_prog_run_xdp(xdp_prog, xdp); -- cgit v1.2.3 From be9df4aff65f18caa79b35f88f42c3d5a43af14f Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 22 Dec 2020 22:09:29 +0100 Subject: net, xdp: Introduce xdp_prepare_buff utility routine Introduce xdp_prepare_buff utility routine to initialize per-descriptor xdp_buff fields (e.g. xdp_buff pointers). Rely on xdp_prepare_buff() in all XDP capable drivers. Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Reviewed-by: Alexander Duyck Acked-by: Jesper Dangaard Brouer Acked-by: John Fastabend Acked-by: Shay Agroskin Acked-by: Martin Habets Acked-by: Camelia Groza Acked-by: Marcin Wojtas Link: https://lore.kernel.org/bpf/45f46f12295972a97da8ca01990b3e71501e9d89.1608670965.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 7 +++---- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 5 +---- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 8 ++++---- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 6 ++---- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 14 +++++--------- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 12 ++++++------ drivers/net/ethernet/intel/ice/ice_txrx.c | 9 +++++---- drivers/net/ethernet/intel/igb/igb_main.c | 12 ++++++------ drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 12 ++++++------ drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 12 ++++++------ drivers/net/ethernet/marvell/mvneta.c | 7 ++----- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 8 +++----- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 6 ++---- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 5 +---- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 8 ++++---- drivers/net/ethernet/qlogic/qede/qede_fp.c | 6 ++---- drivers/net/ethernet/sfc/rx.c | 7 ++----- drivers/net/ethernet/socionext/netsec.c | 6 ++---- drivers/net/ethernet/ti/cpsw.c | 16 +++++----------- drivers/net/ethernet/ti/cpsw_new.c | 16 +++++----------- drivers/net/hyperv/netvsc_bpf.c | 5 +---- drivers/net/tun.c | 5 +---- drivers/net/veth.c | 8 ++------ drivers/net/virtio_net.c | 12 ++++-------- drivers/net/xen-netfront.c | 6 ++---- include/net/xdp.h | 12 ++++++++++++ net/bpf/test_run.c | 7 ++----- net/core/dev.c | 20 +++++++++----------- 28 files changed, 105 insertions(+), 152 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 43331f6967c9..1db6cfd2b55c 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -1585,10 +1585,9 @@ static int ena_xdp_handle_buff(struct ena_ring *rx_ring, struct xdp_buff *xdp) int ret; rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id]; - xdp->data = page_address(rx_info->page) + rx_info->page_offset; - xdp_set_data_meta_invalid(xdp); - xdp->data_hard_start = page_address(rx_info->page); - xdp->data_end = xdp->data + rx_ring->ena_bufs[0].len; + xdp_prepare_buff(xdp, page_address(rx_info->page), + rx_info->page_offset, + rx_ring->ena_bufs[0].len, false); /* If for some reason we received a bigger packet than * we expect, then we simply drop it */ diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index ab805d6750e5..641303894341 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -135,10 +135,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, txr = rxr->bnapi->tx_ring; /* BNXT_RX_PAGE_MODE(bp) when XDP enabled */ xdp_init_buff(&xdp, PAGE_SIZE, &rxr->xdp_rxq); - xdp.data_hard_start = *data_ptr - offset; - xdp.data = *data_ptr; - xdp_set_data_meta_invalid(&xdp); - xdp.data_end = *data_ptr + *len; + xdp_prepare_buff(&xdp, *data_ptr - offset, offset, *len, false); orig_data = xdp.data; rcu_read_lock(); diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 9fc672f075f2..c33b4e837515 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -530,6 +530,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog, struct cqe_rx_t *cqe_rx, struct snd_queue *sq, struct rcv_queue *rq, struct sk_buff **skb) { + unsigned char *hard_start, *data; struct xdp_buff xdp; struct page *page; u32 action; @@ -549,10 +550,9 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog, xdp_init_buff(&xdp, RCV_FRAG_LEN + XDP_PACKET_HEADROOM, &rq->xdp_rxq); - xdp.data_hard_start = page_address(page); - xdp.data = (void *)cpu_addr; - xdp_set_data_meta_invalid(&xdp); - xdp.data_end = xdp.data + len; + hard_start = page_address(page); + data = (unsigned char *)cpu_addr; + xdp_prepare_buff(&xdp, hard_start, data - hard_start, len, false); orig_data = xdp.data; rcu_read_lock(); diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 26e20b96fd96..d8e568f6caf3 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2534,10 +2534,8 @@ static u32 dpaa_run_xdp(struct dpaa_priv *priv, struct qm_fd *fd, void *vaddr, xdp_init_buff(&xdp, DPAA_BP_RAW_SIZE - DPAA_TX_PRIV_DATA_SIZE, &dpaa_fq->xdp_rxq); - xdp.data = vaddr + fd_off; - xdp.data_meta = xdp.data; - xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM; - xdp.data_end = xdp.data + qm_fd_get_length(fd); + xdp_prepare_buff(&xdp, vaddr + fd_off - XDP_PACKET_HEADROOM, + XDP_PACKET_HEADROOM, qm_fd_get_length(fd), true); /* We reserve a fixed headroom of 256 bytes under the erratum and we * offer it all to XDP programs to use. If no room is left for the diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 3a7892a66fe5..55b73c1136ca 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -350,7 +350,7 @@ static u32 dpaa2_eth_run_xdp(struct dpaa2_eth_priv *priv, struct bpf_prog *xdp_prog; struct xdp_buff xdp; u32 xdp_act = XDP_PASS; - int err; + int err, offset; rcu_read_lock(); @@ -358,14 +358,10 @@ static u32 dpaa2_eth_run_xdp(struct dpaa2_eth_priv *priv, if (!xdp_prog) goto out; - xdp_init_buff(&xdp, - DPAA2_ETH_RX_BUF_RAW_SIZE - - (dpaa2_fd_get_offset(fd) - XDP_PACKET_HEADROOM), - &ch->xdp_rxq); - xdp.data = vaddr + dpaa2_fd_get_offset(fd); - xdp.data_end = xdp.data + dpaa2_fd_get_len(fd); - xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM; - xdp_set_data_meta_invalid(&xdp); + offset = dpaa2_fd_get_offset(fd) - XDP_PACKET_HEADROOM; + xdp_init_buff(&xdp, DPAA2_ETH_RX_BUF_RAW_SIZE - offset, &ch->xdp_rxq); + xdp_prepare_buff(&xdp, vaddr + offset, XDP_PACKET_HEADROOM, + dpaa2_fd_get_len(fd), false); xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp); diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index a87fb8264d0c..2574e78f7597 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2406,12 +2406,12 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) /* retrieve a buffer from the ring */ if (!skb) { - xdp.data = page_address(rx_buffer->page) + - rx_buffer->page_offset; - xdp.data_meta = xdp.data; - xdp.data_hard_start = xdp.data - - i40e_rx_offset(rx_ring); - xdp.data_end = xdp.data + size; + unsigned int offset = i40e_rx_offset(rx_ring); + unsigned char *hard_start; + + hard_start = page_address(rx_buffer->page) + + rx_buffer->page_offset - offset; + xdp_prepare_buff(&xdp, hard_start, offset, size, true); #if (PAGE_SIZE > 4096) /* At larger PAGE_SIZE, frame_sz depend on len size */ xdp.frame_sz = i40e_rx_frame_truesize(rx_ring, size); diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 500e93bf6238..422f53997c02 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1104,8 +1104,10 @@ int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) /* start the loop to process Rx packets bounded by 'budget' */ while (likely(total_rx_pkts < (unsigned int)budget)) { + unsigned int offset = ice_rx_offset(rx_ring); union ice_32b_rx_flex_desc *rx_desc; struct ice_rx_buf *rx_buf; + unsigned char *hard_start; struct sk_buff *skb; unsigned int size; u16 stat_err_bits; @@ -1151,10 +1153,9 @@ int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) goto construct_skb; } - xdp.data = page_address(rx_buf->page) + rx_buf->page_offset; - xdp.data_hard_start = xdp.data - ice_rx_offset(rx_ring); - xdp.data_meta = xdp.data; - xdp.data_end = xdp.data + size; + hard_start = page_address(rx_buf->page) + rx_buf->page_offset - + offset; + xdp_prepare_buff(&xdp, hard_start, offset, size, true); #if (PAGE_SIZE > 4096) /* At larger PAGE_SIZE, frame_sz depend on len size */ xdp.frame_sz = ice_rx_frame_truesize(rx_ring, size); diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index cf9e8b7d2c70..6b5adbd9660b 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -8715,12 +8715,12 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) /* retrieve a buffer from the ring */ if (!skb) { - xdp.data = page_address(rx_buffer->page) + - rx_buffer->page_offset; - xdp.data_meta = xdp.data; - xdp.data_hard_start = xdp.data - - igb_rx_offset(rx_ring); - xdp.data_end = xdp.data + size; + unsigned int offset = igb_rx_offset(rx_ring); + unsigned char *hard_start; + + hard_start = page_address(rx_buffer->page) + + rx_buffer->page_offset - offset; + xdp_prepare_buff(&xdp, hard_start, offset, size, true); #if (PAGE_SIZE > 4096) /* At larger PAGE_SIZE, frame_sz depend on len size */ xdp.frame_sz = igb_rx_frame_truesize(rx_ring, size); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 7a7c86835a87..56dca73d158e 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2335,12 +2335,12 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, /* retrieve a buffer from the ring */ if (!skb) { - xdp.data = page_address(rx_buffer->page) + - rx_buffer->page_offset; - xdp.data_meta = xdp.data; - xdp.data_hard_start = xdp.data - - ixgbe_rx_offset(rx_ring); - xdp.data_end = xdp.data + size; + unsigned int offset = ixgbe_rx_offset(rx_ring); + unsigned char *hard_start; + + hard_start = page_address(rx_buffer->page) + + rx_buffer->page_offset - offset; + xdp_prepare_buff(&xdp, hard_start, offset, size, true); #if (PAGE_SIZE > 4096) /* At larger PAGE_SIZE, frame_sz depend on len size */ xdp.frame_sz = ixgbe_rx_frame_truesize(rx_ring, size); diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 624efcd71569..a534a3fb392e 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -1160,12 +1160,12 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, /* retrieve a buffer from the ring */ if (!skb) { - xdp.data = page_address(rx_buffer->page) + - rx_buffer->page_offset; - xdp.data_meta = xdp.data; - xdp.data_hard_start = xdp.data - - ixgbevf_rx_offset(rx_ring); - xdp.data_end = xdp.data + size; + unsigned int offset = ixgbevf_rx_offset(rx_ring); + unsigned char *hard_start; + + hard_start = page_address(rx_buffer->page) + + rx_buffer->page_offset - offset; + xdp_prepare_buff(&xdp, hard_start, offset, size, true); #if (PAGE_SIZE > 4096) /* At larger PAGE_SIZE, frame_sz depend on len size */ xdp.frame_sz = ixgbevf_rx_frame_truesize(rx_ring, size); diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 038c6b436cba..6290bfb6494e 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2263,11 +2263,8 @@ mvneta_swbm_rx_frame(struct mvneta_port *pp, /* Prefetch header */ prefetch(data); - - xdp->data_hard_start = data; - xdp->data = data + pp->rx_offset_correction + MVNETA_MH_SIZE; - xdp->data_end = xdp->data + data_len; - xdp_set_data_meta_invalid(xdp); + xdp_prepare_buff(xdp, data, pp->rx_offset_correction + MVNETA_MH_SIZE, + data_len, false); sinfo = xdp_get_shared_info_from_buff(xdp); sinfo->nr_frags = 0; diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 5872cb011ae1..5272f26a3e3e 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -3565,17 +3565,15 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi, if (xdp_prog) { struct xdp_rxq_info *xdp_rxq; - xdp.data_hard_start = data; - xdp.data = data + MVPP2_MH_SIZE + MVPP2_SKB_HEADROOM; - xdp.data_end = xdp.data + rx_bytes; - if (bm_pool->pkt_size == MVPP2_BM_SHORT_PKT_SIZE) xdp_rxq = &rxq->xdp_rxq_short; else xdp_rxq = &rxq->xdp_rxq_long; xdp_init_buff(&xdp, PAGE_SIZE, xdp_rxq); - xdp_set_data_meta_invalid(&xdp); + xdp_prepare_buff(&xdp, data, + MVPP2_MH_SIZE + MVPP2_SKB_HEADROOM, + rx_bytes, false); ret = mvpp2_run_xdp(port, rxq, xdp_prog, &xdp, pp, &ps); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 93da9c2d50c0..e35e4d7ef4d1 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -776,10 +776,8 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud priv->frag_info[0].frag_size, DMA_FROM_DEVICE); - xdp.data_hard_start = va - frags[0].page_offset; - xdp.data = va; - xdp_set_data_meta_invalid(&xdp); - xdp.data_end = xdp.data + length; + xdp_prepare_buff(&xdp, va - frags[0].page_offset, + frags[0].page_offset, length, false); orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index bc7c81f2b036..a63ce7c8b98f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1127,10 +1127,7 @@ static void mlx5e_fill_xdp_buff(struct mlx5e_rq *rq, void *va, u16 headroom, u32 len, struct xdp_buff *xdp) { xdp_init_buff(xdp, rq->buff.frame0_sz, &rq->xdp_rxq); - xdp->data_hard_start = va; - xdp->data = va + headroom; - xdp_set_data_meta_invalid(xdp); - xdp->data_end = xdp->data + len; + xdp_prepare_buff(xdp, va, headroom, len, false); } static struct sk_buff * diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 513bc60bcd09..eeb30680b4dc 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1914,10 +1914,10 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) unsigned int dma_off; int act; - xdp.data_hard_start = rxbuf->frag + NFP_NET_RX_BUF_HEADROOM; - xdp.data = orig_data; - xdp.data_meta = orig_data; - xdp.data_end = orig_data + pkt_len; + xdp_prepare_buff(&xdp, + rxbuf->frag + NFP_NET_RX_BUF_HEADROOM, + pkt_off - NFP_NET_RX_BUF_HEADROOM, + pkt_len, true); act = bpf_prog_run_xdp(xdp_prog, &xdp); diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index 8d0c6d62022a..70c8d3cd85c0 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -1091,10 +1091,8 @@ static bool qede_rx_xdp(struct qede_dev *edev, enum xdp_action act; xdp_init_buff(&xdp, rxq->rx_buf_seg_size, &rxq->xdp_rxq); - xdp.data_hard_start = page_address(bd->data); - xdp.data = xdp.data_hard_start + *data_offset; - xdp_set_data_meta_invalid(&xdp); - xdp.data_end = xdp.data + *len; + xdp_prepare_buff(&xdp, page_address(bd->data), *data_offset, + *len, false); /* Queues always have a full reset currently, so for the time * being until there's atomic program replace just mark read diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index eaa6650955d1..89c5c75f479f 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -294,12 +294,9 @@ static bool efx_do_xdp(struct efx_nic *efx, struct efx_channel *channel, efx->rx_prefix_size); xdp_init_buff(&xdp, efx->rx_page_buf_step, &rx_queue->xdp_rxq_info); - xdp.data = *ehp; - xdp.data_hard_start = xdp.data - EFX_XDP_HEADROOM; - /* No support yet for XDP metadata */ - xdp_set_data_meta_invalid(&xdp); - xdp.data_end = xdp.data + rx_buf->len; + xdp_prepare_buff(&xdp, *ehp - EFX_XDP_HEADROOM, EFX_XDP_HEADROOM, + rx_buf->len, false); xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp); rcu_read_unlock(); diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index 945ca9517bf9..3c53051bdacf 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -1015,10 +1015,8 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) dma_dir); prefetch(desc->addr); - xdp.data_hard_start = desc->addr; - xdp.data = desc->addr + NETSEC_RXBUF_HEADROOM; - xdp_set_data_meta_invalid(&xdp); - xdp.data_end = xdp.data + pkt_len; + xdp_prepare_buff(&xdp, desc->addr, NETSEC_RXBUF_HEADROOM, + pkt_len, false); if (xdp_prog) { xdp_result = netsec_run_xdp(priv, xdp_prog, &xdp); diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 78a923391828..5239318e9686 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -392,21 +392,15 @@ static void cpsw_rx_handler(void *token, int len, int status) } if (priv->xdp_prog) { - xdp_init_buff(&xdp, PAGE_SIZE, &priv->xdp_rxq[ch]); + int headroom = CPSW_HEADROOM, size = len; + xdp_init_buff(&xdp, PAGE_SIZE, &priv->xdp_rxq[ch]); if (status & CPDMA_RX_VLAN_ENCAP) { - xdp.data = pa + CPSW_HEADROOM + - CPSW_RX_VLAN_ENCAP_HDR_SIZE; - xdp.data_end = xdp.data + len - - CPSW_RX_VLAN_ENCAP_HDR_SIZE; - } else { - xdp.data = pa + CPSW_HEADROOM; - xdp.data_end = xdp.data + len; + headroom += CPSW_RX_VLAN_ENCAP_HDR_SIZE; + size -= CPSW_RX_VLAN_ENCAP_HDR_SIZE; } - xdp_set_data_meta_invalid(&xdp); - - xdp.data_hard_start = pa; + xdp_prepare_buff(&xdp, pa, headroom, size, false); port = priv->emac_port + cpsw->data.dual_emac; ret = cpsw_run_xdp(priv, ch, &xdp, page, port); diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index 1b3385ec9645..94747f82c60b 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -335,21 +335,15 @@ static void cpsw_rx_handler(void *token, int len, int status) } if (priv->xdp_prog) { - xdp_init_buff(&xdp, PAGE_SIZE, &priv->xdp_rxq[ch]); + int headroom = CPSW_HEADROOM, size = len; + xdp_init_buff(&xdp, PAGE_SIZE, &priv->xdp_rxq[ch]); if (status & CPDMA_RX_VLAN_ENCAP) { - xdp.data = pa + CPSW_HEADROOM + - CPSW_RX_VLAN_ENCAP_HDR_SIZE; - xdp.data_end = xdp.data + len - - CPSW_RX_VLAN_ENCAP_HDR_SIZE; - } else { - xdp.data = pa + CPSW_HEADROOM; - xdp.data_end = xdp.data + len; + headroom += CPSW_RX_VLAN_ENCAP_HDR_SIZE; + size -= CPSW_RX_VLAN_ENCAP_HDR_SIZE; } - xdp_set_data_meta_invalid(&xdp); - - xdp.data_hard_start = pa; + xdp_prepare_buff(&xdp, pa, headroom, size, false); ret = cpsw_run_xdp(priv, ch, &xdp, page, priv->emac_port); if (ret != CPSW_XDP_PASS) diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c index 14a7ee4c6899..d60dcf6c9829 100644 --- a/drivers/net/hyperv/netvsc_bpf.c +++ b/drivers/net/hyperv/netvsc_bpf.c @@ -45,10 +45,7 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan, } xdp_init_buff(xdp, PAGE_SIZE, &nvchan->xdp_rxq); - xdp->data_hard_start = page_address(page); - xdp->data = xdp->data_hard_start + NETVSC_XDP_HDRM; - xdp_set_data_meta_invalid(xdp); - xdp->data_end = xdp->data + len; + xdp_prepare_buff(xdp, page_address(page), NETVSC_XDP_HDRM, len, false); memcpy(xdp->data, data, len); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 3b7728433a86..702215596889 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1600,10 +1600,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, u32 act; xdp_init_buff(&xdp, buflen, &tfile->xdp_rxq); - xdp.data_hard_start = buf; - xdp.data = buf + pad; - xdp_set_data_meta_invalid(&xdp); - xdp.data_end = xdp.data + len; + xdp_prepare_buff(&xdp, buf, pad, len, false); act = bpf_prog_run_xdp(xdp_prog, &xdp); if (act == XDP_REDIRECT || act == XDP_TX) { diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 25f3601fb6dd..99caae7d1641 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -710,15 +710,11 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, skb = nskb; } - xdp.data_hard_start = skb->head; - xdp.data = skb_mac_header(skb); - xdp.data_end = xdp.data + pktlen; - xdp.data_meta = xdp.data; - /* SKB "head" area always have tailroom for skb_shared_info */ - frame_sz = (void *)skb_end_pointer(skb) - xdp.data_hard_start; + frame_sz = skb_end_pointer(skb) - skb->head; frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); xdp_init_buff(&xdp, frame_sz, &rq->xdp_rxq); + xdp_prepare_buff(&xdp, skb->head, skb->mac_header, pktlen, true); orig_data = xdp.data; orig_data_end = xdp.data_end; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 36b0f81bcd7a..ba8e63792549 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -690,10 +690,8 @@ static struct sk_buff *receive_small(struct net_device *dev, } xdp_init_buff(&xdp, buflen, &rq->xdp_rxq); - xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len; - xdp.data = xdp.data_hard_start + xdp_headroom; - xdp.data_end = xdp.data + len; - xdp.data_meta = xdp.data; + xdp_prepare_buff(&xdp, buf + VIRTNET_RX_PAD + vi->hdr_len, + xdp_headroom, len, true); orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); stats->xdp_packets++; @@ -859,10 +857,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, */ data = page_address(xdp_page) + offset; xdp_init_buff(&xdp, frame_sz - vi->hdr_len, &rq->xdp_rxq); - xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len; - xdp.data = data + vi->hdr_len; - xdp.data_end = xdp.data + (len - vi->hdr_len); - xdp.data_meta = xdp.data; + xdp_prepare_buff(&xdp, data - VIRTIO_XDP_HEADROOM + vi->hdr_len, + VIRTIO_XDP_HEADROOM, len - vi->hdr_len, true); act = bpf_prog_run_xdp(xdp_prog, &xdp); stats->xdp_packets++; diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 329397c60d84..c20b78120bb4 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -866,10 +866,8 @@ static u32 xennet_run_xdp(struct netfront_queue *queue, struct page *pdata, xdp_init_buff(xdp, XEN_PAGE_SIZE - XDP_PACKET_HEADROOM, &queue->xdp_rxq); - xdp->data_hard_start = page_address(pdata); - xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; - xdp_set_data_meta_invalid(xdp); - xdp->data_end = xdp->data + len; + xdp_prepare_buff(xdp, page_address(pdata), XDP_PACKET_HEADROOM, + len, false); act = bpf_prog_run_xdp(prog, xdp); switch (act) { diff --git a/include/net/xdp.h b/include/net/xdp.h index 8a589f7e08e0..0cf3976ce77c 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -83,6 +83,18 @@ xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) xdp->rxq = rxq; } +static __always_inline void +xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, + int headroom, int data_len, const bool meta_valid) +{ + unsigned char *data = hard_start + headroom; + + xdp->data_hard_start = hard_start; + xdp->data = data; + xdp->data_end = data + data_len; + xdp->data_meta = meta_valid ? data : data + 1; +} + /* Reserve memory area at end-of data area. * * This macro reserves tailroom in the XDP buffer by limiting the diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index a8fa5a9e4137..23dfb2010ba6 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -636,14 +636,11 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, if (IS_ERR(data)) return PTR_ERR(data); - xdp.data_hard_start = data; - xdp.data = data + headroom; - xdp.data_meta = xdp.data; - xdp.data_end = xdp.data + size; - rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); xdp_init_buff(&xdp, headroom + max_data_sz + tailroom, &rxqueue->xdp_rxq); + xdp_prepare_buff(&xdp, data, headroom, size, true); + bpf_prog_change_xdp(NULL, prog); ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); if (ret) diff --git a/net/core/dev.c b/net/core/dev.c index e6d758a3c2a9..55499b017a42 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4603,14 +4603,14 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { + void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; - void *orig_data, *orig_data_end; u32 metalen, act = XDP_DROP; u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; bool orig_bcast; - int hlen, off; + int off; /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. @@ -4642,25 +4642,23 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, * header. */ mac_len = skb->data - skb_mac_header(skb); - hlen = skb_headlen(skb) + mac_len; - xdp->data = skb->data - mac_len; - xdp->data_meta = xdp->data; - xdp->data_end = xdp->data + hlen; - xdp->data_hard_start = skb->data - skb_headroom(skb); + hard_start = skb->data - skb_headroom(skb); /* SKB "head" area always have tailroom for skb_shared_info */ - frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start; + frame_sz = (void *)skb_end_pointer(skb) - hard_start; frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + rxqueue = netif_get_rxqueue(skb); + xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); + xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, + skb_headlen(skb) + mac_len, true); + orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); orig_eth_type = eth->h_proto; - rxqueue = netif_get_rxqueue(skb); - xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); - act = bpf_prog_run_xdp(xdp_prog, xdp); /* check if bpf_xdp_adjust_head was used */ -- cgit v1.2.3 From 9a8120a8d7eb872da43e61d598c226f0d6b7ce5f Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Tue, 5 Jan 2021 07:20:47 -0800 Subject: selftests/bpf: Remove duplicate include in test_lsm 'unistd.h' included in 'selftests/bpf/prog_tests/test_lsm.c' is duplicated. Signed-off-by: Menglong Dong Signed-off-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210105152047.6070-1-dong.menglong@zte.com.cn Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/test_lsm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_lsm.c b/tools/testing/selftests/bpf/prog_tests/test_lsm.c index 6ab29226c99b..2755e4f81499 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_lsm.c +++ b/tools/testing/selftests/bpf/prog_tests/test_lsm.c @@ -10,7 +10,6 @@ #include #include #include -#include #include "lsm.skel.h" -- cgit v1.2.3 From e22d7f05e445165e58feddb4e40cc9c0f94453bc Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 8 Jan 2021 11:44:08 -0800 Subject: libbpf: Clarify kernel type use with USER variants of CORE reading macros Add comments clarifying that USER variants of CO-RE reading macro are still only going to work with kernel types, defined in kernel or kernel module BTF. This should help preventing invalid use of those macro to read user-defined types (which doesn't work with CO-RE). Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210108194408.3468860-1-andrii@kernel.org --- tools/lib/bpf/bpf_core_read.h | 45 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h index 9456aabcb03a..53b3e199fb25 100644 --- a/tools/lib/bpf/bpf_core_read.h +++ b/tools/lib/bpf/bpf_core_read.h @@ -197,6 +197,7 @@ enum bpf_enum_value_kind { #define bpf_core_read(dst, sz, src) \ bpf_probe_read_kernel(dst, sz, (const void *)__builtin_preserve_access_index(src)) +/* NOTE: see comments for BPF_CORE_READ_USER() about the proper types use. */ #define bpf_core_read_user(dst, sz, src) \ bpf_probe_read_user(dst, sz, (const void *)__builtin_preserve_access_index(src)) /* @@ -207,6 +208,7 @@ enum bpf_enum_value_kind { #define bpf_core_read_str(dst, sz, src) \ bpf_probe_read_kernel_str(dst, sz, (const void *)__builtin_preserve_access_index(src)) +/* NOTE: see comments for BPF_CORE_READ_USER() about the proper types use. */ #define bpf_core_read_user_str(dst, sz, src) \ bpf_probe_read_user_str(dst, sz, (const void *)__builtin_preserve_access_index(src)) @@ -302,7 +304,11 @@ enum bpf_enum_value_kind { dst, (src), a, ##__VA_ARGS__) \ }) -/* Variant of BPF_CORE_READ_INTO() for reading from user-space memory */ +/* + * Variant of BPF_CORE_READ_INTO() for reading from user-space memory. + * + * NOTE: see comments for BPF_CORE_READ_USER() about the proper types use. + */ #define BPF_CORE_READ_USER_INTO(dst, src, a, ...) ({ \ ___core_read(bpf_core_read_user, bpf_core_read_user, \ dst, (src), a, ##__VA_ARGS__) \ @@ -314,7 +320,11 @@ enum bpf_enum_value_kind { dst, (src), a, ##__VA_ARGS__) \ }) -/* Non-CO-RE variant of BPF_CORE_READ_USER_INTO() */ +/* Non-CO-RE variant of BPF_CORE_READ_USER_INTO(). + * + * As no CO-RE relocations are emitted, source types can be arbitrary and are + * not restricted to kernel types only. + */ #define BPF_PROBE_READ_USER_INTO(dst, src, a, ...) ({ \ ___core_read(bpf_probe_read_user, bpf_probe_read_user, \ dst, (src), a, ##__VA_ARGS__) \ @@ -330,7 +340,11 @@ enum bpf_enum_value_kind { dst, (src), a, ##__VA_ARGS__) \ }) -/* Variant of BPF_CORE_READ_STR_INTO() for reading from user-space memory */ +/* + * Variant of BPF_CORE_READ_STR_INTO() for reading from user-space memory. + * + * NOTE: see comments for BPF_CORE_READ_USER() about the proper types use. + */ #define BPF_CORE_READ_USER_STR_INTO(dst, src, a, ...) ({ \ ___core_read(bpf_core_read_user_str, bpf_core_read_user, \ dst, (src), a, ##__VA_ARGS__) \ @@ -342,7 +356,12 @@ enum bpf_enum_value_kind { dst, (src), a, ##__VA_ARGS__) \ }) -/* Non-CO-RE variant of BPF_CORE_READ_USER_STR_INTO() */ +/* + * Non-CO-RE variant of BPF_CORE_READ_USER_STR_INTO(). + * + * As no CO-RE relocations are emitted, source types can be arbitrary and are + * not restricted to kernel types only. + */ #define BPF_PROBE_READ_USER_STR_INTO(dst, src, a, ...) ({ \ ___core_read(bpf_probe_read_user_str, bpf_probe_read_user, \ dst, (src), a, ##__VA_ARGS__) \ @@ -378,7 +397,16 @@ enum bpf_enum_value_kind { __r; \ }) -/* Variant of BPF_CORE_READ() for reading from user-space memory */ +/* + * Variant of BPF_CORE_READ() for reading from user-space memory. + * + * NOTE: all the source types involved are still *kernel types* and need to + * exist in kernel (or kernel module) BTF, otherwise CO-RE relocation will + * fail. Custom user types are not relocatable with CO-RE. + * The typical situation in which BPF_CORE_READ_USER() might be used is to + * read kernel UAPI types from the user-space memory passed in as a syscall + * input argument. + */ #define BPF_CORE_READ_USER(src, a, ...) ({ \ ___type((src), a, ##__VA_ARGS__) __r; \ BPF_CORE_READ_USER_INTO(&__r, (src), a, ##__VA_ARGS__); \ @@ -392,7 +420,12 @@ enum bpf_enum_value_kind { __r; \ }) -/* Non-CO-RE variant of BPF_CORE_READ_USER() */ +/* + * Non-CO-RE variant of BPF_CORE_READ_USER(). + * + * As no CO-RE relocations are emitted, source types can be arbitrary and are + * not restricted to kernel types only. + */ #define BPF_PROBE_READ_USER(src, a, ...) ({ \ ___type((src), a, ##__VA_ARGS__) __r; \ BPF_PROBE_READ_USER_INTO(&__r, (src), a, ##__VA_ARGS__); \ -- cgit v1.2.3 From c6458e72f6fd6ac7e390da0d9abe8446084886e5 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Tue, 12 Jan 2021 12:34:22 +0000 Subject: bpf: Clarify return value of probe str helpers When the buffer is too small to contain the input string, these helpers return the length of the buffer, not the length of the original string. This tries to make the docs totally clear about that, since "the length of the [copied ]string" could also refer to the length of the input. Signed-off-by: Brendan Jackman Signed-off-by: Daniel Borkmann Acked-by: KP Singh Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210112123422.2011234-1-jackmanb@google.com --- include/uapi/linux/bpf.h | 10 +++++----- tools/include/uapi/linux/bpf.h | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 77d7c1bb2923..a1ad32456f89 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2993,10 +2993,10 @@ union bpf_attr { * string length is larger than *size*, just *size*-1 bytes are * copied and the last byte is set to NUL. * - * On success, the length of the copied string is returned. This - * makes this helper useful in tracing programs for reading - * strings, and more importantly to get its length at runtime. See - * the following snippet: + * On success, returns the number of bytes that were written, + * including the terminal NUL. This makes this helper useful in + * tracing programs for reading strings, and more importantly to + * get its length at runtime. See the following snippet: * * :: * @@ -3024,7 +3024,7 @@ union bpf_attr { * **->mm->env_start**: using this helper and the return value, * one can quickly iterate at the right offset of the memory area. * Return - * On success, the strictly positive length of the string, + * On success, the strictly positive length of the output string, * including the trailing NUL character. On error, a negative * value. * diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 77d7c1bb2923..a1ad32456f89 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2993,10 +2993,10 @@ union bpf_attr { * string length is larger than *size*, just *size*-1 bytes are * copied and the last byte is set to NUL. * - * On success, the length of the copied string is returned. This - * makes this helper useful in tracing programs for reading - * strings, and more importantly to get its length at runtime. See - * the following snippet: + * On success, returns the number of bytes that were written, + * including the terminal NUL. This makes this helper useful in + * tracing programs for reading strings, and more importantly to + * get its length at runtime. See the following snippet: * * :: * @@ -3024,7 +3024,7 @@ union bpf_attr { * **->mm->env_start**: using this helper and the return value, * one can quickly iterate at the right offset of the memory area. * Return - * On success, the strictly positive length of the string, + * On success, the strictly positive length of the output string, * including the trailing NUL character. On error, a negative * value. * -- cgit v1.2.3 From 28a8add64181059034b7f281491132112cd95bb4 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Tue, 12 Jan 2021 12:39:13 +0000 Subject: bpf: Fix a verifier message for alloc size helper arg The error message here is misleading, the argument will be rejected unless it is a known constant. Signed-off-by: Brendan Jackman Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210112123913.2016804-1-jackmanb@google.com --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 17270b8404f1..5534e667bdb1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4319,7 +4319,7 @@ skip_type_check: err = mark_chain_precision(env, regno); } else if (arg_type_is_alloc_size(arg_type)) { if (!tnum_is_const(reg->var_off)) { - verbose(env, "R%d unbounded size, use 'var &= const' or 'if (var < const)'\n", + verbose(env, "R%d is not a known constant'\n", regno); return -EACCES; } -- cgit v1.2.3 From bcd6f4a8bedabac6949d05e418e73a0a1b309e84 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 12 Jan 2021 00:09:39 +0100 Subject: bpf: Allow to retrieve sol_socket opts from sock_addr progs The _bpf_setsockopt() is able to set some of the SOL_SOCKET level options, however, _bpf_getsockopt() has little support to actually retrieve them. This small patch adds few misc options such as SO_MARK, SO_PRIORITY and SO_BINDTOIFINDEX. For the latter getter and setter are added. The mark and priority in particular allow to retrieve the options from BPF cgroup hooks to then implement custom behavior / settings on the syscall hooks compared to other sockets that stick to the defaults, for example. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/cba44439b801e5ddc1170e5be787f4dc93a2d7f9.1610406333.git.daniel@iogearbox.net --- net/core/filter.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 255aeee72402..9ab94e90d660 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4770,6 +4770,10 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, ifindex = dev->ifindex; dev_put(dev); } + fallthrough; + case SO_BINDTOIFINDEX: + if (optname == SO_BINDTOIFINDEX) + ifindex = val; ret = sock_bindtoindex(sk, ifindex, false); break; case SO_KEEPALIVE: @@ -4932,8 +4936,25 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, sock_owned_by_me(sk); + if (level == SOL_SOCKET) { + if (optlen != sizeof(int)) + goto err_clear; + + switch (optname) { + case SO_MARK: + *((int *)optval) = sk->sk_mark; + break; + case SO_PRIORITY: + *((int *)optval) = sk->sk_priority; + break; + case SO_BINDTOIFINDEX: + *((int *)optval) = sk->sk_bound_dev_if; + break; + default: + goto err_clear; + } #ifdef CONFIG_INET - if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { + } else if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { struct inet_connection_sock *icsk; struct tcp_sock *tp; @@ -4986,12 +5007,12 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, default: goto err_clear; } +#endif #endif } else { goto err_clear; } return 0; -#endif err_clear: memset(optval, 0, optlen); return -EINVAL; -- cgit v1.2.3 From 3218231dbb16cd85d94831759b6c78e6feb54b58 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 12 Jan 2021 00:09:40 +0100 Subject: bpf: Extend bind v4/v6 selftests for mark/prio/bindtoifindex Extend existing cgroup bind4/bind6 tests to add coverage for setting and retrieving SO_MARK, SO_PRIORITY and SO_BINDTOIFINDEX at the bind hook. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/384fdc90e5fa83f8335a37aa90fa2f5f3661929c.1610406333.git.daniel@iogearbox.net --- tools/testing/selftests/bpf/progs/bind4_prog.c | 42 +++++++++++++++++++++++--- tools/testing/selftests/bpf/progs/bind6_prog.c | 42 +++++++++++++++++++++++--- 2 files changed, 76 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/bind4_prog.c b/tools/testing/selftests/bpf/progs/bind4_prog.c index c6520f21f5f5..115a3b0ad984 100644 --- a/tools/testing/selftests/bpf/progs/bind4_prog.c +++ b/tools/testing/selftests/bpf/progs/bind4_prog.c @@ -29,18 +29,48 @@ static __inline int bind_to_device(struct bpf_sock_addr *ctx) char veth2[IFNAMSIZ] = "test_sock_addr2"; char missing[IFNAMSIZ] = "nonexistent_dev"; char del_bind[IFNAMSIZ] = ""; + int veth1_idx, veth2_idx; if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, - &veth1, sizeof(veth1))) + &veth1, sizeof(veth1))) + return 1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX, + &veth1_idx, sizeof(veth1_idx)) || !veth1_idx) return 1; if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, - &veth2, sizeof(veth2))) + &veth2, sizeof(veth2))) + return 1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX, + &veth2_idx, sizeof(veth2_idx)) || !veth2_idx || + veth1_idx == veth2_idx) return 1; if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, - &missing, sizeof(missing)) != -ENODEV) + &missing, sizeof(missing)) != -ENODEV) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX, + &veth1_idx, sizeof(veth1_idx))) return 1; if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, - &del_bind, sizeof(del_bind))) + &del_bind, sizeof(del_bind))) + return 1; + + return 0; +} + +static __inline int misc_opts(struct bpf_sock_addr *ctx, int opt) +{ + int old, tmp, new = 0xeb9f; + + /* Socket in test case has guarantee that old never equals to new. */ + if (bpf_getsockopt(ctx, SOL_SOCKET, opt, &old, sizeof(old)) || + old == new) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, opt, &new, sizeof(new))) + return 1; + if (bpf_getsockopt(ctx, SOL_SOCKET, opt, &tmp, sizeof(tmp)) || + tmp != new) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, opt, &old, sizeof(old))) return 1; return 0; @@ -93,6 +123,10 @@ int bind_v4_prog(struct bpf_sock_addr *ctx) if (bind_to_device(ctx)) return 0; + /* Test for misc socket options. */ + if (misc_opts(ctx, SO_MARK) || misc_opts(ctx, SO_PRIORITY)) + return 0; + ctx->user_ip4 = bpf_htonl(SERV4_REWRITE_IP); ctx->user_port = bpf_htons(SERV4_REWRITE_PORT); diff --git a/tools/testing/selftests/bpf/progs/bind6_prog.c b/tools/testing/selftests/bpf/progs/bind6_prog.c index 4358e44dcf47..4c0d348034b9 100644 --- a/tools/testing/selftests/bpf/progs/bind6_prog.c +++ b/tools/testing/selftests/bpf/progs/bind6_prog.c @@ -35,18 +35,48 @@ static __inline int bind_to_device(struct bpf_sock_addr *ctx) char veth2[IFNAMSIZ] = "test_sock_addr2"; char missing[IFNAMSIZ] = "nonexistent_dev"; char del_bind[IFNAMSIZ] = ""; + int veth1_idx, veth2_idx; if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, - &veth1, sizeof(veth1))) + &veth1, sizeof(veth1))) + return 1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX, + &veth1_idx, sizeof(veth1_idx)) || !veth1_idx) return 1; if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, - &veth2, sizeof(veth2))) + &veth2, sizeof(veth2))) + return 1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX, + &veth2_idx, sizeof(veth2_idx)) || !veth2_idx || + veth1_idx == veth2_idx) return 1; if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, - &missing, sizeof(missing)) != -ENODEV) + &missing, sizeof(missing)) != -ENODEV) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX, + &veth1_idx, sizeof(veth1_idx))) return 1; if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, - &del_bind, sizeof(del_bind))) + &del_bind, sizeof(del_bind))) + return 1; + + return 0; +} + +static __inline int misc_opts(struct bpf_sock_addr *ctx, int opt) +{ + int old, tmp, new = 0xeb9f; + + /* Socket in test case has guarantee that old never equals to new. */ + if (bpf_getsockopt(ctx, SOL_SOCKET, opt, &old, sizeof(old)) || + old == new) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, opt, &new, sizeof(new))) + return 1; + if (bpf_getsockopt(ctx, SOL_SOCKET, opt, &tmp, sizeof(tmp)) || + tmp != new) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, opt, &old, sizeof(old))) return 1; return 0; @@ -107,6 +137,10 @@ int bind_v6_prog(struct bpf_sock_addr *ctx) if (bind_to_device(ctx)) return 0; + /* Test for misc socket options. */ + if (misc_opts(ctx, SO_MARK) || misc_opts(ctx, SO_PRIORITY)) + return 0; + ctx->user_ip6[0] = bpf_htonl(SERV6_REWRITE_IP_0); ctx->user_ip6[1] = bpf_htonl(SERV6_REWRITE_IP_1); ctx->user_ip6[2] = bpf_htonl(SERV6_REWRITE_IP_2); -- cgit v1.2.3 From a643bff752dcf72a07e1b2ab2f8587e4f51118be Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 11 Jan 2021 23:55:14 -0800 Subject: bpf: Add bpf_patch_call_args prototype to include/linux/bpf.h Add bpf_patch_call_args() prototype. This function is called from BPF verifier and only if CONFIG_BPF_JIT_ALWAYS_ON is not defined. This fixes compiler warning about missing prototype in some kernel configurations. Fixes: 1ea47e01ad6e ("bpf: add support for bpf_call to interpreter") Reported-by: kernel test robot Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210112075520.4103414-2-andrii@kernel.org --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 07cb5d15e743..ef9309604b3e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1403,7 +1403,10 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, union bpf_attr __user *uattr); + +#ifndef CONFIG_BPF_JIT_ALWAYS_ON void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); +#endif struct btf *bpf_get_btf_vmlinux(void); -- cgit v1.2.3 From 6943c2b05bf09fd5c5729f7d7d803bf3f126cb9a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 11 Jan 2021 23:55:15 -0800 Subject: bpf: Avoid warning when re-casting __bpf_call_base into __bpf_call_base_args BPF interpreter uses extra input argument, so re-casts __bpf_call_base into __bpf_call_base_args. Avoid compiler warning about incompatible function prototypes by casting to void * first. Fixes: 1ea47e01ad6e ("bpf: add support for bpf_call to interpreter") Reported-by: kernel test robot Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210112075520.4103414-3-andrii@kernel.org --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 29c27656165b..5edf2b660881 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -886,7 +886,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); #define __bpf_call_base_args \ ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \ - __bpf_call_base) + (void *)__bpf_call_base) struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); -- cgit v1.2.3 From 936f8946bdb48239f4292812d4d2e26c6d328c95 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 11 Jan 2021 23:55:16 -0800 Subject: bpf: Declare __bpf_free_used_maps() unconditionally __bpf_free_used_maps() is always defined in kernel/bpf/core.c, while include/linux/bpf.h is guarding it behind CONFIG_BPF_SYSCALL. Move it out of that guard region and fix compiler warning. Fixes: a2ea07465c8d ("bpf: Fix missing prog untrack in release_maps") Reported-by: kernel test robot Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210112075520.4103414-4-andrii@kernel.org --- include/linux/bpf.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ef9309604b3e..6e585dbc10df 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1206,8 +1206,6 @@ void bpf_prog_sub(struct bpf_prog *prog, int i); void bpf_prog_inc(struct bpf_prog *prog); struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); -void __bpf_free_used_maps(struct bpf_prog_aux *aux, - struct bpf_map **used_maps, u32 len); void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); @@ -1676,6 +1674,9 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, return bpf_prog_get_type_dev(ufd, type, false); } +void __bpf_free_used_maps(struct bpf_prog_aux *aux, + struct bpf_map **used_maps, u32 len); + bool bpf_prog_get_ok(struct bpf_prog *, enum bpf_prog_type *, bool); int bpf_prog_offload_compile(struct bpf_prog *prog); -- cgit v1.2.3 From 635599bace259a2c42741c3ea61bfa7be6f15556 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 11 Jan 2021 23:55:17 -0800 Subject: selftests/bpf: Sync RCU before unloading bpf_testmod If some of the subtests use module BTFs through ksyms, they will cause bpf_prog to take a refcount on bpf_testmod module, which will prevent it from successfully unloading. Module's refcnt is decremented when bpf_prog is freed, which generally happens in RCU callback. So we need to trigger syncronize_rcu() in the kernel, which can be achieved nicely with membarrier(MEMBARRIER_CMD_SHARED) or membarrier(MEMBARRIER_CMD_GLOBAL) syscall. So do that in kernel_sync_rcu() and make it available to other test inside the test_progs. This synchronize_rcu() is called before attempting to unload bpf_testmod. Fixes: 9f7fa225894c ("selftests/bpf: Add bpf_testmod kernel module for testing") Suggested-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Hao Luo Link: https://lore.kernel.org/bpf/20210112075520.4103414-5-andrii@kernel.org --- .../selftests/bpf/prog_tests/btf_map_in_map.c | 33 ---------------------- tools/testing/selftests/bpf/test_progs.c | 11 ++++++++ tools/testing/selftests/bpf/test_progs.h | 1 + 3 files changed, 12 insertions(+), 33 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c index 76ebe4c250f1..eb90a6b8850d 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c @@ -20,39 +20,6 @@ static __u32 bpf_map_id(struct bpf_map *map) return info.id; } -/* - * Trigger synchronize_rcu() in kernel. - * - * ARRAY_OF_MAPS/HASH_OF_MAPS lookup/update operations trigger synchronize_rcu() - * if looking up an existing non-NULL element or updating the map with a valid - * inner map FD. Use this fact to trigger synchronize_rcu(): create map-in-map, - * create a trivial ARRAY map, update map-in-map with ARRAY inner map. Then - * cleanup. At the end, at least one synchronize_rcu() would be called. - */ -static int kern_sync_rcu(void) -{ - int inner_map_fd, outer_map_fd, err, zero = 0; - - inner_map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 4, 1, 0); - if (CHECK(inner_map_fd < 0, "inner_map_create", "failed %d\n", -errno)) - return -1; - - outer_map_fd = bpf_create_map_in_map(BPF_MAP_TYPE_ARRAY_OF_MAPS, NULL, - sizeof(int), inner_map_fd, 1, 0); - if (CHECK(outer_map_fd < 0, "outer_map_create", "failed %d\n", -errno)) { - close(inner_map_fd); - return -1; - } - - err = bpf_map_update_elem(outer_map_fd, &zero, &inner_map_fd, 0); - if (err) - err = -errno; - CHECK(err, "outer_map_update", "failed %d\n", err); - close(inner_map_fd); - close(outer_map_fd); - return err; -} - static void test_lookup_update(void) { int map1_fd, map2_fd, map3_fd, map4_fd, map5_fd, map1_id, map2_id; diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 7d077d48cadd..213628ee721c 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -11,6 +11,7 @@ #include #include #include /* backtrace */ +#include #define EXIT_NO_TEST 2 #define EXIT_ERR_SETUP_INFRA 3 @@ -370,8 +371,18 @@ static int delete_module(const char *name, int flags) return syscall(__NR_delete_module, name, flags); } +/* + * Trigger synchronize_rcu() in kernel. + */ +int kern_sync_rcu(void) +{ + return syscall(__NR_membarrier, MEMBARRIER_CMD_SHARED, 0, 0); +} + static void unload_bpf_testmod(void) { + if (kern_sync_rcu()) + fprintf(env.stderr, "Failed to trigger kernel-side RCU sync!\n"); if (delete_module("bpf_testmod", 0)) { if (errno == ENOENT) { if (env.verbosity > VERBOSE_NONE) diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 115953243f62..e49e2fdde942 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -219,6 +219,7 @@ int bpf_find_map(const char *test, struct bpf_object *obj, const char *name); int compare_map_keys(int map1_fd, int map2_fd); int compare_stack_ips(int smap_fd, int amap_fd, int stack_trace_len); int extract_build_id(char *build_id, size_t size); +int kern_sync_rcu(void); #ifdef __x86_64__ #define SYS_NANOSLEEP_KPROBE_NAME "__x64_sys_nanosleep" -- cgit v1.2.3 From 541c3bad8dc51b253ba8686d0cd7628e6b9b5f4c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 11 Jan 2021 23:55:18 -0800 Subject: bpf: Support BPF ksym variables in kernel modules Add support for directly accessing kernel module variables from BPF programs using special ldimm64 instructions. This functionality builds upon vmlinux ksym support, but extends ldimm64 with src_reg=BPF_PSEUDO_BTF_ID to allow specifying kernel module BTF's FD in insn[1].imm field. During BPF program load time, verifier will resolve FD to BTF object and will take reference on BTF object itself and, for module BTFs, corresponding module as well, to make sure it won't be unloaded from under running BPF program. The mechanism used is similar to how bpf_prog keeps track of used bpf_maps. One interesting change is also in how per-CPU variable is determined. The logic is to find .data..percpu data section in provided BTF, but both vmlinux and module each have their own .data..percpu entries in BTF. So for module's case, the search for DATASEC record needs to look at only module's added BTF types. This is implemented with custom search function. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Hao Luo Link: https://lore.kernel.org/bpf/20210112075520.4103414-6-andrii@kernel.org --- include/linux/bpf.h | 10 +++ include/linux/bpf_verifier.h | 3 + include/linux/btf.h | 3 + kernel/bpf/btf.c | 31 ++++++++- kernel/bpf/core.c | 23 +++++++ kernel/bpf/verifier.c | 154 +++++++++++++++++++++++++++++++++++-------- 6 files changed, 194 insertions(+), 30 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6e585dbc10df..1aac2af12fed 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -761,9 +761,15 @@ struct bpf_ctx_arg_aux { u32 btf_id; }; +struct btf_mod_pair { + struct btf *btf; + struct module *module; +}; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; + u32 used_btf_cnt; u32 max_ctx_offset; u32 max_pkt_offset; u32 max_tp_access; @@ -802,6 +808,7 @@ struct bpf_prog_aux { const struct bpf_prog_ops *ops; struct bpf_map **used_maps; struct mutex used_maps_mutex; /* mutex for used_maps and used_map_cnt */ + struct btf_mod_pair *used_btfs; struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ @@ -1668,6 +1675,9 @@ bpf_base_func_proto(enum bpf_func_id func_id) } #endif /* CONFIG_BPF_SYSCALL */ +void __bpf_free_used_btfs(struct bpf_prog_aux *aux, + struct btf_mod_pair *used_btfs, u32 len); + static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e941fe1484e5..dfe6f85d97dd 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -340,6 +340,7 @@ struct bpf_insn_aux_data { }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ +#define MAX_USED_BTFS 64 /* max number of BTFs accessed by one BPF program */ #define BPF_VERIFIER_TMP_LOG_SIZE 1024 @@ -398,7 +399,9 @@ struct bpf_verifier_env { struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_verifier_state_list *free_list; struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ + struct btf_mod_pair used_btfs[MAX_USED_BTFS]; /* array of BTF's used by BPF program */ u32 used_map_cnt; /* number of used maps */ + u32 used_btf_cnt; /* number of used BTF objects */ u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; bool allow_ptr_to_map_access; diff --git a/include/linux/btf.h b/include/linux/btf.h index 4c200f5d242b..7fabf1428093 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -91,6 +91,9 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, int btf_get_fd_by_id(u32 id); u32 btf_obj_id(const struct btf *btf); bool btf_is_kernel(const struct btf *btf); +bool btf_is_module(const struct btf *btf); +struct module *btf_try_get_module(const struct btf *btf); +u32 btf_nr_types(const struct btf *btf); bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8d6bdb4f4d61..7ccc0133723a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -458,7 +458,7 @@ static bool btf_type_is_datasec(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; } -static u32 btf_nr_types_total(const struct btf *btf) +u32 btf_nr_types(const struct btf *btf) { u32 total = 0; @@ -476,7 +476,7 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) const char *tname; u32 i, total; - total = btf_nr_types_total(btf); + total = btf_nr_types(btf); for (i = 1; i < total; i++) { t = btf_type_by_id(btf, i); if (BTF_INFO_KIND(t->info) != kind) @@ -5743,6 +5743,11 @@ bool btf_is_kernel(const struct btf *btf) return btf->kernel_btf; } +bool btf_is_module(const struct btf *btf) +{ + return btf->kernel_btf && strcmp(btf->name, "vmlinux") != 0; +} + static int btf_id_cmp_func(const void *a, const void *b) { const int *pa = a, *pb = b; @@ -5877,3 +5882,25 @@ static int __init btf_module_init(void) fs_initcall(btf_module_init); #endif /* CONFIG_DEBUG_INFO_BTF_MODULES */ + +struct module *btf_try_get_module(const struct btf *btf) +{ + struct module *res = NULL; +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + struct btf_module *btf_mod, *tmp; + + mutex_lock(&btf_module_mutex); + list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { + if (btf_mod->btf != btf) + continue; + + if (try_module_get(btf_mod->module)) + res = btf_mod->module; + + break; + } + mutex_unlock(&btf_module_mutex); +#endif + + return res; +} diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 261f8692d0d2..69c3c308de5e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2119,6 +2119,28 @@ static void bpf_free_used_maps(struct bpf_prog_aux *aux) kfree(aux->used_maps); } +void __bpf_free_used_btfs(struct bpf_prog_aux *aux, + struct btf_mod_pair *used_btfs, u32 len) +{ +#ifdef CONFIG_BPF_SYSCALL + struct btf_mod_pair *btf_mod; + u32 i; + + for (i = 0; i < len; i++) { + btf_mod = &used_btfs[i]; + if (btf_mod->module) + module_put(btf_mod->module); + btf_put(btf_mod->btf); + } +#endif +} + +static void bpf_free_used_btfs(struct bpf_prog_aux *aux) +{ + __bpf_free_used_btfs(aux, aux->used_btfs, aux->used_btf_cnt); + kfree(aux->used_btfs); +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; @@ -2126,6 +2148,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux = container_of(work, struct bpf_prog_aux, work); bpf_free_used_maps(aux); + bpf_free_used_btfs(aux); if (bpf_prog_is_dev_bound(aux)) bpf_prog_offload_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5534e667bdb1..ae2aee48cf82 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9703,6 +9703,36 @@ process_bpf_exit: return 0; } +static int find_btf_percpu_datasec(struct btf *btf) +{ + const struct btf_type *t; + const char *tname; + int i, n; + + /* + * Both vmlinux and module each have their own ".data..percpu" + * DATASECs in BTF. So for module's case, we need to skip vmlinux BTF + * types to look at only module's own BTF types. + */ + n = btf_nr_types(btf); + if (btf_is_module(btf)) + i = btf_nr_types(btf_vmlinux); + else + i = 1; + + for(; i < n; i++) { + t = btf_type_by_id(btf, i); + if (BTF_INFO_KIND(t->info) != BTF_KIND_DATASEC) + continue; + + tname = btf_name_by_offset(btf, t->name_off); + if (!strcmp(tname, ".data..percpu")) + return i; + } + + return -ENOENT; +} + /* replace pseudo btf_id with kernel symbol address */ static int check_pseudo_btf_id(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -9710,48 +9740,57 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, { const struct btf_var_secinfo *vsi; const struct btf_type *datasec; + struct btf_mod_pair *btf_mod; const struct btf_type *t; const char *sym_name; bool percpu = false; u32 type, id = insn->imm; + struct btf *btf; s32 datasec_id; u64 addr; - int i; - - if (!btf_vmlinux) { - verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); - return -EINVAL; - } + int i, btf_fd, err; - if (insn[1].imm != 0) { - verbose(env, "reserved field (insn[1].imm) is used in pseudo_btf_id ldimm64 insn.\n"); - return -EINVAL; + btf_fd = insn[1].imm; + if (btf_fd) { + btf = btf_get_by_fd(btf_fd); + if (IS_ERR(btf)) { + verbose(env, "invalid module BTF object FD specified.\n"); + return -EINVAL; + } + } else { + if (!btf_vmlinux) { + verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); + return -EINVAL; + } + btf = btf_vmlinux; + btf_get(btf); } - t = btf_type_by_id(btf_vmlinux, id); + t = btf_type_by_id(btf, id); if (!t) { verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id); - return -ENOENT; + err = -ENOENT; + goto err_put; } if (!btf_type_is_var(t)) { - verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", - id); - return -EINVAL; + verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", id); + err = -EINVAL; + goto err_put; } - sym_name = btf_name_by_offset(btf_vmlinux, t->name_off); + sym_name = btf_name_by_offset(btf, t->name_off); addr = kallsyms_lookup_name(sym_name); if (!addr) { verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n", sym_name); - return -ENOENT; + err = -ENOENT; + goto err_put; } - datasec_id = btf_find_by_name_kind(btf_vmlinux, ".data..percpu", - BTF_KIND_DATASEC); + datasec_id = find_btf_percpu_datasec(btf); if (datasec_id > 0) { - datasec = btf_type_by_id(btf_vmlinux, datasec_id); + datasec = btf_type_by_id(btf, datasec_id); for_each_vsi(i, datasec, vsi) { if (vsi->type == id) { percpu = true; @@ -9764,10 +9803,10 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, insn[1].imm = addr >> 32; type = t->type; - t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); + t = btf_type_skip_modifiers(btf, type, NULL); if (percpu) { aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; - aux->btf_var.btf = btf_vmlinux; + aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } else if (!btf_type_is_struct(t)) { const struct btf_type *ret; @@ -9775,21 +9814,54 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, u32 tsize; /* resolve the type size of ksym. */ - ret = btf_resolve_size(btf_vmlinux, t, &tsize); + ret = btf_resolve_size(btf, t, &tsize); if (IS_ERR(ret)) { - tname = btf_name_by_offset(btf_vmlinux, t->name_off); + tname = btf_name_by_offset(btf, t->name_off); verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n", tname, PTR_ERR(ret)); - return -EINVAL; + err = -EINVAL; + goto err_put; } aux->btf_var.reg_type = PTR_TO_MEM; aux->btf_var.mem_size = tsize; } else { aux->btf_var.reg_type = PTR_TO_BTF_ID; - aux->btf_var.btf = btf_vmlinux; + aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } + + /* check whether we recorded this BTF (and maybe module) already */ + for (i = 0; i < env->used_btf_cnt; i++) { + if (env->used_btfs[i].btf == btf) { + btf_put(btf); + return 0; + } + } + + if (env->used_btf_cnt >= MAX_USED_BTFS) { + err = -E2BIG; + goto err_put; + } + + btf_mod = &env->used_btfs[env->used_btf_cnt]; + btf_mod->btf = btf; + btf_mod->module = NULL; + + /* if we reference variables from kernel module, bump its refcount */ + if (btf_is_module(btf)) { + btf_mod->module = btf_try_get_module(btf); + if (!btf_mod->module) { + err = -ENXIO; + goto err_put; + } + } + + env->used_btf_cnt++; + return 0; +err_put: + btf_put(btf); + return err; } static int check_map_prealloc(struct bpf_map *map) @@ -10086,6 +10158,13 @@ static void release_maps(struct bpf_verifier_env *env) env->used_map_cnt); } +/* drop refcnt of maps used by the rejected program */ +static void release_btfs(struct bpf_verifier_env *env) +{ + __bpf_free_used_btfs(env->prog->aux, env->used_btfs, + env->used_btf_cnt); +} + /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) { @@ -12098,7 +12177,10 @@ skip_full_check: goto err_release_maps; } - if (ret == 0 && env->used_map_cnt) { + if (ret) + goto err_release_maps; + + if (env->used_map_cnt) { /* if program passed verifier, update used_maps in bpf_prog_info */ env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, sizeof(env->used_maps[0]), @@ -12112,15 +12194,29 @@ skip_full_check: memcpy(env->prog->aux->used_maps, env->used_maps, sizeof(env->used_maps[0]) * env->used_map_cnt); env->prog->aux->used_map_cnt = env->used_map_cnt; + } + if (env->used_btf_cnt) { + /* if program passed verifier, update used_btfs in bpf_prog_aux */ + env->prog->aux->used_btfs = kmalloc_array(env->used_btf_cnt, + sizeof(env->used_btfs[0]), + GFP_KERNEL); + if (!env->prog->aux->used_btfs) { + ret = -ENOMEM; + goto err_release_maps; + } + memcpy(env->prog->aux->used_btfs, env->used_btfs, + sizeof(env->used_btfs[0]) * env->used_btf_cnt); + env->prog->aux->used_btf_cnt = env->used_btf_cnt; + } + if (env->used_map_cnt || env->used_btf_cnt) { /* program is valid. Convert pseudo bpf_ld_imm64 into generic * bpf_ld_imm64 instructions */ convert_pseudo_ld_imm64(env); } - if (ret == 0) - adjust_btf_func(env); + adjust_btf_func(env); err_release_maps: if (!env->prog->aux->used_maps) @@ -12128,6 +12224,8 @@ err_release_maps: * them now. Otherwise free_used_maps() will release them. */ release_maps(env); + if (!env->prog->aux->used_btfs) + release_btfs(env); /* extension progs temporarily inherit the attach_type of their targets for verification purposes, so set it back to zero before returning -- cgit v1.2.3 From 284d2587ea8a96f97ca519a3de683ff226e9e2b3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 11 Jan 2021 23:55:19 -0800 Subject: libbpf: Support kernel module ksym externs Add support for searching for ksym externs not just in vmlinux BTF, but across all module BTFs, similarly to how it's done for CO-RE relocations. Kernels that expose module BTFs through sysfs are assumed to support new ldimm64 instruction extension with BTF FD provided in insn[1].imm field, so no extra feature detection is performed. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Hao Luo Link: https://lore.kernel.org/bpf/20210112075520.4103414-7-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 50 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 6ae748f6ea11..2abbc3800568 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -395,7 +395,8 @@ struct extern_desc { unsigned long long addr; /* target btf_id of the corresponding kernel var. */ - int vmlinux_btf_id; + int kernel_btf_obj_fd; + int kernel_btf_id; /* local btf_id of the ksym extern's type. */ __u32 type_id; @@ -6162,7 +6163,8 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) } else /* EXT_KSYM */ { if (ext->ksym.type_id) { /* typed ksyms */ insn[0].src_reg = BPF_PSEUDO_BTF_ID; - insn[0].imm = ext->ksym.vmlinux_btf_id; + insn[0].imm = ext->ksym.kernel_btf_id; + insn[1].imm = ext->ksym.kernel_btf_obj_fd; } else { /* typeless ksyms */ insn[0].imm = (__u32)ext->ksym.addr; insn[1].imm = ext->ksym.addr >> 32; @@ -7319,7 +7321,8 @@ out: static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj) { struct extern_desc *ext; - int i, id; + struct btf *btf; + int i, j, id, btf_fd, err; for (i = 0; i < obj->nr_extern; i++) { const struct btf_type *targ_var, *targ_type; @@ -7331,10 +7334,25 @@ static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj) if (ext->type != EXT_KSYM || !ext->ksym.type_id) continue; - id = btf__find_by_name_kind(obj->btf_vmlinux, ext->name, - BTF_KIND_VAR); + btf = obj->btf_vmlinux; + btf_fd = 0; + id = btf__find_by_name_kind(btf, ext->name, BTF_KIND_VAR); + if (id == -ENOENT) { + err = load_module_btfs(obj); + if (err) + return err; + + for (j = 0; j < obj->btf_module_cnt; j++) { + btf = obj->btf_modules[j].btf; + /* we assume module BTF FD is always >0 */ + btf_fd = obj->btf_modules[j].fd; + id = btf__find_by_name_kind(btf, ext->name, BTF_KIND_VAR); + if (id != -ENOENT) + break; + } + } if (id <= 0) { - pr_warn("extern (ksym) '%s': failed to find BTF ID in vmlinux BTF.\n", + pr_warn("extern (ksym) '%s': failed to find BTF ID in kernel BTF(s).\n", ext->name); return -ESRCH; } @@ -7343,24 +7361,19 @@ static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj) local_type_id = ext->ksym.type_id; /* find target type_id */ - targ_var = btf__type_by_id(obj->btf_vmlinux, id); - targ_var_name = btf__name_by_offset(obj->btf_vmlinux, - targ_var->name_off); - targ_type = skip_mods_and_typedefs(obj->btf_vmlinux, - targ_var->type, - &targ_type_id); + targ_var = btf__type_by_id(btf, id); + targ_var_name = btf__name_by_offset(btf, targ_var->name_off); + targ_type = skip_mods_and_typedefs(btf, targ_var->type, &targ_type_id); ret = bpf_core_types_are_compat(obj->btf, local_type_id, - obj->btf_vmlinux, targ_type_id); + btf, targ_type_id); if (ret <= 0) { const struct btf_type *local_type; const char *targ_name, *local_name; local_type = btf__type_by_id(obj->btf, local_type_id); - local_name = btf__name_by_offset(obj->btf, - local_type->name_off); - targ_name = btf__name_by_offset(obj->btf_vmlinux, - targ_type->name_off); + local_name = btf__name_by_offset(obj->btf, local_type->name_off); + targ_name = btf__name_by_offset(btf, targ_type->name_off); pr_warn("extern (ksym) '%s': incompatible types, expected [%d] %s %s, but kernel has [%d] %s %s\n", ext->name, local_type_id, @@ -7370,7 +7383,8 @@ static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj) } ext->is_set = true; - ext->ksym.vmlinux_btf_id = id; + ext->ksym.kernel_btf_obj_fd = btf_fd; + ext->ksym.kernel_btf_id = id; pr_debug("extern (ksym) '%s': resolved to [%d] %s %s\n", ext->name, id, btf_kind_str(targ_var), targ_var_name); } -- cgit v1.2.3 From 430d97a8a7bf1500c081ce756ff2ead73d2303c5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 11 Jan 2021 23:55:20 -0800 Subject: selftests/bpf: Test kernel module ksym externs Add per-CPU variable to bpf_testmod.ko and use those from new selftest to validate it works end-to-end. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Hao Luo Link: https://lore.kernel.org/bpf/20210112075520.4103414-8-andrii@kernel.org --- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 3 +++ .../selftests/bpf/prog_tests/ksyms_module.c | 31 ++++++++++++++++++++++ .../selftests/bpf/progs/test_ksyms_module.c | 26 ++++++++++++++++++ 3 files changed, 60 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/ksyms_module.c create mode 100644 tools/testing/selftests/bpf/progs/test_ksyms_module.c diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 2df19d73ca49..0b991e115d1f 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include "bpf_testmod.h" @@ -10,6 +11,8 @@ #define CREATE_TRACE_POINTS #include "bpf_testmod-events.h" +DEFINE_PER_CPU(int, bpf_testmod_ksym_percpu) = 123; + noinline ssize_t bpf_testmod_test_read(struct file *file, struct kobject *kobj, struct bin_attribute *bin_attr, diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_module.c b/tools/testing/selftests/bpf/prog_tests/ksyms_module.c new file mode 100644 index 000000000000..4c232b456479 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/ksyms_module.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include +#include +#include +#include "test_ksyms_module.skel.h" + +static int duration; + +void test_ksyms_module(void) +{ + struct test_ksyms_module* skel; + int err; + + skel = test_ksyms_module__open_and_load(); + if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) + return; + + err = test_ksyms_module__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + goto cleanup; + + usleep(1); + + ASSERT_EQ(skel->bss->triggered, true, "triggered"); + ASSERT_EQ(skel->bss->out_mod_ksym_global, 123, "global_ksym_val"); + +cleanup: + test_ksyms_module__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_module.c b/tools/testing/selftests/bpf/progs/test_ksyms_module.c new file mode 100644 index 000000000000..d6a0b3086b90 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ksyms_module.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" + +#include + +extern const int bpf_testmod_ksym_percpu __ksym; + +int out_mod_ksym_global = 0; +bool triggered = false; + +SEC("raw_tp/sys_enter") +int handler(const void *ctx) +{ + int *val; + __u32 cpu; + + val = (int *)bpf_this_cpu_ptr(&bpf_testmod_ksym_percpu); + out_mod_ksym_global = *val; + triggered = true; + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; -- cgit v1.2.3 From de11ae4f56fdc53474c08b03142860428e6c5655 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 13 Jan 2021 17:33:16 +0100 Subject: selftests/bpf: Enable cross-building Build bpftool and resolve_btfids using the host toolchain when cross-compiling, since they are executed during build to generate the selftests. Add a host build directory in order to build both host and target version of libbpf. Build host tools using $(HOSTCC) defined in Makefile.include. Signed-off-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210113163319.1516382-2-jean-philippe@linaro.org --- tools/testing/selftests/bpf/Makefile | 46 +++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index c51df6b91bef..95ce81513648 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 include ../../../../scripts/Kbuild.include include ../../../scripts/Makefile.arch +include ../../../scripts/Makefile.include CXX ?= $(CROSS_COMPILE)g++ @@ -113,7 +114,15 @@ SCRATCH_DIR := $(OUTPUT)/tools BUILD_DIR := $(SCRATCH_DIR)/build INCLUDE_DIR := $(SCRATCH_DIR)/include BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a -RESOLVE_BTFIDS := $(BUILD_DIR)/resolve_btfids/resolve_btfids +ifneq ($(CROSS_COMPILE),) +HOST_BUILD_DIR := $(BUILD_DIR)/host +HOST_SCRATCH_DIR := $(OUTPUT)/host-tools +else +HOST_BUILD_DIR := $(BUILD_DIR) +HOST_SCRATCH_DIR := $(SCRATCH_DIR) +endif +HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a +RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ @@ -135,6 +144,14 @@ $(notdir $(TEST_GEN_PROGS) \ $(TEST_GEN_PROGS_EXTENDED) \ $(TEST_CUSTOM_PROGS)): %: $(OUTPUT)/% ; +# sort removes libbpf duplicates when not cross-building +MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ + $(HOST_BUILD_DIR)/bpftool $(HOST_BUILD_DIR)/resolve_btfids \ + $(INCLUDE_DIR)) +$(MAKE_DIRS): + $(call msg,MKDIR,,$@) + $(Q)mkdir -p $@ + $(OUTPUT)/%.o: %.c $(call msg,CC,,$@) $(Q)$(CC) $(CFLAGS) -c $(filter %.c,$^) $(LDLIBS) -o $@ @@ -157,7 +174,7 @@ $(OUTPUT)/test_stub.o: test_stub.c $(BPFOBJ) $(call msg,CC,,$@) $(Q)$(CC) -c $(CFLAGS) -o $@ $< -DEFAULT_BPFTOOL := $(SCRATCH_DIR)/sbin/bpftool +DEFAULT_BPFTOOL := $(HOST_SCRATCH_DIR)/sbin/bpftool $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) $(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/runqslower \ @@ -182,10 +199,11 @@ $(OUTPUT)/test_sysctl: cgroup_helpers.c BPFTOOL ?= $(DEFAULT_BPFTOOL) $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ - $(BPFOBJ) | $(BUILD_DIR)/bpftool + $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ - OUTPUT=$(BUILD_DIR)/bpftool/ \ - prefix= DESTDIR=$(SCRATCH_DIR)/ install + CC=$(HOSTCC) LD=$(HOSTLD) \ + OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ + prefix= DESTDIR=$(HOST_SCRATCH_DIR)/ install $(Q)mkdir -p $(BUILD_DIR)/bpftool/Documentation $(Q)RST2MAN_OPTS="--exit-status=1" $(MAKE) $(submake_extras) \ -C $(BPFTOOLDIR)/Documentation \ @@ -198,9 +216,14 @@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \ DESTDIR=$(SCRATCH_DIR) prefix= all install_headers -$(BUILD_DIR)/libbpf $(BUILD_DIR)/bpftool $(BUILD_DIR)/resolve_btfids $(INCLUDE_DIR): - $(call msg,MKDIR,,$@) - $(Q)mkdir -p $@ +ifneq ($(BPFOBJ),$(HOST_BPFOBJ)) +$(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ + ../../../include/uapi/linux/bpf.h \ + | $(INCLUDE_DIR) $(HOST_BUILD_DIR)/libbpf + $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) \ + OUTPUT=$(HOST_BUILD_DIR)/libbpf/ CC=$(HOSTCC) LD=$(HOSTLD) \ + DESTDIR=$(HOST_SCRATCH_DIR)/ prefix= all install_headers +endif $(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) | $(BPFTOOL) $(INCLUDE_DIR) ifeq ($(VMLINUX_H),) @@ -211,7 +234,7 @@ else $(Q)cp "$(VMLINUX_H)" $@ endif -$(RESOLVE_BTFIDS): $(BPFOBJ) | $(BUILD_DIR)/resolve_btfids \ +$(RESOLVE_BTFIDS): $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/resolve_btfids \ $(TOOLSDIR)/bpf/resolve_btfids/main.c \ $(TOOLSDIR)/lib/rbtree.c \ $(TOOLSDIR)/lib/zalloc.c \ @@ -219,7 +242,8 @@ $(RESOLVE_BTFIDS): $(BPFOBJ) | $(BUILD_DIR)/resolve_btfids \ $(TOOLSDIR)/lib/ctype.c \ $(TOOLSDIR)/lib/str_error_r.c $(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/resolve_btfids \ - OUTPUT=$(BUILD_DIR)/resolve_btfids/ BPFOBJ=$(BPFOBJ) + CC=$(HOSTCC) LD=$(HOSTLD) AR=$(HOSTAR) \ + OUTPUT=$(HOST_BUILD_DIR)/resolve_btfids/ BPFOBJ=$(HOST_BPFOBJ) # Get Clang's default includes on this system, as opposed to those seen by # '-target bpf'. This fixes "missing" files on some architectures/distros, @@ -450,7 +474,7 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \ $(call msg,BINARY,,$@) $(Q)$(CC) $(LDFLAGS) -o $@ $(filter %.a %.o,$^) $(LDLIBS) -EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) \ +EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ feature \ $(addprefix $(OUTPUT)/,*.o *.skel.h no_alu32 bpf_gcc bpf_testmod.ko) -- cgit v1.2.3 From 5837cedef6f33e01d1c4ad90ee4e966bcc6f9c30 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 13 Jan 2021 17:33:17 +0100 Subject: selftests/bpf: Fix out-of-tree build When building out-of-tree, the .skel.h files are generated into the $(OUTPUT) directory, rather than $(CURDIR). Add $(OUTPUT) to the include paths. Signed-off-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210113163319.1516382-3-jean-philippe@linaro.org --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 95ce81513648..92888eed89f3 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -25,7 +25,7 @@ BPF_GCC ?= $(shell command -v bpf-gcc;) SAN_CFLAGS ?= CFLAGS += -g -rdynamic -Wall -O2 $(GENFLAGS) $(SAN_CFLAGS) \ -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ - -I$(TOOLSINCDIR) -I$(APIDIR) \ + -I$(TOOLSINCDIR) -I$(APIDIR) -I$(OUTPUT) \ -Dbpf_prog_load=bpf_prog_test_load \ -Dbpf_load_program=bpf_test_load_program LDLIBS += -lcap -lelf -lz -lrt -lpthread -- cgit v1.2.3 From d6ac8cad50f0c0fed5cfeb61301bb19e8e88985c Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 13 Jan 2021 17:33:18 +0100 Subject: selftests/bpf: Move generated test files to $(TEST_GEN_FILES) During an out-of-tree build, attempting to install the $(TEST_FILES) into the $(OUTPUT) directory fails, because the objects were already generated into $(OUTPUT): rsync: [sender] link_stat "tools/testing/selftests/bpf/test_lwt_ip_encap.o" failed: No such file or directory (2) rsync: [sender] link_stat "tools/testing/selftests/bpf/test_tc_edt.o" failed: No such file or directory (2) Use $(TEST_GEN_FILES) instead. Signed-off-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210113163319.1516382-4-jean-philippe@linaro.org --- tools/testing/selftests/bpf/Makefile | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 92888eed89f3..67cdf858f01f 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -44,10 +44,9 @@ ifneq ($(BPF_GCC),) TEST_GEN_PROGS += test_progs-bpf_gcc endif -TEST_GEN_FILES = -TEST_FILES = test_lwt_ip_encap.o \ - test_tc_edt.o \ - xsk_prereqs.sh +TEST_GEN_FILES = test_lwt_ip_encap.o \ + test_tc_edt.o +TEST_FILES = xsk_prereqs.sh # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ -- cgit v1.2.3 From ca1e846711a8f49382005eb5feb82973fa88a849 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 13 Jan 2021 17:33:19 +0100 Subject: selftests/bpf: Fix installation of urandom_read For out-of-tree builds, $(TEST_CUSTOM_PROGS) require the $(OUTPUT) prefix, otherwise the kselftest lib doesn't know how to install them: rsync: [sender] link_stat "tools/testing/selftests/bpf/urandom_read" failed: No such file or directory (2) Signed-off-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210113163319.1516382-5-jean-philippe@linaro.org --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 67cdf858f01f..0fafdc022ac3 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -82,7 +82,7 @@ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ xdpxceiver -TEST_CUSTOM_PROGS = urandom_read +TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read # Emit succinct information message describing current building step # $1 - generic step name (e.g., CC, LINK, etc); -- cgit v1.2.3 From b8d1cbef2ea4415edcdb5a825972d93b63fcf63c Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 13 Jan 2021 17:33:20 +0100 Subject: selftests/bpf: Install btf_dump test cases The btf_dump test cannot access the original source files for comparison when running the selftests out of tree, causing several failures: awk: btf_dump_test_case_syntax.c: No such file or directory ... Add those files to $(TEST_FILES) to have "make install" pick them up. Signed-off-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210113163319.1516382-6-jean-philippe@linaro.org --- tools/testing/selftests/bpf/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 0fafdc022ac3..7f8667ad113e 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -46,7 +46,8 @@ endif TEST_GEN_FILES = test_lwt_ip_encap.o \ test_tc_edt.o -TEST_FILES = xsk_prereqs.sh +TEST_FILES = xsk_prereqs.sh \ + $(wildcard progs/btf_dump_test_case_*.c) # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ -- cgit v1.2.3 From ce5a518e9de53446f10d46fac98640f7ac026100 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 13 Jan 2021 14:36:08 -0800 Subject: bpf, libbpf: Avoid unused function warning on bpf_tail_call_static Add inline to __always_inline making it match the linux/compiler.h. Adding this avoids an unused function warning on bpf_tail_call_static when compining with -Wall. Signed-off-by: Ian Rogers Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210113223609.3358812-1-irogers@google.com --- tools/lib/bpf/bpf_helpers.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 72b251110c4d..ae6c975e0b87 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -30,7 +30,7 @@ #define SEC(NAME) __attribute__((section(NAME), used)) #ifndef __always_inline -#define __always_inline __attribute__((always_inline)) +#define __always_inline inline __attribute__((always_inline)) #endif #ifndef __noinline #define __noinline __attribute__((noinline)) -- cgit v1.2.3 From bade5c554f1ac70a50cefe96517957629dbc0d8f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 13 Jan 2021 14:36:09 -0800 Subject: tools/bpftool: Add -Wall when building BPF programs No additional warnings are generated by enabling this, but having it enabled will help avoid regressions. Signed-off-by: Ian Rogers Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210113223609.3358812-2-irogers@google.com --- tools/bpf/bpftool/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index f897cb5fb12d..45ac2f9e0aa9 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -166,7 +166,7 @@ $(OUTPUT)%.bpf.o: skeleton/%.bpf.c $(OUTPUT)vmlinux.h $(LIBBPF) -I$(srctree)/tools/include/uapi/ \ -I$(LIBBPF_PATH) \ -I$(srctree)/tools/lib \ - -g -O2 -target bpf -c $< -o $@ && $(LLVM_STRIP) -g $@ + -g -O2 -Wall -target bpf -c $< -o $@ && $(LLVM_STRIP) -g $@ $(OUTPUT)%.skel.h: $(OUTPUT)%.bpf.o $(BPFTOOL_BOOTSTRAP) $(QUIET_GEN)$(BPFTOOL_BOOTSTRAP) gen skeleton $< > $@ -- cgit v1.2.3 From 11c11d0751fce605090761f9c066bae947a35e76 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:41 +0000 Subject: bpf: x86: Factor out emission of ModR/M for *(reg + off) The case for JITing atomics is about to get more complicated. Let's factor out some common code to make the review and result more readable. NB the atomics code doesn't yet use the new helper - a subsequent patch will add its use as a side-effect of other changes. Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210114181751.768687-2-jackmanb@google.com --- arch/x86/net/bpf_jit_comp.c | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 796506dcfc42..30526776fa78 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -681,6 +681,27 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg) *pprog = prog; } +/* Emit the suffix (ModR/M etc) for addressing *(ptr_reg + off) and val_reg */ +static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (is_imm8(off)) { + /* 1-byte signed displacement. + * + * If off == 0 we could skip this and save one extra byte, but + * special case of x86 R13 which always needs an offset is not + * worth the hassle + */ + EMIT2(add_2reg(0x40, ptr_reg, val_reg), off); + } else { + /* 4-byte signed displacement */ + EMIT1_off32(add_2reg(0x80, ptr_reg, val_reg), off); + } + *pprog = prog; +} + /* LDX: dst_reg = *(u8*)(src_reg + off) */ static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { @@ -708,15 +729,7 @@ static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B); break; } - /* - * If insn->off == 0 we can save one extra byte, but - * special case of x86 R13 which always needs an offset - * is not worth the hassle - */ - if (is_imm8(off)) - EMIT2(add_2reg(0x40, src_reg, dst_reg), off); - else - EMIT1_off32(add_2reg(0x80, src_reg, dst_reg), off); + emit_insn_suffix(&prog, src_reg, dst_reg, off); *pprog = prog; } @@ -751,10 +764,7 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89); break; } - if (is_imm8(off)) - EMIT2(add_2reg(0x40, dst_reg, src_reg), off); - else - EMIT1_off32(add_2reg(0x80, dst_reg, src_reg), off); + emit_insn_suffix(&prog, dst_reg, src_reg, off); *pprog = prog; } @@ -1240,11 +1250,8 @@ st: if (is_imm8(insn->off)) goto xadd; case BPF_STX | BPF_XADD | BPF_DW: EMIT3(0xF0, add_2mod(0x48, dst_reg, src_reg), 0x01); -xadd: if (is_imm8(insn->off)) - EMIT2(add_2reg(0x40, dst_reg, src_reg), insn->off); - else - EMIT1_off32(add_2reg(0x80, dst_reg, src_reg), - insn->off); +xadd: + emit_modrm_dstoff(&prog, dst_reg, src_reg, insn->off); break; /* call */ -- cgit v1.2.3 From 74007cfc1f71e47394ca173b93d28afd0529fc86 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:42 +0000 Subject: bpf: x86: Factor out emission of REX byte The JIT case for encoding atomic ops is about to get more complicated. In order to make the review & resulting code easier, let's factor out some shared helpers. Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210114181751.768687-3-jackmanb@google.com --- arch/x86/net/bpf_jit_comp.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 30526776fa78..f15c93275a18 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -702,6 +702,21 @@ static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off) *pprog = prog; } +/* + * Emit a REX byte if it will be necessary to address these registers + */ +static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (is64) + EMIT1(add_2mod(0x48, dst_reg, src_reg)); + else if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT1(add_2mod(0x40, dst_reg, src_reg)); + *pprog = prog; +} + /* LDX: dst_reg = *(u8*)(src_reg + off) */ static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { @@ -854,10 +869,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_OR: b2 = 0x09; break; case BPF_XOR: b2 = 0x31; break; } - if (BPF_CLASS(insn->code) == BPF_ALU64) - EMIT1(add_2mod(0x48, dst_reg, src_reg)); - else if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT1(add_2mod(0x40, dst_reg, src_reg)); + maybe_emit_mod(&prog, dst_reg, src_reg, + BPF_CLASS(insn->code) == BPF_ALU64); EMIT2(b2, add_2reg(0xC0, dst_reg, src_reg)); break; @@ -1302,20 +1315,16 @@ xadd: case BPF_JMP32 | BPF_JSGE | BPF_X: case BPF_JMP32 | BPF_JSLE | BPF_X: /* cmp dst_reg, src_reg */ - if (BPF_CLASS(insn->code) == BPF_JMP) - EMIT1(add_2mod(0x48, dst_reg, src_reg)); - else if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT1(add_2mod(0x40, dst_reg, src_reg)); + maybe_emit_mod(&prog, dst_reg, src_reg, + BPF_CLASS(insn->code) == BPF_JMP); EMIT2(0x39, add_2reg(0xC0, dst_reg, src_reg)); goto emit_cond_jmp; case BPF_JMP | BPF_JSET | BPF_X: case BPF_JMP32 | BPF_JSET | BPF_X: /* test dst_reg, src_reg */ - if (BPF_CLASS(insn->code) == BPF_JMP) - EMIT1(add_2mod(0x48, dst_reg, src_reg)); - else if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT1(add_2mod(0x40, dst_reg, src_reg)); + maybe_emit_mod(&prog, dst_reg, src_reg, + BPF_CLASS(insn->code) == BPF_JMP); EMIT2(0x85, add_2reg(0xC0, dst_reg, src_reg)); goto emit_cond_jmp; @@ -1351,10 +1360,8 @@ xadd: case BPF_JMP32 | BPF_JSLE | BPF_K: /* test dst_reg, dst_reg to save one extra byte */ if (imm32 == 0) { - if (BPF_CLASS(insn->code) == BPF_JMP) - EMIT1(add_2mod(0x48, dst_reg, dst_reg)); - else if (is_ereg(dst_reg)) - EMIT1(add_2mod(0x40, dst_reg, dst_reg)); + maybe_emit_mod(&prog, dst_reg, dst_reg, + BPF_CLASS(insn->code) == BPF_JMP); EMIT2(0x85, add_2reg(0xC0, dst_reg, dst_reg)); goto emit_cond_jmp; } -- cgit v1.2.3 From e5f02caccfae94f5baf6ec6dbb57ce8a7e9a40e7 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:43 +0000 Subject: bpf: x86: Factor out a lookup table for some ALU opcodes A later commit will need to lookup a subset of these opcodes. To avoid duplicating code, pull out a table. The shift opcodes won't be needed by that later commit, but they're already duplicated, so fold them into the table anyway. Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210114181751.768687-4-jackmanb@google.com --- arch/x86/net/bpf_jit_comp.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index f15c93275a18..93f32e0ba0ef 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -205,6 +205,18 @@ static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg) return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3); } +/* Some 1-byte opcodes for binary ALU operations */ +static u8 simple_alu_opcodes[] = { + [BPF_ADD] = 0x01, + [BPF_SUB] = 0x29, + [BPF_AND] = 0x21, + [BPF_OR] = 0x09, + [BPF_XOR] = 0x31, + [BPF_LSH] = 0xE0, + [BPF_RSH] = 0xE8, + [BPF_ARSH] = 0xF8, +}; + static void jit_fill_hole(void *area, unsigned int size) { /* Fill whole space with INT3 instructions */ @@ -862,15 +874,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_ALU64 | BPF_AND | BPF_X: case BPF_ALU64 | BPF_OR | BPF_X: case BPF_ALU64 | BPF_XOR | BPF_X: - switch (BPF_OP(insn->code)) { - case BPF_ADD: b2 = 0x01; break; - case BPF_SUB: b2 = 0x29; break; - case BPF_AND: b2 = 0x21; break; - case BPF_OR: b2 = 0x09; break; - case BPF_XOR: b2 = 0x31; break; - } maybe_emit_mod(&prog, dst_reg, src_reg, BPF_CLASS(insn->code) == BPF_ALU64); + b2 = simple_alu_opcodes[BPF_OP(insn->code)]; EMIT2(b2, add_2reg(0xC0, dst_reg, src_reg)); break; @@ -1050,12 +1056,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, else if (is_ereg(dst_reg)) EMIT1(add_1mod(0x40, dst_reg)); - switch (BPF_OP(insn->code)) { - case BPF_LSH: b3 = 0xE0; break; - case BPF_RSH: b3 = 0xE8; break; - case BPF_ARSH: b3 = 0xF8; break; - } - + b3 = simple_alu_opcodes[BPF_OP(insn->code)]; if (imm32 == 1) EMIT2(0xD1, add_1reg(b3, dst_reg)); else @@ -1089,11 +1090,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, else if (is_ereg(dst_reg)) EMIT1(add_1mod(0x40, dst_reg)); - switch (BPF_OP(insn->code)) { - case BPF_LSH: b3 = 0xE0; break; - case BPF_RSH: b3 = 0xE8; break; - case BPF_ARSH: b3 = 0xF8; break; - } + b3 = simple_alu_opcodes[BPF_OP(insn->code)]; EMIT2(0xD3, add_1reg(b3, dst_reg)); if (src_reg != BPF_REG_4) -- cgit v1.2.3 From 91c960b0056672e74627776655c926388350fa30 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:44 +0000 Subject: bpf: Rename BPF_XADD and prepare to encode other atomics in .imm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A subsequent patch will add additional atomic operations. These new operations will use the same opcode field as the existing XADD, with the immediate discriminating different operations. In preparation, rename the instruction mode BPF_ATOMIC and start calling the zero immediate BPF_ADD. This is possible (doesn't break existing valid BPF progs) because the immediate field is currently reserved MBZ and BPF_ADD is zero. All uses are removed from the tree but the BPF_XADD definition is kept around to avoid breaking builds for people including kernel headers. Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20210114181751.768687-5-jackmanb@google.com --- Documentation/networking/filter.rst | 30 ++++++++------ arch/arm/net/bpf_jit_32.c | 7 ++-- arch/arm64/net/bpf_jit_comp.c | 16 ++++++-- arch/mips/net/ebpf_jit.c | 11 ++++-- arch/powerpc/net/bpf_jit_comp64.c | 25 +++++++++--- arch/riscv/net/bpf_jit_comp32.c | 20 ++++++++-- arch/riscv/net/bpf_jit_comp64.c | 16 ++++++-- arch/s390/net/bpf_jit_comp.c | 27 +++++++------ arch/sparc/net/bpf_jit_comp_64.c | 17 ++++++-- arch/x86/net/bpf_jit_comp.c | 46 ++++++++++++++++------ arch/x86/net/bpf_jit_comp32.c | 6 +-- drivers/net/ethernet/netronome/nfp/bpf/jit.c | 14 +++++-- drivers/net/ethernet/netronome/nfp/bpf/main.h | 4 +- drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 15 ++++--- include/linux/filter.h | 16 ++++++-- include/uapi/linux/bpf.h | 5 ++- kernel/bpf/core.c | 31 ++++++++++----- kernel/bpf/disasm.c | 6 ++- kernel/bpf/verifier.c | 24 ++++++----- lib/test_bpf.c | 14 +++---- samples/bpf/bpf_insn.h | 4 +- samples/bpf/cookie_uid_helper_example.c | 8 ++-- samples/bpf/sock_example.c | 2 +- samples/bpf/test_cgrp2_attach.c | 5 ++- tools/include/linux/filter.h | 15 +++++-- tools/include/uapi/linux/bpf.h | 5 ++- .../selftests/bpf/prog_tests/cgroup_attach_multi.c | 4 +- tools/testing/selftests/bpf/test_cgroup_storage.c | 2 +- tools/testing/selftests/bpf/verifier/ctx.c | 7 ++-- .../selftests/bpf/verifier/direct_packet_access.c | 4 +- tools/testing/selftests/bpf/verifier/leak_ptr.c | 10 ++--- tools/testing/selftests/bpf/verifier/meta_access.c | 4 +- tools/testing/selftests/bpf/verifier/unpriv.c | 3 +- .../selftests/bpf/verifier/value_illegal_alu.c | 2 +- tools/testing/selftests/bpf/verifier/xadd.c | 18 ++++----- 35 files changed, 291 insertions(+), 152 deletions(-) diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst index debb59e374de..1583d59d806d 100644 --- a/Documentation/networking/filter.rst +++ b/Documentation/networking/filter.rst @@ -1006,13 +1006,13 @@ Size modifier is one of ... Mode modifier is one of:: - BPF_IMM 0x00 /* used for 32-bit mov in classic BPF and 64-bit in eBPF */ - BPF_ABS 0x20 - BPF_IND 0x40 - BPF_MEM 0x60 - BPF_LEN 0x80 /* classic BPF only, reserved in eBPF */ - BPF_MSH 0xa0 /* classic BPF only, reserved in eBPF */ - BPF_XADD 0xc0 /* eBPF only, exclusive add */ + BPF_IMM 0x00 /* used for 32-bit mov in classic BPF and 64-bit in eBPF */ + BPF_ABS 0x20 + BPF_IND 0x40 + BPF_MEM 0x60 + BPF_LEN 0x80 /* classic BPF only, reserved in eBPF */ + BPF_MSH 0xa0 /* classic BPF only, reserved in eBPF */ + BPF_ATOMIC 0xc0 /* eBPF only, atomic operations */ eBPF has two non-generic instructions: (BPF_ABS | | BPF_LD) and (BPF_IND | | BPF_LD) which are used to access packet data. @@ -1044,11 +1044,19 @@ Unlike classic BPF instruction set, eBPF has generic load/store operations:: BPF_MEM | | BPF_STX: *(size *) (dst_reg + off) = src_reg BPF_MEM | | BPF_ST: *(size *) (dst_reg + off) = imm32 BPF_MEM | | BPF_LDX: dst_reg = *(size *) (src_reg + off) - BPF_XADD | BPF_W | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg - BPF_XADD | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg -Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW. Note that 1 and -2 byte atomic increments are not supported. +Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW. + +It also includes atomic operations, which use the immediate field for extra +encoding. + + .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_W | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg + .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg + +Note that 1 and 2 byte atomic operations are not supported. + +You may encounter BPF_XADD - this is a legacy name for BPF_ATOMIC, referring to +the exclusive-add operation encoded when the immediate field is zero. eBPF has one 16-byte instruction: BPF_LD | BPF_DW | BPF_IMM which consists of two consecutive ``struct bpf_insn`` 8-byte blocks and interpreted as single diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 0207b6ea6e8a..897634d0a67c 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1620,10 +1620,9 @@ exit: } emit_str_r(dst_lo, tmp2, off, ctx, BPF_SIZE(code)); break; - /* STX XADD: lock *(u32 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_W: - /* STX XADD: lock *(u64 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_DW: + /* Atomic ops */ + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: goto notyet; /* STX: *(size *)(dst + off) = src */ case BPF_STX | BPF_MEM | BPF_W: diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index ef9f1d5e989d..f7b194878a99 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -875,10 +875,18 @@ emit_cond_jmp: } break; - /* STX XADD: lock *(u32 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_W: - /* STX XADD: lock *(u64 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (insn->imm != BPF_ADD) { + pr_err_once("unknown atomic op code %02x\n", insn->imm); + return -EINVAL; + } + + /* STX XADD: lock *(u32 *)(dst + off) += src + * and + * STX XADD: lock *(u64 *)(dst + off) += src + */ + if (!off) { reg = dst; } else { diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 561154cbcc40..939dd06764bc 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -1423,8 +1423,8 @@ jeq_common: case BPF_STX | BPF_H | BPF_MEM: case BPF_STX | BPF_W | BPF_MEM: case BPF_STX | BPF_DW | BPF_MEM: - case BPF_STX | BPF_W | BPF_XADD: - case BPF_STX | BPF_DW | BPF_XADD: + case BPF_STX | BPF_W | BPF_ATOMIC: + case BPF_STX | BPF_DW | BPF_ATOMIC: if (insn->dst_reg == BPF_REG_10) { ctx->flags |= EBPF_SEEN_FP; dst = MIPS_R_SP; @@ -1438,7 +1438,12 @@ jeq_common: src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp); if (src < 0) return src; - if (BPF_MODE(insn->code) == BPF_XADD) { + if (BPF_MODE(insn->code) == BPF_ATOMIC) { + if (insn->imm != BPF_ADD) { + pr_err("ATOMIC OP %02x NOT HANDLED\n", insn->imm); + return -EINVAL; + } + /* * If mem_off does not fit within the 9 bit ll/sc * instruction immediate field, use a temp reg. diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 022103c6a201..aaf1a887f653 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -683,10 +683,18 @@ emit_clear: break; /* - * BPF_STX XADD (atomic_add) + * BPF_STX ATOMIC (atomic ops) */ - /* *(u32 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_W: + if (insn->imm != BPF_ADD) { + pr_err_ratelimited( + "eBPF filter atomic op code %02x (@%d) unsupported\n", + code, i); + return -ENOTSUPP; + } + + /* *(u32 *)(dst + off) += src */ + /* Get EA into TMP_REG_1 */ EMIT(PPC_RAW_ADDI(b2p[TMP_REG_1], dst_reg, off)); tmp_idx = ctx->idx * 4; @@ -699,8 +707,15 @@ emit_clear: /* we're done if this succeeded */ PPC_BCC_SHORT(COND_NE, tmp_idx); break; - /* *(u64 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (insn->imm != BPF_ADD) { + pr_err_ratelimited( + "eBPF filter atomic op code %02x (@%d) unsupported\n", + code, i); + return -ENOTSUPP; + } + /* *(u64 *)(dst + off) += src */ + EMIT(PPC_RAW_ADDI(b2p[TMP_REG_1], dst_reg, off)); tmp_idx = ctx->idx * 4; EMIT(PPC_RAW_LDARX(b2p[TMP_REG_2], 0, b2p[TMP_REG_1], 0)); diff --git a/arch/riscv/net/bpf_jit_comp32.c b/arch/riscv/net/bpf_jit_comp32.c index 579575f9cdae..81de865f4c7c 100644 --- a/arch/riscv/net/bpf_jit_comp32.c +++ b/arch/riscv/net/bpf_jit_comp32.c @@ -881,7 +881,7 @@ static int emit_store_r64(const s8 *dst, const s8 *src, s16 off, const s8 *rd = bpf_get_reg64(dst, tmp1, ctx); const s8 *rs = bpf_get_reg64(src, tmp2, ctx); - if (mode == BPF_XADD && size != BPF_W) + if (mode == BPF_ATOMIC && size != BPF_W) return -1; emit_imm(RV_REG_T0, off, ctx); @@ -899,7 +899,7 @@ static int emit_store_r64(const s8 *dst, const s8 *src, s16 off, case BPF_MEM: emit(rv_sw(RV_REG_T0, 0, lo(rs)), ctx); break; - case BPF_XADD: + case BPF_ATOMIC: /* Only BPF_ADD supported */ emit(rv_amoadd_w(RV_REG_ZERO, lo(rs), RV_REG_T0, 0, 0), ctx); break; @@ -1260,7 +1260,6 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, case BPF_STX | BPF_MEM | BPF_H: case BPF_STX | BPF_MEM | BPF_W: case BPF_STX | BPF_MEM | BPF_DW: - case BPF_STX | BPF_XADD | BPF_W: if (BPF_CLASS(code) == BPF_ST) { emit_imm32(tmp2, imm, ctx); src = tmp2; @@ -1271,8 +1270,21 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, return -1; break; + case BPF_STX | BPF_ATOMIC | BPF_W: + if (insn->imm != BPF_ADD) { + pr_info_once( + "bpf-jit: not supported: atomic operation %02x ***\n", + insn->imm); + return -EFAULT; + } + + if (emit_store_r64(dst, src, off, ctx, BPF_SIZE(code), + BPF_MODE(code))) + return -1; + break; + /* No hardware support for 8-byte atomics in RV32. */ - case BPF_STX | BPF_XADD | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_DW: /* Fallthrough. */ notsupported: diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 8a56b5293117..b44ff52f84a6 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -1027,10 +1027,18 @@ out_be: emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); emit_sd(RV_REG_T1, 0, rs, ctx); break; - /* STX XADD: lock *(u32 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_W: - /* STX XADD: lock *(u64 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (insn->imm != BPF_ADD) { + pr_err("bpf-jit: not supported: atomic operation %02x ***\n", + insn->imm); + return -EINVAL; + } + + /* atomic_add: lock *(u32 *)(dst + off) += src + * atomic_add: lock *(u64 *)(dst + off) += src + */ + if (off) { if (is_12b_int(off)) { emit_addi(RV_REG_T1, rd, off, ctx); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 0a4182792876..f973e2ead197 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1205,18 +1205,23 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, jit->seen |= SEEN_MEM; break; /* - * BPF_STX XADD (atomic_add) + * BPF_ATOMIC */ - case BPF_STX | BPF_XADD | BPF_W: /* *(u32 *)(dst + off) += src */ - /* laal %w0,%src,off(%dst) */ - EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W0, src_reg, - dst_reg, off); - jit->seen |= SEEN_MEM; - break; - case BPF_STX | BPF_XADD | BPF_DW: /* *(u64 *)(dst + off) += src */ - /* laalg %w0,%src,off(%dst) */ - EMIT6_DISP_LH(0xeb000000, 0x00ea, REG_W0, src_reg, - dst_reg, off); + case BPF_STX | BPF_ATOMIC | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_W: + if (insn->imm != BPF_ADD) { + pr_err("Unknown atomic operation %02x\n", insn->imm); + return -1; + } + + /* *(u32/u64 *)(dst + off) += src + * + * BFW_W: laal %w0,%src,off(%dst) + * BPF_DW: laalg %w0,%src,off(%dst) + */ + EMIT6_DISP_LH(0xeb000000, + BPF_SIZE(insn->code) == BPF_W ? 0x00fa : 0x00ea, + REG_W0, src_reg, dst_reg, off); jit->seen |= SEEN_MEM; break; /* diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 3364e2a00989..4b8d3c65d266 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -1366,12 +1366,18 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) break; } - /* STX XADD: lock *(u32 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_W: { + case BPF_STX | BPF_ATOMIC | BPF_W: { const u8 tmp = bpf2sparc[TMP_REG_1]; const u8 tmp2 = bpf2sparc[TMP_REG_2]; const u8 tmp3 = bpf2sparc[TMP_REG_3]; + if (insn->imm != BPF_ADD) { + pr_err_once("unknown atomic op %02x\n", insn->imm); + return -EINVAL; + } + + /* lock *(u32 *)(dst + off) += src */ + if (insn->dst_reg == BPF_REG_FP) ctx->saw_frame_pointer = true; @@ -1390,11 +1396,16 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) break; } /* STX XADD: lock *(u64 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_DW: { + case BPF_STX | BPF_ATOMIC | BPF_DW: { const u8 tmp = bpf2sparc[TMP_REG_1]; const u8 tmp2 = bpf2sparc[TMP_REG_2]; const u8 tmp3 = bpf2sparc[TMP_REG_3]; + if (insn->imm != BPF_ADD) { + pr_err_once("unknown atomic op %02x\n", insn->imm); + return -EINVAL; + } + if (insn->dst_reg == BPF_REG_FP) ctx->saw_frame_pointer = true; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 93f32e0ba0ef..b1829a534da1 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -795,6 +795,33 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) *pprog = prog; } +static int emit_atomic(u8 **pprog, u8 atomic_op, + u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size) +{ + u8 *prog = *pprog; + int cnt = 0; + + EMIT1(0xF0); /* lock prefix */ + + maybe_emit_mod(&prog, dst_reg, src_reg, bpf_size == BPF_DW); + + /* emit opcode */ + switch (atomic_op) { + case BPF_ADD: + /* lock *(u32/u64*)(dst_reg + off) = src_reg */ + EMIT1(simple_alu_opcodes[atomic_op]); + break; + default: + pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op); + return -EFAULT; + } + + emit_insn_suffix(&prog, dst_reg, src_reg, off); + + *pprog = prog; + return 0; +} + static bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr) @@ -839,6 +866,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int i, cnt = 0, excnt = 0; int proglen = 0; u8 *prog = temp; + int err; detect_reg_usage(insn, insn_cnt, callee_regs_used, &tail_call_seen); @@ -1250,18 +1278,12 @@ st: if (is_imm8(insn->off)) } break; - /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */ - case BPF_STX | BPF_XADD | BPF_W: - /* Emit 'lock add dword ptr [rax + off], eax' */ - if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01); - else - EMIT2(0xF0, 0x01); - goto xadd; - case BPF_STX | BPF_XADD | BPF_DW: - EMIT3(0xF0, add_2mod(0x48, dst_reg, src_reg), 0x01); -xadd: - emit_modrm_dstoff(&prog, dst_reg, src_reg, insn->off); + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + err = emit_atomic(&prog, insn->imm, dst_reg, src_reg, + insn->off, BPF_SIZE(insn->code)); + if (err) + return err; break; /* call */ diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 96fde03aa987..d17b67c69f89 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -2243,10 +2243,8 @@ emit_jmp: return -EFAULT; } break; - /* STX XADD: lock *(u32 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_W: - /* STX XADD: lock *(u64 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: goto notyet; case BPF_JMP | BPF_EXIT: if (seen_exit) { diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index 0a721f6e8676..e31f8fbbc696 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -3109,13 +3109,19 @@ mem_xadd(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, bool is64) return 0; } -static int mem_xadd4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +static int mem_atomic4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { + if (meta->insn.imm != BPF_ADD) + return -EOPNOTSUPP; + return mem_xadd(nfp_prog, meta, false); } -static int mem_xadd8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +static int mem_atomic8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { + if (meta->insn.imm != BPF_ADD) + return -EOPNOTSUPP; + return mem_xadd(nfp_prog, meta, true); } @@ -3475,8 +3481,8 @@ static const instr_cb_t instr_cb[256] = { [BPF_STX | BPF_MEM | BPF_H] = mem_stx2, [BPF_STX | BPF_MEM | BPF_W] = mem_stx4, [BPF_STX | BPF_MEM | BPF_DW] = mem_stx8, - [BPF_STX | BPF_XADD | BPF_W] = mem_xadd4, - [BPF_STX | BPF_XADD | BPF_DW] = mem_xadd8, + [BPF_STX | BPF_ATOMIC | BPF_W] = mem_atomic4, + [BPF_STX | BPF_ATOMIC | BPF_DW] = mem_atomic8, [BPF_ST | BPF_MEM | BPF_B] = mem_st1, [BPF_ST | BPF_MEM | BPF_H] = mem_st2, [BPF_ST | BPF_MEM | BPF_W] = mem_st4, diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index fac9c6f9e197..d0e17eebddd9 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -428,9 +428,9 @@ static inline bool is_mbpf_classic_store_pkt(const struct nfp_insn_meta *meta) return is_mbpf_classic_store(meta) && meta->ptr.type == PTR_TO_PACKET; } -static inline bool is_mbpf_xadd(const struct nfp_insn_meta *meta) +static inline bool is_mbpf_atomic(const struct nfp_insn_meta *meta) { - return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_XADD); + return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_ATOMIC); } static inline bool is_mbpf_mul(const struct nfp_insn_meta *meta) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index e92ee510fd52..9d235c0ce46a 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -479,7 +479,7 @@ nfp_bpf_check_ptr(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, pr_vlog(env, "map writes not supported\n"); return -EOPNOTSUPP; } - if (is_mbpf_xadd(meta)) { + if (is_mbpf_atomic(meta)) { err = nfp_bpf_map_mark_used(env, meta, reg, NFP_MAP_USE_ATOMIC_CNT); if (err) @@ -523,12 +523,17 @@ exit_check_ptr: } static int -nfp_bpf_check_xadd(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, - struct bpf_verifier_env *env) +nfp_bpf_check_atomic(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, + struct bpf_verifier_env *env) { const struct bpf_reg_state *sreg = cur_regs(env) + meta->insn.src_reg; const struct bpf_reg_state *dreg = cur_regs(env) + meta->insn.dst_reg; + if (meta->insn.imm != BPF_ADD) { + pr_vlog(env, "atomic op not implemented: %d\n", meta->insn.imm); + return -EOPNOTSUPP; + } + if (dreg->type != PTR_TO_MAP_VALUE) { pr_vlog(env, "atomic add not to a map value pointer: %d\n", dreg->type); @@ -655,8 +660,8 @@ int nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, if (is_mbpf_store(meta)) return nfp_bpf_check_store(nfp_prog, meta, env); - if (is_mbpf_xadd(meta)) - return nfp_bpf_check_xadd(nfp_prog, meta, env); + if (is_mbpf_atomic(meta)) + return nfp_bpf_check_atomic(nfp_prog, meta, env); if (is_mbpf_alu(meta)) return nfp_bpf_check_alu(nfp_prog, meta, env); diff --git a/include/linux/filter.h b/include/linux/filter.h index 5edf2b660881..392e94b79668 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -259,15 +259,23 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) .off = OFF, \ .imm = 0 }) -/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */ -#define BPF_STX_XADD(SIZE, DST, SRC, OFF) \ +/* + * Atomic operations: + * + * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + */ + +#define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ - .code = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD, \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ - .imm = 0 }) + .imm = OP }) + +/* Legacy alias */ +#define BPF_STX_XADD(SIZE, DST, SRC, OFF) BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF) /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a1ad32456f89..6b3996343e63 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -19,7 +19,8 @@ /* ld/ldx fields */ #define BPF_DW 0x18 /* double word (64-bit) */ -#define BPF_XADD 0xc0 /* exclusive add */ +#define BPF_ATOMIC 0xc0 /* atomic memory ops - op type in immediate */ +#define BPF_XADD 0xc0 /* exclusive add - legacy name */ /* alu/jmp fields */ #define BPF_MOV 0xb0 /* mov reg to reg */ @@ -2448,7 +2449,7 @@ union bpf_attr { * running simultaneously. * * A user should care about the synchronization by himself. - * For example, by using the **BPF_STX_XADD** instruction to alter + * For example, by using the **BPF_ATOMIC** instructions to alter * the shared data. * Return * A pointer to the local storage area. diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 69c3c308de5e..4836ebf459cf 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1309,8 +1309,8 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(STX, MEM, H), \ INSN_3(STX, MEM, W), \ INSN_3(STX, MEM, DW), \ - INSN_3(STX, XADD, W), \ - INSN_3(STX, XADD, DW), \ + INSN_3(STX, ATOMIC, W), \ + INSN_3(STX, ATOMIC, DW), \ /* Immediate based. */ \ INSN_3(ST, MEM, B), \ INSN_3(ST, MEM, H), \ @@ -1618,13 +1618,25 @@ out: LDX_PROBE(DW, 8) #undef LDX_PROBE - STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ - atomic_add((u32) SRC, (atomic_t *)(unsigned long) - (DST + insn->off)); + STX_ATOMIC_W: + switch (IMM) { + case BPF_ADD: + /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ + atomic_add((u32) SRC, (atomic_t *)(unsigned long) + (DST + insn->off)); + default: + goto default_label; + } CONT; - STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ - atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) - (DST + insn->off)); + STX_ATOMIC_DW: + switch (IMM) { + case BPF_ADD: + /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ + atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) + (DST + insn->off)); + default: + goto default_label; + } CONT; default_label: @@ -1634,7 +1646,8 @@ out: * * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable(). */ - pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code); + pr_warn("BPF interpreter: unknown opcode %02x (imm: 0x%x)\n", + insn->code, insn->imm); BUG_ON(1); return 0; } diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index b44d8c447afd..37c8d6e9b4cc 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -153,14 +153,16 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); - else if (BPF_MODE(insn->code) == BPF_XADD) + else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_ADD) { verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); - else + } else { verbose(cbs->private_data, "BUG_%02x\n", insn->code); + } } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM) { verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ae2aee48cf82..cfc137b81ac6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3604,13 +3604,17 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return err; } -static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) +static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { int err; - if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || - insn->imm != 0) { - verbose(env, "BPF_XADD uses reserved fields\n"); + if (insn->imm != BPF_ADD) { + verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm); + return -EINVAL; + } + + if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) { + verbose(env, "invalid atomic operand size\n"); return -EINVAL; } @@ -3633,19 +3637,19 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins is_pkt_reg(env, insn->dst_reg) || is_flow_key_reg(env, insn->dst_reg) || is_sk_reg(env, insn->dst_reg)) { - verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", + verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str[reg_state(env, insn->dst_reg)->type]); return -EACCES; } - /* check whether atomic_add can read the memory */ + /* check whether we can read the memory */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1, true); if (err) return err; - /* check whether atomic_add can write into the same memory */ + /* check whether we can write into the same memory */ return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true); } @@ -9524,8 +9528,8 @@ static int do_check(struct bpf_verifier_env *env) } else if (class == BPF_STX) { enum bpf_reg_type *prev_dst_type, dst_reg_type; - if (BPF_MODE(insn->code) == BPF_XADD) { - err = check_xadd(env, env->insn_idx, insn); + if (BPF_MODE(insn->code) == BPF_ATOMIC) { + err = check_atomic(env, env->insn_idx, insn); if (err) return err; env->insn_idx++; @@ -10010,7 +10014,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) if (BPF_CLASS(insn->code) == BPF_STX && ((BPF_MODE(insn->code) != BPF_MEM && - BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) { + BPF_MODE(insn->code) != BPF_ATOMIC) || insn->imm != 0)) { verbose(env, "BPF_STX uses reserved fields\n"); return -EINVAL; } diff --git a/lib/test_bpf.c b/lib/test_bpf.c index ca7d635bccd9..49ec9e8d8aed 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -4295,13 +4295,13 @@ static struct bpf_test tests[] = { { { 0, 0xffffffff } }, .stack_depth = 40, }, - /* BPF_STX | BPF_XADD | BPF_W/DW */ + /* BPF_STX | BPF_ATOMIC | BPF_W/DW */ { "STX_XADD_W: Test: 0x12 + 0x10 = 0x22", .u.insns_int = { BPF_ALU32_IMM(BPF_MOV, R0, 0x12), BPF_ST_MEM(BPF_W, R10, -40, 0x10), - BPF_STX_XADD(BPF_W, R10, R0, -40), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, R10, R0, -40), BPF_LDX_MEM(BPF_W, R0, R10, -40), BPF_EXIT_INSN(), }, @@ -4316,7 +4316,7 @@ static struct bpf_test tests[] = { BPF_ALU64_REG(BPF_MOV, R1, R10), BPF_ALU32_IMM(BPF_MOV, R0, 0x12), BPF_ST_MEM(BPF_W, R10, -40, 0x10), - BPF_STX_XADD(BPF_W, R10, R0, -40), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, R10, R0, -40), BPF_ALU64_REG(BPF_MOV, R0, R10), BPF_ALU64_REG(BPF_SUB, R0, R1), BPF_EXIT_INSN(), @@ -4331,7 +4331,7 @@ static struct bpf_test tests[] = { .u.insns_int = { BPF_ALU32_IMM(BPF_MOV, R0, 0x12), BPF_ST_MEM(BPF_W, R10, -40, 0x10), - BPF_STX_XADD(BPF_W, R10, R0, -40), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, R10, R0, -40), BPF_EXIT_INSN(), }, INTERNAL, @@ -4352,7 +4352,7 @@ static struct bpf_test tests[] = { .u.insns_int = { BPF_ALU32_IMM(BPF_MOV, R0, 0x12), BPF_ST_MEM(BPF_DW, R10, -40, 0x10), - BPF_STX_XADD(BPF_DW, R10, R0, -40), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, R10, R0, -40), BPF_LDX_MEM(BPF_DW, R0, R10, -40), BPF_EXIT_INSN(), }, @@ -4367,7 +4367,7 @@ static struct bpf_test tests[] = { BPF_ALU64_REG(BPF_MOV, R1, R10), BPF_ALU32_IMM(BPF_MOV, R0, 0x12), BPF_ST_MEM(BPF_DW, R10, -40, 0x10), - BPF_STX_XADD(BPF_DW, R10, R0, -40), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, R10, R0, -40), BPF_ALU64_REG(BPF_MOV, R0, R10), BPF_ALU64_REG(BPF_SUB, R0, R1), BPF_EXIT_INSN(), @@ -4382,7 +4382,7 @@ static struct bpf_test tests[] = { .u.insns_int = { BPF_ALU32_IMM(BPF_MOV, R0, 0x12), BPF_ST_MEM(BPF_DW, R10, -40, 0x10), - BPF_STX_XADD(BPF_DW, R10, R0, -40), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, R10, R0, -40), BPF_EXIT_INSN(), }, INTERNAL, diff --git a/samples/bpf/bpf_insn.h b/samples/bpf/bpf_insn.h index 544237980582..db67a2847395 100644 --- a/samples/bpf/bpf_insn.h +++ b/samples/bpf/bpf_insn.h @@ -138,11 +138,11 @@ struct bpf_insn; #define BPF_STX_XADD(SIZE, DST, SRC, OFF) \ ((struct bpf_insn) { \ - .code = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD, \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ - .imm = 0 }) + .imm = BPF_ADD }) /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ diff --git a/samples/bpf/cookie_uid_helper_example.c b/samples/bpf/cookie_uid_helper_example.c index deb0e3e0324d..c5ff7a13918c 100644 --- a/samples/bpf/cookie_uid_helper_example.c +++ b/samples/bpf/cookie_uid_helper_example.c @@ -147,12 +147,12 @@ static void prog_load(void) */ BPF_MOV64_REG(BPF_REG_9, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_1, 1), - BPF_STX_XADD(BPF_DW, BPF_REG_9, BPF_REG_1, - offsetof(struct stats, packets)), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_9, BPF_REG_1, + offsetof(struct stats, packets)), BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), - BPF_STX_XADD(BPF_DW, BPF_REG_9, BPF_REG_1, - offsetof(struct stats, bytes)), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_9, BPF_REG_1, + offsetof(struct stats, bytes)), BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, offsetof(struct __sk_buff, len)), BPF_EXIT_INSN(), diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c index 00aae1d33fca..23d1930e1927 100644 --- a/samples/bpf/sock_example.c +++ b/samples/bpf/sock_example.c @@ -54,7 +54,7 @@ static int test_sock(void) BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */ BPF_EXIT_INSN(), }; diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c index 20fbd1241db3..390ff38d2ac6 100644 --- a/samples/bpf/test_cgrp2_attach.c +++ b/samples/bpf/test_cgrp2_attach.c @@ -53,7 +53,7 @@ static int prog_load(int map_fd, int verdict) BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), /* Count bytes */ BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */ @@ -64,7 +64,8 @@ static int prog_load(int map_fd, int verdict) BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */ - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ BPF_EXIT_INSN(), diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index ca28b6ab8db7..e870c9039f0d 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -169,15 +169,22 @@ .off = OFF, \ .imm = 0 }) -/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */ +/* + * Atomic operations: + * + * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + */ -#define BPF_STX_XADD(SIZE, DST, SRC, OFF) \ +#define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ - .code = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD, \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ - .imm = 0 }) + .imm = OP }) + +/* Legacy alias */ +#define BPF_STX_XADD(SIZE, DST, SRC, OFF) BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF) /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a1ad32456f89..6b3996343e63 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -19,7 +19,8 @@ /* ld/ldx fields */ #define BPF_DW 0x18 /* double word (64-bit) */ -#define BPF_XADD 0xc0 /* exclusive add */ +#define BPF_ATOMIC 0xc0 /* atomic memory ops - op type in immediate */ +#define BPF_XADD 0xc0 /* exclusive add - legacy name */ /* alu/jmp fields */ #define BPF_MOV 0xb0 /* mov reg to reg */ @@ -2448,7 +2449,7 @@ union bpf_attr { * running simultaneously. * * A user should care about the synchronization by himself. - * For example, by using the **BPF_STX_XADD** instruction to alter + * For example, by using the **BPF_ATOMIC** instructions to alter * the shared data. * Return * A pointer to the local storage area. diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c index b549fcfacc0b..0a1fc9816cef 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c @@ -45,13 +45,13 @@ static int prog_load_cnt(int verdict, int val) BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_MOV64_IMM(BPF_REG_1, val), /* r1 = 1 */ - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), BPF_LD_MAP_FD(BPF_REG_1, cgroup_storage_fd), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage), BPF_MOV64_IMM(BPF_REG_1, val), - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_W, BPF_REG_0, BPF_REG_1, 0, 0), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), BPF_LD_MAP_FD(BPF_REG_1, percpu_cgroup_storage_fd), BPF_MOV64_IMM(BPF_REG_2, 0), diff --git a/tools/testing/selftests/bpf/test_cgroup_storage.c b/tools/testing/selftests/bpf/test_cgroup_storage.c index d946252a25bb..0cda61da5d39 100644 --- a/tools/testing/selftests/bpf/test_cgroup_storage.c +++ b/tools/testing/selftests/bpf/test_cgroup_storage.c @@ -29,7 +29,7 @@ int main(int argc, char **argv) BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage), BPF_MOV64_IMM(BPF_REG_1, 1), - BPF_STX_XADD(BPF_DW, BPF_REG_0, BPF_REG_1, 0), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0x1), BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), diff --git a/tools/testing/selftests/bpf/verifier/ctx.c b/tools/testing/selftests/bpf/verifier/ctx.c index 93d6b1641481..23080862aafd 100644 --- a/tools/testing/selftests/bpf/verifier/ctx.c +++ b/tools/testing/selftests/bpf/verifier/ctx.c @@ -10,14 +10,13 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { - "context stores via XADD", + "context stores via BPF_ATOMIC", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_W, BPF_REG_1, - BPF_REG_0, offsetof(struct __sk_buff, mark), 0), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_1, BPF_REG_0, offsetof(struct __sk_buff, mark)), BPF_EXIT_INSN(), }, - .errstr = "BPF_XADD stores into R1 ctx is not allowed", + .errstr = "BPF_ATOMIC stores into R1 ctx is not allowed", .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, diff --git a/tools/testing/selftests/bpf/verifier/direct_packet_access.c b/tools/testing/selftests/bpf/verifier/direct_packet_access.c index ae72536603fe..ac1e19d0f520 100644 --- a/tools/testing/selftests/bpf/verifier/direct_packet_access.c +++ b/tools/testing/selftests/bpf/verifier/direct_packet_access.c @@ -333,7 +333,7 @@ BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), - BPF_STX_XADD(BPF_DW, BPF_REG_4, BPF_REG_5, 0), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_4, BPF_REG_5, 0), BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_4, 0), BPF_STX_MEM(BPF_W, BPF_REG_2, BPF_REG_5, 0), BPF_MOV64_IMM(BPF_REG_0, 0), @@ -488,7 +488,7 @@ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 11), BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -8), BPF_MOV64_IMM(BPF_REG_4, 0xffffffff), - BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_4, -8), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_4, -8), BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 49), BPF_ALU64_REG(BPF_ADD, BPF_REG_4, BPF_REG_2), diff --git a/tools/testing/selftests/bpf/verifier/leak_ptr.c b/tools/testing/selftests/bpf/verifier/leak_ptr.c index d6eec17f2cd2..73f0dea95546 100644 --- a/tools/testing/selftests/bpf/verifier/leak_ptr.c +++ b/tools/testing/selftests/bpf/verifier/leak_ptr.c @@ -5,7 +5,7 @@ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, offsetof(struct __sk_buff, cb[0])), BPF_LD_MAP_FD(BPF_REG_2, 0), - BPF_STX_XADD(BPF_DW, BPF_REG_1, BPF_REG_2, + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_1, BPF_REG_2, offsetof(struct __sk_buff, cb[0])), BPF_EXIT_INSN(), }, @@ -13,7 +13,7 @@ .errstr_unpriv = "R2 leaks addr into mem", .result_unpriv = REJECT, .result = REJECT, - .errstr = "BPF_XADD stores into R1 ctx is not allowed", + .errstr = "BPF_ATOMIC stores into R1 ctx is not allowed", }, { "leak pointer into ctx 2", @@ -21,14 +21,14 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, offsetof(struct __sk_buff, cb[0])), - BPF_STX_XADD(BPF_DW, BPF_REG_1, BPF_REG_10, + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_1, BPF_REG_10, offsetof(struct __sk_buff, cb[0])), BPF_EXIT_INSN(), }, .errstr_unpriv = "R10 leaks addr into mem", .result_unpriv = REJECT, .result = REJECT, - .errstr = "BPF_XADD stores into R1 ctx is not allowed", + .errstr = "BPF_ATOMIC stores into R1 ctx is not allowed", }, { "leak pointer into ctx 3", @@ -56,7 +56,7 @@ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), BPF_MOV64_IMM(BPF_REG_3, 0), BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0), - BPF_STX_XADD(BPF_DW, BPF_REG_0, BPF_REG_6, 0), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_6, 0), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, diff --git a/tools/testing/selftests/bpf/verifier/meta_access.c b/tools/testing/selftests/bpf/verifier/meta_access.c index 205292b8dd65..b45e8af41420 100644 --- a/tools/testing/selftests/bpf/verifier/meta_access.c +++ b/tools/testing/selftests/bpf/verifier/meta_access.c @@ -171,7 +171,7 @@ BPF_MOV64_IMM(BPF_REG_5, 42), BPF_MOV64_IMM(BPF_REG_6, 24), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_5, -8), - BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_6, -8), BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -8), BPF_JMP_IMM(BPF_JGT, BPF_REG_5, 100, 6), BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_5), @@ -196,7 +196,7 @@ BPF_MOV64_IMM(BPF_REG_5, 42), BPF_MOV64_IMM(BPF_REG_6, 24), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_5, -8), - BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_6, -8), BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -8), BPF_JMP_IMM(BPF_JGT, BPF_REG_5, 100, 6), BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_5), diff --git a/tools/testing/selftests/bpf/verifier/unpriv.c b/tools/testing/selftests/bpf/verifier/unpriv.c index a3fe0fbaed41..ee298627abae 100644 --- a/tools/testing/selftests/bpf/verifier/unpriv.c +++ b/tools/testing/selftests/bpf/verifier/unpriv.c @@ -207,7 +207,8 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8), BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_0, 1), - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_10, BPF_REG_0, -8, 0), + BPF_RAW_INSN(BPF_STX | BPF_ATOMIC | BPF_DW, + BPF_REG_10, BPF_REG_0, -8, BPF_ADD), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_hash_recalc), BPF_EXIT_INSN(), diff --git a/tools/testing/selftests/bpf/verifier/value_illegal_alu.c b/tools/testing/selftests/bpf/verifier/value_illegal_alu.c index ed1c2cea1dea..489062867218 100644 --- a/tools/testing/selftests/bpf/verifier/value_illegal_alu.c +++ b/tools/testing/selftests/bpf/verifier/value_illegal_alu.c @@ -82,7 +82,7 @@ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0), - BPF_STX_XADD(BPF_DW, BPF_REG_2, BPF_REG_3, 0), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_2, BPF_REG_3, 0), BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0), BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22), BPF_EXIT_INSN(), diff --git a/tools/testing/selftests/bpf/verifier/xadd.c b/tools/testing/selftests/bpf/verifier/xadd.c index c5de2e62cc8b..b96ef3526815 100644 --- a/tools/testing/selftests/bpf/verifier/xadd.c +++ b/tools/testing/selftests/bpf/verifier/xadd.c @@ -3,7 +3,7 @@ .insns = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), - BPF_STX_XADD(BPF_W, BPF_REG_10, BPF_REG_0, -7), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_10, BPF_REG_0, -7), BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), BPF_EXIT_INSN(), }, @@ -22,7 +22,7 @@ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), BPF_EXIT_INSN(), BPF_MOV64_IMM(BPF_REG_1, 1), - BPF_STX_XADD(BPF_W, BPF_REG_0, BPF_REG_1, 3), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_0, BPF_REG_1, 3), BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 3), BPF_EXIT_INSN(), }, @@ -45,13 +45,13 @@ BPF_MOV64_IMM(BPF_REG_0, 1), BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), BPF_ST_MEM(BPF_W, BPF_REG_2, 3, 0), - BPF_STX_XADD(BPF_W, BPF_REG_2, BPF_REG_0, 1), - BPF_STX_XADD(BPF_W, BPF_REG_2, BPF_REG_0, 2), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_2, BPF_REG_0, 1), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_2, BPF_REG_0, 2), BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 1), BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "BPF_XADD stores into R2 pkt is not allowed", + .errstr = "BPF_ATOMIC stores into R2 pkt is not allowed", .prog_type = BPF_PROG_TYPE_XDP, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -62,8 +62,8 @@ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), BPF_MOV64_REG(BPF_REG_7, BPF_REG_10), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), - BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_0, -8), - BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_0, -8), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_0, -8), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_0, -8), BPF_JMP_REG(BPF_JNE, BPF_REG_6, BPF_REG_0, 3), BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_10, 2), BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), @@ -82,8 +82,8 @@ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), BPF_MOV64_REG(BPF_REG_7, BPF_REG_10), BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -8), - BPF_STX_XADD(BPF_W, BPF_REG_10, BPF_REG_0, -8), - BPF_STX_XADD(BPF_W, BPF_REG_10, BPF_REG_0, -8), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_10, BPF_REG_0, -8), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_10, BPF_REG_0, -8), BPF_JMP_REG(BPF_JNE, BPF_REG_6, BPF_REG_0, 3), BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_10, 2), BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -8), -- cgit v1.2.3 From c5bcb5eb4db632280b4123135d583a7bc8caea3e Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:45 +0000 Subject: bpf: Move BPF_STX reserved field check into BPF_STX verifier code I can't find a reason why this code is in resolve_pseudo_ldimm64; since I'll be modifying it in a subsequent commit, tidy it up. Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210114181751.768687-6-jackmanb@google.com --- kernel/bpf/verifier.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cfc137b81ac6..d8a85f4e5b95 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9528,6 +9528,12 @@ static int do_check(struct bpf_verifier_env *env) } else if (class == BPF_STX) { enum bpf_reg_type *prev_dst_type, dst_reg_type; + if (((BPF_MODE(insn->code) != BPF_MEM && + BPF_MODE(insn->code) != BPF_ATOMIC) || insn->imm != 0)) { + verbose(env, "BPF_STX uses reserved fields\n"); + return -EINVAL; + } + if (BPF_MODE(insn->code) == BPF_ATOMIC) { err = check_atomic(env, env->insn_idx, insn); if (err) @@ -10012,13 +10018,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) return -EINVAL; } - if (BPF_CLASS(insn->code) == BPF_STX && - ((BPF_MODE(insn->code) != BPF_MEM && - BPF_MODE(insn->code) != BPF_ATOMIC) || insn->imm != 0)) { - verbose(env, "BPF_STX uses reserved fields\n"); - return -EINVAL; - } - if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { struct bpf_insn_aux_data *aux; struct bpf_map *map; -- cgit v1.2.3 From 5ca419f2864a2c60940dcf4bbaeb69546200e36f Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:46 +0000 Subject: bpf: Add BPF_FETCH field / create atomic_fetch_add instruction The BPF_FETCH field can be set in bpf_insn.imm, for BPF_ATOMIC instructions, in order to have the previous value of the atomically-modified memory location loaded into the src register after an atomic op is carried out. Suggested-by: Yonghong Song Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210114181751.768687-7-jackmanb@google.com --- arch/x86/net/bpf_jit_comp.c | 4 ++++ include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 3 +++ kernel/bpf/core.c | 13 +++++++++++++ kernel/bpf/disasm.c | 7 +++++++ kernel/bpf/verifier.c | 33 ++++++++++++++++++++++++--------- tools/include/linux/filter.h | 1 + tools/include/uapi/linux/bpf.h | 3 +++ 8 files changed, 56 insertions(+), 9 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index b1829a534da1..eea7d8b0bb12 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -811,6 +811,10 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, /* lock *(u32/u64*)(dst_reg + off) = src_reg */ EMIT1(simple_alu_opcodes[atomic_op]); break; + case BPF_ADD | BPF_FETCH: + /* src_reg = atomic_fetch_add(dst_reg + off, src_reg); */ + EMIT2(0x0F, 0xC1); + break; default: pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op); return -EFAULT; diff --git a/include/linux/filter.h b/include/linux/filter.h index 392e94b79668..23fca41b8540 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -264,6 +264,7 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) * Atomic operations: * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); */ #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6b3996343e63..ea262b009049 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -44,6 +44,9 @@ #define BPF_CALL 0x80 /* function call */ #define BPF_EXIT 0x90 /* function return */ +/* atomic op type fields (stored in immediate) */ +#define BPF_FETCH 0x01 /* fetch previous value into src reg */ + /* Register numbers */ enum { BPF_REG_0 = 0, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4836ebf459cf..28d6000463e4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1624,16 +1624,29 @@ out: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ atomic_add((u32) SRC, (atomic_t *)(unsigned long) (DST + insn->off)); + break; + case BPF_ADD | BPF_FETCH: + SRC = (u32) atomic_fetch_add( + (u32) SRC, + (atomic_t *)(unsigned long) (DST + insn->off)); + break; default: goto default_label; } CONT; + STX_ATOMIC_DW: switch (IMM) { case BPF_ADD: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) (DST + insn->off)); + break; + case BPF_ADD | BPF_FETCH: + SRC = (u64) atomic64_fetch_add( + (u64) SRC, + (atomic64_t *)(unsigned long) (DST + insn->off)); + break; default: goto default_label; } diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 37c8d6e9b4cc..d2e20f6d0516 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -160,6 +160,13 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == (BPF_ADD | BPF_FETCH)) { + verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_add((%s *)(r%d %+d), r%d)\n", + insn->code, insn->src_reg, + BPF_SIZE(insn->code) == BPF_DW ? "64" : "", + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, insn->src_reg); } else { verbose(cbs->private_data, "BUG_%02x\n", insn->code); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d8a85f4e5b95..6aa1fc919761 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3608,7 +3608,11 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i { int err; - if (insn->imm != BPF_ADD) { + switch (insn->imm) { + case BPF_ADD: + case BPF_ADD | BPF_FETCH: + break; + default: verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm); return -EINVAL; } @@ -3650,8 +3654,20 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return err; /* check whether we can write into the same memory */ - return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, -1, true); + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, -1, true); + if (err) + return err; + + if (!(insn->imm & BPF_FETCH)) + return 0; + + /* check and record load of old value into src reg */ + err = check_reg_arg(env, insn->src_reg, DST_OP); + if (err) + return err; + + return 0; } static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno, @@ -9528,12 +9544,6 @@ static int do_check(struct bpf_verifier_env *env) } else if (class == BPF_STX) { enum bpf_reg_type *prev_dst_type, dst_reg_type; - if (((BPF_MODE(insn->code) != BPF_MEM && - BPF_MODE(insn->code) != BPF_ATOMIC) || insn->imm != 0)) { - verbose(env, "BPF_STX uses reserved fields\n"); - return -EINVAL; - } - if (BPF_MODE(insn->code) == BPF_ATOMIC) { err = check_atomic(env, env->insn_idx, insn); if (err) @@ -9542,6 +9552,11 @@ static int do_check(struct bpf_verifier_env *env) continue; } + if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) { + verbose(env, "BPF_STX uses reserved fields\n"); + return -EINVAL; + } + /* check src1 operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index e870c9039f0d..7211ce9fba53 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -173,6 +173,7 @@ * Atomic operations: * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); */ #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6b3996343e63..ea262b009049 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -44,6 +44,9 @@ #define BPF_CALL 0x80 /* function call */ #define BPF_EXIT 0x90 /* function return */ +/* atomic op type fields (stored in immediate) */ +#define BPF_FETCH 0x01 /* fetch previous value into src reg */ + /* Register numbers */ enum { BPF_REG_0 = 0, -- cgit v1.2.3 From 5ffa25502b5ab3d639829a2d1e316cff7f59a41e Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:47 +0000 Subject: bpf: Add instructions for atomic_[cmp]xchg This adds two atomic opcodes, both of which include the BPF_FETCH flag. XCHG without the BPF_FETCH flag would naturally encode atomic_set. This is not supported because it would be of limited value to userspace (it doesn't imply any barriers). CMPXCHG without BPF_FETCH woulud be an atomic compare-and-write. We don't have such an operation in the kernel so it isn't provided to BPF either. There are two significant design decisions made for the CMPXCHG instruction: - To solve the issue that this operation fundamentally has 3 operands, but we only have two register fields. Therefore the operand we compare against (the kernel's API calls it 'old') is hard-coded to be R0. x86 has similar design (and A64 doesn't have this problem). A potential alternative might be to encode the other operand's register number in the immediate field. - The kernel's atomic_cmpxchg returns the old value, while the C11 userspace APIs return a boolean indicating the comparison result. Which should BPF do? A64 returns the old value. x86 returns the old value in the hard-coded register (and also sets a flag). That means return-old-value is easier to JIT, so that's what we use. Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210114181751.768687-8-jackmanb@google.com --- arch/x86/net/bpf_jit_comp.c | 8 ++++++++ include/linux/filter.h | 2 ++ include/uapi/linux/bpf.h | 4 +++- kernel/bpf/core.c | 20 ++++++++++++++++++++ kernel/bpf/disasm.c | 15 +++++++++++++++ kernel/bpf/verifier.c | 19 +++++++++++++++++-- tools/include/linux/filter.h | 2 ++ tools/include/uapi/linux/bpf.h | 4 +++- 8 files changed, 70 insertions(+), 4 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index eea7d8b0bb12..308241187582 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -815,6 +815,14 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, /* src_reg = atomic_fetch_add(dst_reg + off, src_reg); */ EMIT2(0x0F, 0xC1); break; + case BPF_XCHG: + /* src_reg = atomic_xchg(dst_reg + off, src_reg); */ + EMIT1(0x87); + break; + case BPF_CMPXCHG: + /* r0 = atomic_cmpxchg(dst_reg + off, r0, src_reg); */ + EMIT2(0x0F, 0xB1); + break; default: pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op); return -EFAULT; diff --git a/include/linux/filter.h b/include/linux/filter.h index 23fca41b8540..d563820f197d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -265,6 +265,8 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); + * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) + * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) */ #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ea262b009049..c001766adcbc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -45,7 +45,9 @@ #define BPF_EXIT 0x90 /* function return */ /* atomic op type fields (stored in immediate) */ -#define BPF_FETCH 0x01 /* fetch previous value into src reg */ +#define BPF_FETCH 0x01 /* not an opcode on its own, used to build others */ +#define BPF_XCHG (0xe0 | BPF_FETCH) /* atomic exchange */ +#define BPF_CMPXCHG (0xf0 | BPF_FETCH) /* atomic compare-and-write */ /* Register numbers */ enum { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 28d6000463e4..4df6daba43ef 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1630,6 +1630,16 @@ out: (u32) SRC, (atomic_t *)(unsigned long) (DST + insn->off)); break; + case BPF_XCHG: + SRC = (u32) atomic_xchg( + (atomic_t *)(unsigned long) (DST + insn->off), + (u32) SRC); + break; + case BPF_CMPXCHG: + BPF_R0 = (u32) atomic_cmpxchg( + (atomic_t *)(unsigned long) (DST + insn->off), + (u32) BPF_R0, (u32) SRC); + break; default: goto default_label; } @@ -1647,6 +1657,16 @@ out: (u64) SRC, (atomic64_t *)(unsigned long) (DST + insn->off)); break; + case BPF_XCHG: + SRC = (u64) atomic64_xchg( + (atomic64_t *)(unsigned long) (DST + insn->off), + (u64) SRC); + break; + case BPF_CMPXCHG: + BPF_R0 = (u64) atomic64_cmpxchg( + (atomic64_t *)(unsigned long) (DST + insn->off), + (u64) BPF_R0, (u64) SRC); + break; default: goto default_label; } diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index d2e20f6d0516..ee8d1132767b 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -167,6 +167,21 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, BPF_SIZE(insn->code) == BPF_DW ? "64" : "", bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_CMPXCHG) { + verbose(cbs->private_data, "(%02x) r0 = atomic%s_cmpxchg((%s *)(r%d %+d), r0, r%d)\n", + insn->code, + BPF_SIZE(insn->code) == BPF_DW ? "64" : "", + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, + insn->src_reg); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_XCHG) { + verbose(cbs->private_data, "(%02x) r%d = atomic%s_xchg((%s *)(r%d %+d), r%d)\n", + insn->code, insn->src_reg, + BPF_SIZE(insn->code) == BPF_DW ? "64" : "", + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, insn->src_reg); } else { verbose(cbs->private_data, "BUG_%02x\n", insn->code); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6aa1fc919761..89a4d154ab37 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3606,11 +3606,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { + int load_reg; int err; switch (insn->imm) { case BPF_ADD: case BPF_ADD | BPF_FETCH: + case BPF_XCHG: + case BPF_CMPXCHG: break; default: verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm); @@ -3632,6 +3635,13 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i if (err) return err; + if (insn->imm == BPF_CMPXCHG) { + /* Check comparison of R0 with memory location */ + err = check_reg_arg(env, BPF_REG_0, SRC_OP); + if (err) + return err; + } + if (is_pointer_value(env, insn->src_reg)) { verbose(env, "R%d leaks addr into mem\n", insn->src_reg); return -EACCES; @@ -3662,8 +3672,13 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i if (!(insn->imm & BPF_FETCH)) return 0; - /* check and record load of old value into src reg */ - err = check_reg_arg(env, insn->src_reg, DST_OP); + if (insn->imm == BPF_CMPXCHG) + load_reg = BPF_REG_0; + else + load_reg = insn->src_reg; + + /* check and record load of old value */ + err = check_reg_arg(env, load_reg, DST_OP); if (err) return err; diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index 7211ce9fba53..d75998b0d5ac 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -174,6 +174,8 @@ * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); + * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) + * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) */ #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ea262b009049..c001766adcbc 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -45,7 +45,9 @@ #define BPF_EXIT 0x90 /* function return */ /* atomic op type fields (stored in immediate) */ -#define BPF_FETCH 0x01 /* fetch previous value into src reg */ +#define BPF_FETCH 0x01 /* not an opcode on its own, used to build others */ +#define BPF_XCHG (0xe0 | BPF_FETCH) /* atomic exchange */ +#define BPF_CMPXCHG (0xf0 | BPF_FETCH) /* atomic compare-and-write */ /* Register numbers */ enum { -- cgit v1.2.3 From 462910670e4ac91509829c5549bd0227668176fb Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:48 +0000 Subject: bpf: Pull out a macro for interpreting atomic ALU operations Since the atomic operations that are added in subsequent commits are all isomorphic with BPF_ADD, pull out a macro to avoid the interpreter becoming dominated by lines of atomic-related code. Note that this sacrificies interpreter performance (combining STX_ATOMIC_W and STX_ATOMIC_DW into single switch case means that we need an extra conditional branch to differentiate them) in favour of compact and (relatively!) simple C code. Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210114181751.768687-9-jackmanb@google.com --- kernel/bpf/core.c | 80 +++++++++++++++++++++++++++---------------------------- 1 file changed, 39 insertions(+), 41 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4df6daba43ef..8669e685825f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1618,55 +1618,53 @@ out: LDX_PROBE(DW, 8) #undef LDX_PROBE - STX_ATOMIC_W: - switch (IMM) { - case BPF_ADD: - /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ - atomic_add((u32) SRC, (atomic_t *)(unsigned long) - (DST + insn->off)); - break; - case BPF_ADD | BPF_FETCH: - SRC = (u32) atomic_fetch_add( - (u32) SRC, - (atomic_t *)(unsigned long) (DST + insn->off)); - break; - case BPF_XCHG: - SRC = (u32) atomic_xchg( - (atomic_t *)(unsigned long) (DST + insn->off), - (u32) SRC); - break; - case BPF_CMPXCHG: - BPF_R0 = (u32) atomic_cmpxchg( - (atomic_t *)(unsigned long) (DST + insn->off), - (u32) BPF_R0, (u32) SRC); +#define ATOMIC_ALU_OP(BOP, KOP) \ + case BOP: \ + if (BPF_SIZE(insn->code) == BPF_W) \ + atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \ + (DST + insn->off)); \ + else \ + atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \ + (DST + insn->off)); \ + break; \ + case BOP | BPF_FETCH: \ + if (BPF_SIZE(insn->code) == BPF_W) \ + SRC = (u32) atomic_fetch_##KOP( \ + (u32) SRC, \ + (atomic_t *)(unsigned long) (DST + insn->off)); \ + else \ + SRC = (u64) atomic64_fetch_##KOP( \ + (u64) SRC, \ + (atomic64_t *)(unsigned long) (DST + insn->off)); \ break; - default: - goto default_label; - } - CONT; STX_ATOMIC_DW: + STX_ATOMIC_W: switch (IMM) { - case BPF_ADD: - /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ - atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) - (DST + insn->off)); - break; - case BPF_ADD | BPF_FETCH: - SRC = (u64) atomic64_fetch_add( - (u64) SRC, - (atomic64_t *)(unsigned long) (DST + insn->off)); - break; + ATOMIC_ALU_OP(BPF_ADD, add) +#undef ATOMIC_ALU_OP + case BPF_XCHG: - SRC = (u64) atomic64_xchg( - (atomic64_t *)(unsigned long) (DST + insn->off), - (u64) SRC); + if (BPF_SIZE(insn->code) == BPF_W) + SRC = (u32) atomic_xchg( + (atomic_t *)(unsigned long) (DST + insn->off), + (u32) SRC); + else + SRC = (u64) atomic64_xchg( + (atomic64_t *)(unsigned long) (DST + insn->off), + (u64) SRC); break; case BPF_CMPXCHG: - BPF_R0 = (u64) atomic64_cmpxchg( - (atomic64_t *)(unsigned long) (DST + insn->off), - (u64) BPF_R0, (u64) SRC); + if (BPF_SIZE(insn->code) == BPF_W) + BPF_R0 = (u32) atomic_cmpxchg( + (atomic_t *)(unsigned long) (DST + insn->off), + (u32) BPF_R0, (u32) SRC); + else + BPF_R0 = (u64) atomic64_cmpxchg( + (atomic64_t *)(unsigned long) (DST + insn->off), + (u64) BPF_R0, (u64) SRC); break; + default: goto default_label; } -- cgit v1.2.3 From 981f94c3e92146705baf97fb417a5ed1ab1a79a5 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:49 +0000 Subject: bpf: Add bitwise atomic instructions This adds instructions for atomic[64]_[fetch_]and atomic[64]_[fetch_]or atomic[64]_[fetch_]xor All these operations are isomorphic enough to implement with the same verifier, interpreter, and x86 JIT code, hence being a single commit. The main interesting thing here is that x86 doesn't directly support the fetch_ version these operations, so we need to generate a CMPXCHG loop in the JIT. This requires the use of two temporary registers, IIUC it's safe to use BPF_REG_AX and x86's AUX_REG for this purpose. Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210114181751.768687-10-jackmanb@google.com --- arch/x86/net/bpf_jit_comp.c | 50 +++++++++++++++++++++++++++++++++++++++++++- include/linux/filter.h | 6 ++++++ kernel/bpf/core.c | 3 +++ kernel/bpf/disasm.c | 21 +++++++++++++++---- kernel/bpf/verifier.c | 6 ++++++ tools/include/linux/filter.h | 6 ++++++ 6 files changed, 87 insertions(+), 5 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 308241187582..1d4d50199293 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -808,6 +808,10 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, /* emit opcode */ switch (atomic_op) { case BPF_ADD: + case BPF_SUB: + case BPF_AND: + case BPF_OR: + case BPF_XOR: /* lock *(u32/u64*)(dst_reg + off) = src_reg */ EMIT1(simple_alu_opcodes[atomic_op]); break; @@ -1292,8 +1296,52 @@ st: if (is_imm8(insn->off)) case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: + if (insn->imm == (BPF_AND | BPF_FETCH) || + insn->imm == (BPF_OR | BPF_FETCH) || + insn->imm == (BPF_XOR | BPF_FETCH)) { + u8 *branch_target; + bool is64 = BPF_SIZE(insn->code) == BPF_DW; + + /* + * Can't be implemented with a single x86 insn. + * Need to do a CMPXCHG loop. + */ + + /* Will need RAX as a CMPXCHG operand so save R0 */ + emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0); + branch_target = prog; + /* Load old value */ + emit_ldx(&prog, BPF_SIZE(insn->code), + BPF_REG_0, dst_reg, insn->off); + /* + * Perform the (commutative) operation locally, + * put the result in the AUX_REG. + */ + emit_mov_reg(&prog, is64, AUX_REG, BPF_REG_0); + maybe_emit_mod(&prog, AUX_REG, src_reg, is64); + EMIT2(simple_alu_opcodes[BPF_OP(insn->imm)], + add_2reg(0xC0, AUX_REG, src_reg)); + /* Attempt to swap in new value */ + err = emit_atomic(&prog, BPF_CMPXCHG, + dst_reg, AUX_REG, insn->off, + BPF_SIZE(insn->code)); + if (WARN_ON(err)) + return err; + /* + * ZF tells us whether we won the race. If it's + * cleared we need to try again. + */ + EMIT2(X86_JNE, -(prog - branch_target) - 2); + /* Return the pre-modification value */ + emit_mov_reg(&prog, is64, src_reg, BPF_REG_0); + /* Restore R0 after clobbering RAX */ + emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX); + break; + + } + err = emit_atomic(&prog, insn->imm, dst_reg, src_reg, - insn->off, BPF_SIZE(insn->code)); + insn->off, BPF_SIZE(insn->code)); if (err) return err; break; diff --git a/include/linux/filter.h b/include/linux/filter.h index d563820f197d..7fdce5407214 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -264,7 +264,13 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) * Atomic operations: * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + * BPF_AND *(uint *) (dst_reg + off16) &= src_reg + * BPF_OR *(uint *) (dst_reg + off16) |= src_reg + * BPF_XOR *(uint *) (dst_reg + off16) ^= src_reg * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); + * BPF_AND | BPF_FETCH src_reg = atomic_fetch_and(dst_reg + off16, src_reg); + * BPF_OR | BPF_FETCH src_reg = atomic_fetch_or(dst_reg + off16, src_reg); + * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg); * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8669e685825f..5bbd4884ff7a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1642,6 +1642,9 @@ out: STX_ATOMIC_W: switch (IMM) { ATOMIC_ALU_OP(BPF_ADD, add) + ATOMIC_ALU_OP(BPF_AND, and) + ATOMIC_ALU_OP(BPF_OR, or) + ATOMIC_ALU_OP(BPF_XOR, xor) #undef ATOMIC_ALU_OP case BPF_XCHG: diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index ee8d1132767b..19ff8fed7f4b 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -80,6 +80,13 @@ const char *const bpf_alu_string[16] = { [BPF_END >> 4] = "endian", }; +static const char *const bpf_atomic_alu_string[16] = { + [BPF_ADD >> 4] = "add", + [BPF_AND >> 4] = "and", + [BPF_OR >> 4] = "or", + [BPF_XOR >> 4] = "or", +}; + static const char *const bpf_ldst_string[] = { [BPF_W >> 3] = "u32", [BPF_H >> 3] = "u16", @@ -154,17 +161,23 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, insn->dst_reg, insn->off, insn->src_reg); else if (BPF_MODE(insn->code) == BPF_ATOMIC && - insn->imm == BPF_ADD) { - verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", + (insn->imm == BPF_ADD || insn->imm == BPF_ADD || + insn->imm == BPF_OR || insn->imm == BPF_XOR)) { + verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) %s r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, + bpf_alu_string[BPF_OP(insn->imm) >> 4], insn->src_reg); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && - insn->imm == (BPF_ADD | BPF_FETCH)) { - verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_add((%s *)(r%d %+d), r%d)\n", + (insn->imm == (BPF_ADD | BPF_FETCH) || + insn->imm == (BPF_AND | BPF_FETCH) || + insn->imm == (BPF_OR | BPF_FETCH) || + insn->imm == (BPF_XOR | BPF_FETCH))) { + verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_%s((%s *)(r%d %+d), r%d)\n", insn->code, insn->src_reg, BPF_SIZE(insn->code) == BPF_DW ? "64" : "", + bpf_atomic_alu_string[BPF_OP(insn->imm) >> 4], bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 89a4d154ab37..0f82d5d46e2c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3612,6 +3612,12 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i switch (insn->imm) { case BPF_ADD: case BPF_ADD | BPF_FETCH: + case BPF_AND: + case BPF_AND | BPF_FETCH: + case BPF_OR: + case BPF_OR | BPF_FETCH: + case BPF_XOR: + case BPF_XOR | BPF_FETCH: case BPF_XCHG: case BPF_CMPXCHG: break; diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index d75998b0d5ac..736bdeccdfe4 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -173,7 +173,13 @@ * Atomic operations: * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + * BPF_AND *(uint *) (dst_reg + off16) &= src_reg + * BPF_OR *(uint *) (dst_reg + off16) |= src_reg + * BPF_XOR *(uint *) (dst_reg + off16) ^= src_reg * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); + * BPF_AND | BPF_FETCH src_reg = atomic_fetch_and(dst_reg + off16, src_reg); + * BPF_OR | BPF_FETCH src_reg = atomic_fetch_or(dst_reg + off16, src_reg); + * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg); * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) */ -- cgit v1.2.3 From 98d666d05a1d9706bb3fe972157fa6155dbb180f Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:50 +0000 Subject: bpf: Add tests for new BPF atomic operations The prog_test that's added depends on Clang/LLVM features added by Yonghong in commit 286daafd6512 (was https://reviews.llvm.org/D72184). Note the use of a define called ENABLE_ATOMICS_TESTS: this is used to: - Avoid breaking the build for people on old versions of Clang - Avoid needing separate lists of test objects for no_alu32, where atomics are not supported even if Clang has the feature. The atomics_test.o BPF object is built unconditionally both for test_progs and test_progs-no_alu32. For test_progs, if Clang supports atomics, ENABLE_ATOMICS_TESTS is defined, so it includes the proper test code. Otherwise, progs and global vars are defined anyway, as stubs; this means that the skeleton user code still builds. The atomics_test.o userspace object is built once and used for both test_progs and test_progs-no_alu32. A variable called skip_tests is defined in the BPF object's data section, which tells the userspace object whether to skip the atomics test. Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210114181751.768687-11-jackmanb@google.com --- tools/testing/selftests/bpf/Makefile | 2 + tools/testing/selftests/bpf/prog_tests/atomics.c | 246 +++++++++++++++++++++ tools/testing/selftests/bpf/progs/atomics.c | 154 +++++++++++++ tools/testing/selftests/bpf/verifier/atomic_and.c | 77 +++++++ .../selftests/bpf/verifier/atomic_cmpxchg.c | 96 ++++++++ .../selftests/bpf/verifier/atomic_fetch_add.c | 106 +++++++++ tools/testing/selftests/bpf/verifier/atomic_or.c | 77 +++++++ tools/testing/selftests/bpf/verifier/atomic_xchg.c | 46 ++++ tools/testing/selftests/bpf/verifier/atomic_xor.c | 77 +++++++ 9 files changed, 881 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/atomics.c create mode 100644 tools/testing/selftests/bpf/progs/atomics.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_and.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_fetch_add.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_or.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_xchg.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_xor.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 7f8667ad113e..0552b07717b6 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -414,10 +414,12 @@ TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ $(wildcard progs/btf_dump_test_case_*.c) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) +TRUNNER_BPF_CFLAGS += -DENABLE_ATOMICS_TESTS $(eval $(call DEFINE_TEST_RUNNER,test_progs)) # Define test_progs-no_alu32 test runner. TRUNNER_BPF_BUILD_RULE := CLANG_NOALU32_BPF_BUILD_RULE +TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) $(eval $(call DEFINE_TEST_RUNNER,test_progs,no_alu32)) # Define test_progs BPF-GCC-flavored test runner. diff --git a/tools/testing/selftests/bpf/prog_tests/atomics.c b/tools/testing/selftests/bpf/prog_tests/atomics.c new file mode 100644 index 000000000000..21efe7bbf10d --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/atomics.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +#include "atomics.skel.h" + +static void test_add(struct atomics *skel) +{ + int err, prog_fd; + __u32 duration = 0, retval; + struct bpf_link *link; + + link = bpf_program__attach(skel->progs.add); + if (CHECK(IS_ERR(link), "attach(add)", "err: %ld\n", PTR_ERR(link))) + return; + + prog_fd = bpf_program__fd(skel->progs.add); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, + NULL, NULL, &retval, &duration); + if (CHECK(err || retval, "test_run add", + "err %d errno %d retval %d duration %d\n", err, errno, retval, duration)) + goto cleanup; + + ASSERT_EQ(skel->data->add64_value, 3, "add64_value"); + ASSERT_EQ(skel->bss->add64_result, 1, "add64_result"); + + ASSERT_EQ(skel->data->add32_value, 3, "add32_value"); + ASSERT_EQ(skel->bss->add32_result, 1, "add32_result"); + + ASSERT_EQ(skel->bss->add_stack_value_copy, 3, "add_stack_value"); + ASSERT_EQ(skel->bss->add_stack_result, 1, "add_stack_result"); + + ASSERT_EQ(skel->data->add_noreturn_value, 3, "add_noreturn_value"); + +cleanup: + bpf_link__destroy(link); +} + +static void test_sub(struct atomics *skel) +{ + int err, prog_fd; + __u32 duration = 0, retval; + struct bpf_link *link; + + link = bpf_program__attach(skel->progs.sub); + if (CHECK(IS_ERR(link), "attach(sub)", "err: %ld\n", PTR_ERR(link))) + return; + + prog_fd = bpf_program__fd(skel->progs.sub); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, + NULL, NULL, &retval, &duration); + if (CHECK(err || retval, "test_run sub", + "err %d errno %d retval %d duration %d\n", + err, errno, retval, duration)) + goto cleanup; + + ASSERT_EQ(skel->data->sub64_value, -1, "sub64_value"); + ASSERT_EQ(skel->bss->sub64_result, 1, "sub64_result"); + + ASSERT_EQ(skel->data->sub32_value, -1, "sub32_value"); + ASSERT_EQ(skel->bss->sub32_result, 1, "sub32_result"); + + ASSERT_EQ(skel->bss->sub_stack_value_copy, -1, "sub_stack_value"); + ASSERT_EQ(skel->bss->sub_stack_result, 1, "sub_stack_result"); + + ASSERT_EQ(skel->data->sub_noreturn_value, -1, "sub_noreturn_value"); + +cleanup: + bpf_link__destroy(link); +} + +static void test_and(struct atomics *skel) +{ + int err, prog_fd; + __u32 duration = 0, retval; + struct bpf_link *link; + + link = bpf_program__attach(skel->progs.and); + if (CHECK(IS_ERR(link), "attach(and)", "err: %ld\n", PTR_ERR(link))) + return; + + prog_fd = bpf_program__fd(skel->progs.and); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, + NULL, NULL, &retval, &duration); + if (CHECK(err || retval, "test_run and", + "err %d errno %d retval %d duration %d\n", err, errno, retval, duration)) + goto cleanup; + + ASSERT_EQ(skel->data->and64_value, 0x010ull << 32, "and64_value"); + ASSERT_EQ(skel->bss->and64_result, 0x110ull << 32, "and64_result"); + + ASSERT_EQ(skel->data->and32_value, 0x010, "and32_value"); + ASSERT_EQ(skel->bss->and32_result, 0x110, "and32_result"); + + ASSERT_EQ(skel->data->and_noreturn_value, 0x010ull << 32, "and_noreturn_value"); +cleanup: + bpf_link__destroy(link); +} + +static void test_or(struct atomics *skel) +{ + int err, prog_fd; + __u32 duration = 0, retval; + struct bpf_link *link; + + link = bpf_program__attach(skel->progs.or); + if (CHECK(IS_ERR(link), "attach(or)", "err: %ld\n", PTR_ERR(link))) + return; + + prog_fd = bpf_program__fd(skel->progs.or); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, + NULL, NULL, &retval, &duration); + if (CHECK(err || retval, "test_run or", + "err %d errno %d retval %d duration %d\n", + err, errno, retval, duration)) + goto cleanup; + + ASSERT_EQ(skel->data->or64_value, 0x111ull << 32, "or64_value"); + ASSERT_EQ(skel->bss->or64_result, 0x110ull << 32, "or64_result"); + + ASSERT_EQ(skel->data->or32_value, 0x111, "or32_value"); + ASSERT_EQ(skel->bss->or32_result, 0x110, "or32_result"); + + ASSERT_EQ(skel->data->or_noreturn_value, 0x111ull << 32, "or_noreturn_value"); +cleanup: + bpf_link__destroy(link); +} + +static void test_xor(struct atomics *skel) +{ + int err, prog_fd; + __u32 duration = 0, retval; + struct bpf_link *link; + + link = bpf_program__attach(skel->progs.xor); + if (CHECK(IS_ERR(link), "attach(xor)", "err: %ld\n", PTR_ERR(link))) + return; + + prog_fd = bpf_program__fd(skel->progs.xor); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, + NULL, NULL, &retval, &duration); + if (CHECK(err || retval, "test_run xor", + "err %d errno %d retval %d duration %d\n", err, errno, retval, duration)) + goto cleanup; + + ASSERT_EQ(skel->data->xor64_value, 0x101ull << 32, "xor64_value"); + ASSERT_EQ(skel->bss->xor64_result, 0x110ull << 32, "xor64_result"); + + ASSERT_EQ(skel->data->xor32_value, 0x101, "xor32_value"); + ASSERT_EQ(skel->bss->xor32_result, 0x110, "xor32_result"); + + ASSERT_EQ(skel->data->xor_noreturn_value, 0x101ull << 32, "xor_nxoreturn_value"); +cleanup: + bpf_link__destroy(link); +} + +static void test_cmpxchg(struct atomics *skel) +{ + int err, prog_fd; + __u32 duration = 0, retval; + struct bpf_link *link; + + link = bpf_program__attach(skel->progs.cmpxchg); + if (CHECK(IS_ERR(link), "attach(cmpxchg)", "err: %ld\n", PTR_ERR(link))) + return; + + prog_fd = bpf_program__fd(skel->progs.cmpxchg); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, + NULL, NULL, &retval, &duration); + if (CHECK(err || retval, "test_run add", + "err %d errno %d retval %d duration %d\n", err, errno, retval, duration)) + goto cleanup; + + ASSERT_EQ(skel->data->cmpxchg64_value, 2, "cmpxchg64_value"); + ASSERT_EQ(skel->bss->cmpxchg64_result_fail, 1, "cmpxchg_result_fail"); + ASSERT_EQ(skel->bss->cmpxchg64_result_succeed, 1, "cmpxchg_result_succeed"); + + ASSERT_EQ(skel->data->cmpxchg32_value, 2, "lcmpxchg32_value"); + ASSERT_EQ(skel->bss->cmpxchg32_result_fail, 1, "cmpxchg_result_fail"); + ASSERT_EQ(skel->bss->cmpxchg32_result_succeed, 1, "cmpxchg_result_succeed"); + +cleanup: + bpf_link__destroy(link); +} + +static void test_xchg(struct atomics *skel) +{ + int err, prog_fd; + __u32 duration = 0, retval; + struct bpf_link *link; + + link = bpf_program__attach(skel->progs.xchg); + if (CHECK(IS_ERR(link), "attach(xchg)", "err: %ld\n", PTR_ERR(link))) + return; + + prog_fd = bpf_program__fd(skel->progs.xchg); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, + NULL, NULL, &retval, &duration); + if (CHECK(err || retval, "test_run add", + "err %d errno %d retval %d duration %d\n", err, errno, retval, duration)) + goto cleanup; + + ASSERT_EQ(skel->data->xchg64_value, 2, "xchg64_value"); + ASSERT_EQ(skel->bss->xchg64_result, 1, "xchg64_result"); + + ASSERT_EQ(skel->data->xchg32_value, 2, "xchg32_value"); + ASSERT_EQ(skel->bss->xchg32_result, 1, "xchg32_result"); + +cleanup: + bpf_link__destroy(link); +} + +void test_atomics(void) +{ + struct atomics *skel; + __u32 duration = 0; + + skel = atomics__open_and_load(); + if (CHECK(!skel, "skel_load", "atomics skeleton failed\n")) + return; + + if (skel->data->skip_tests) { + printf("%s:SKIP:no ENABLE_ATOMICS_TESTS (missing Clang BPF atomics support)", + __func__); + test__skip(); + goto cleanup; + } + + if (test__start_subtest("add")) + test_add(skel); + if (test__start_subtest("sub")) + test_sub(skel); + if (test__start_subtest("and")) + test_and(skel); + if (test__start_subtest("or")) + test_or(skel); + if (test__start_subtest("xor")) + test_xor(skel); + if (test__start_subtest("cmpxchg")) + test_cmpxchg(skel); + if (test__start_subtest("xchg")) + test_xchg(skel); + +cleanup: + atomics__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/atomics.c b/tools/testing/selftests/bpf/progs/atomics.c new file mode 100644 index 000000000000..c245345e41ca --- /dev/null +++ b/tools/testing/selftests/bpf/progs/atomics.c @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#ifdef ENABLE_ATOMICS_TESTS +bool skip_tests __attribute((__section__(".data"))) = false; +#else +bool skip_tests = true; +#endif + +__u64 add64_value = 1; +__u64 add64_result = 0; +__u32 add32_value = 1; +__u32 add32_result = 0; +__u64 add_stack_value_copy = 0; +__u64 add_stack_result = 0; +__u64 add_noreturn_value = 1; + +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(add, int a) +{ +#ifdef ENABLE_ATOMICS_TESTS + __u64 add_stack_value = 1; + + add64_result = __sync_fetch_and_add(&add64_value, 2); + add32_result = __sync_fetch_and_add(&add32_value, 2); + add_stack_result = __sync_fetch_and_add(&add_stack_value, 2); + add_stack_value_copy = add_stack_value; + __sync_fetch_and_add(&add_noreturn_value, 2); +#endif + + return 0; +} + +__s64 sub64_value = 1; +__s64 sub64_result = 0; +__s32 sub32_value = 1; +__s32 sub32_result = 0; +__s64 sub_stack_value_copy = 0; +__s64 sub_stack_result = 0; +__s64 sub_noreturn_value = 1; + +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(sub, int a) +{ +#ifdef ENABLE_ATOMICS_TESTS + __u64 sub_stack_value = 1; + + sub64_result = __sync_fetch_and_sub(&sub64_value, 2); + sub32_result = __sync_fetch_and_sub(&sub32_value, 2); + sub_stack_result = __sync_fetch_and_sub(&sub_stack_value, 2); + sub_stack_value_copy = sub_stack_value; + __sync_fetch_and_sub(&sub_noreturn_value, 2); +#endif + + return 0; +} + +__u64 and64_value = (0x110ull << 32); +__u64 and64_result = 0; +__u32 and32_value = 0x110; +__u32 and32_result = 0; +__u64 and_noreturn_value = (0x110ull << 32); + +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(and, int a) +{ +#ifdef ENABLE_ATOMICS_TESTS + + and64_result = __sync_fetch_and_and(&and64_value, 0x011ull << 32); + and32_result = __sync_fetch_and_and(&and32_value, 0x011); + __sync_fetch_and_and(&and_noreturn_value, 0x011ull << 32); +#endif + + return 0; +} + +__u64 or64_value = (0x110ull << 32); +__u64 or64_result = 0; +__u32 or32_value = 0x110; +__u32 or32_result = 0; +__u64 or_noreturn_value = (0x110ull << 32); + +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(or, int a) +{ +#ifdef ENABLE_ATOMICS_TESTS + or64_result = __sync_fetch_and_or(&or64_value, 0x011ull << 32); + or32_result = __sync_fetch_and_or(&or32_value, 0x011); + __sync_fetch_and_or(&or_noreturn_value, 0x011ull << 32); +#endif + + return 0; +} + +__u64 xor64_value = (0x110ull << 32); +__u64 xor64_result = 0; +__u32 xor32_value = 0x110; +__u32 xor32_result = 0; +__u64 xor_noreturn_value = (0x110ull << 32); + +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(xor, int a) +{ +#ifdef ENABLE_ATOMICS_TESTS + xor64_result = __sync_fetch_and_xor(&xor64_value, 0x011ull << 32); + xor32_result = __sync_fetch_and_xor(&xor32_value, 0x011); + __sync_fetch_and_xor(&xor_noreturn_value, 0x011ull << 32); +#endif + + return 0; +} + +__u64 cmpxchg64_value = 1; +__u64 cmpxchg64_result_fail = 0; +__u64 cmpxchg64_result_succeed = 0; +__u32 cmpxchg32_value = 1; +__u32 cmpxchg32_result_fail = 0; +__u32 cmpxchg32_result_succeed = 0; + +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(cmpxchg, int a) +{ +#ifdef ENABLE_ATOMICS_TESTS + cmpxchg64_result_fail = __sync_val_compare_and_swap(&cmpxchg64_value, 0, 3); + cmpxchg64_result_succeed = __sync_val_compare_and_swap(&cmpxchg64_value, 1, 2); + + cmpxchg32_result_fail = __sync_val_compare_and_swap(&cmpxchg32_value, 0, 3); + cmpxchg32_result_succeed = __sync_val_compare_and_swap(&cmpxchg32_value, 1, 2); +#endif + + return 0; +} + +__u64 xchg64_value = 1; +__u64 xchg64_result = 0; +__u32 xchg32_value = 1; +__u32 xchg32_result = 0; + +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(xchg, int a) +{ +#ifdef ENABLE_ATOMICS_TESTS + __u64 val64 = 2; + __u32 val32 = 2; + + xchg64_result = __sync_lock_test_and_set(&xchg64_value, val64); + xchg32_result = __sync_lock_test_and_set(&xchg32_value, val32); +#endif + + return 0; +} diff --git a/tools/testing/selftests/bpf/verifier/atomic_and.c b/tools/testing/selftests/bpf/verifier/atomic_and.c new file mode 100644 index 000000000000..600bc5e0f143 --- /dev/null +++ b/tools/testing/selftests/bpf/verifier/atomic_and.c @@ -0,0 +1,77 @@ +{ + "BPF_ATOMIC_AND without fetch", + .insns = { + /* val = 0x110; */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0x110), + /* atomic_and(&val, 0x011); */ + BPF_MOV64_IMM(BPF_REG_1, 0x011), + BPF_ATOMIC_OP(BPF_DW, BPF_AND, BPF_REG_10, BPF_REG_1, -8), + /* if (val != 0x010) exit(2); */ + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0x010, 2), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + /* r1 should not be clobbered, no BPF_FETCH flag */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x011, 1), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, +{ + "BPF_ATOMIC_AND with fetch", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 123), + /* val = 0x110; */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0x110), + /* old = atomic_fetch_and(&val, 0x011); */ + BPF_MOV64_IMM(BPF_REG_1, 0x011), + BPF_ATOMIC_OP(BPF_DW, BPF_AND | BPF_FETCH, BPF_REG_10, BPF_REG_1, -8), + /* if (old != 0x110) exit(3); */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x110, 2), + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_EXIT_INSN(), + /* if (val != 0x010) exit(2); */ + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x010, 2), + BPF_MOV64_IMM(BPF_REG_1, 2), + BPF_EXIT_INSN(), + /* Check R0 wasn't clobbered (for fear of x86 JIT bug) */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 123, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + /* exit(0); */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, +{ + "BPF_ATOMIC_AND with fetch 32bit", + .insns = { + /* r0 = (s64) -1 */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 1), + /* val = 0x110; */ + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0x110), + /* old = atomic_fetch_and(&val, 0x011); */ + BPF_MOV32_IMM(BPF_REG_1, 0x011), + BPF_ATOMIC_OP(BPF_W, BPF_AND | BPF_FETCH, BPF_REG_10, BPF_REG_1, -4), + /* if (old != 0x110) exit(3); */ + BPF_JMP32_IMM(BPF_JEQ, BPF_REG_1, 0x110, 2), + BPF_MOV32_IMM(BPF_REG_0, 3), + BPF_EXIT_INSN(), + /* if (val != 0x010) exit(2); */ + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_10, -4), + BPF_JMP32_IMM(BPF_JEQ, BPF_REG_1, 0x010, 2), + BPF_MOV32_IMM(BPF_REG_1, 2), + BPF_EXIT_INSN(), + /* Check R0 wasn't clobbered (for fear of x86 JIT bug) + * It should be -1 so add 1 to get exit code. + */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, diff --git a/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c b/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c new file mode 100644 index 000000000000..2efd8bcf57a1 --- /dev/null +++ b/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c @@ -0,0 +1,96 @@ +{ + "atomic compare-and-exchange smoketest - 64bit", + .insns = { + /* val = 3; */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 3), + /* old = atomic_cmpxchg(&val, 2, 4); */ + BPF_MOV64_IMM(BPF_REG_1, 4), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_ATOMIC_OP(BPF_DW, BPF_CMPXCHG, BPF_REG_10, BPF_REG_1, -8), + /* if (old != 3) exit(2); */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 3, 2), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + /* if (val != 3) exit(3); */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 3, 2), + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_EXIT_INSN(), + /* old = atomic_cmpxchg(&val, 3, 4); */ + BPF_MOV64_IMM(BPF_REG_1, 4), + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_ATOMIC_OP(BPF_DW, BPF_CMPXCHG, BPF_REG_10, BPF_REG_1, -8), + /* if (old != 3) exit(4); */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 3, 2), + BPF_MOV64_IMM(BPF_REG_0, 4), + BPF_EXIT_INSN(), + /* if (val != 4) exit(5); */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 4, 2), + BPF_MOV64_IMM(BPF_REG_0, 5), + BPF_EXIT_INSN(), + /* exit(0); */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, +{ + "atomic compare-and-exchange smoketest - 32bit", + .insns = { + /* val = 3; */ + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 3), + /* old = atomic_cmpxchg(&val, 2, 4); */ + BPF_MOV32_IMM(BPF_REG_1, 4), + BPF_MOV32_IMM(BPF_REG_0, 2), + BPF_ATOMIC_OP(BPF_W, BPF_CMPXCHG, BPF_REG_10, BPF_REG_1, -4), + /* if (old != 3) exit(2); */ + BPF_JMP32_IMM(BPF_JEQ, BPF_REG_0, 3, 2), + BPF_MOV32_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + /* if (val != 3) exit(3); */ + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -4), + BPF_JMP32_IMM(BPF_JEQ, BPF_REG_0, 3, 2), + BPF_MOV32_IMM(BPF_REG_0, 3), + BPF_EXIT_INSN(), + /* old = atomic_cmpxchg(&val, 3, 4); */ + BPF_MOV32_IMM(BPF_REG_1, 4), + BPF_MOV32_IMM(BPF_REG_0, 3), + BPF_ATOMIC_OP(BPF_W, BPF_CMPXCHG, BPF_REG_10, BPF_REG_1, -4), + /* if (old != 3) exit(4); */ + BPF_JMP32_IMM(BPF_JEQ, BPF_REG_0, 3, 2), + BPF_MOV32_IMM(BPF_REG_0, 4), + BPF_EXIT_INSN(), + /* if (val != 4) exit(5); */ + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -4), + BPF_JMP32_IMM(BPF_JEQ, BPF_REG_0, 4, 2), + BPF_MOV32_IMM(BPF_REG_0, 5), + BPF_EXIT_INSN(), + /* exit(0); */ + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, +{ + "Can't use cmpxchg on uninit src reg", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 3), + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_ATOMIC_OP(BPF_DW, BPF_CMPXCHG, BPF_REG_10, BPF_REG_2, -8), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "!read_ok", +}, +{ + "Can't use cmpxchg on uninit memory", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_MOV64_IMM(BPF_REG_2, 4), + BPF_ATOMIC_OP(BPF_DW, BPF_CMPXCHG, BPF_REG_10, BPF_REG_2, -8), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid read from stack", +}, diff --git a/tools/testing/selftests/bpf/verifier/atomic_fetch_add.c b/tools/testing/selftests/bpf/verifier/atomic_fetch_add.c new file mode 100644 index 000000000000..a91de8cd9def --- /dev/null +++ b/tools/testing/selftests/bpf/verifier/atomic_fetch_add.c @@ -0,0 +1,106 @@ +{ + "BPF_ATOMIC_FETCH_ADD smoketest - 64bit", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + /* Write 3 to stack */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 3), + /* Put a 1 in R1, add it to the 3 on the stack, and load the value back into R1 */ + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD | BPF_FETCH, BPF_REG_10, BPF_REG_1, -8), + /* Check the value we loaded back was 3 */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 3, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + /* Load value from stack */ + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -8), + /* Check value loaded from stack was 4 */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 4, 1), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, +{ + "BPF_ATOMIC_FETCH_ADD smoketest - 32bit", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + /* Write 3 to stack */ + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 3), + /* Put a 1 in R1, add it to the 3 on the stack, and load the value back into R1 */ + BPF_MOV32_IMM(BPF_REG_1, 1), + BPF_ATOMIC_OP(BPF_W, BPF_ADD | BPF_FETCH, BPF_REG_10, BPF_REG_1, -4), + /* Check the value we loaded back was 3 */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 3, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + /* Load value from stack */ + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_10, -4), + /* Check value loaded from stack was 4 */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 4, 1), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, +{ + "Can't use ATM_FETCH_ADD on frame pointer", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 3), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD | BPF_FETCH, BPF_REG_10, BPF_REG_10, -8), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr_unpriv = "R10 leaks addr into mem", + .errstr = "frame pointer is read only", +}, +{ + "Can't use ATM_FETCH_ADD on uninit src reg", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 3), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD | BPF_FETCH, BPF_REG_10, BPF_REG_2, -8), + BPF_EXIT_INSN(), + }, + .result = REJECT, + /* It happens that the address leak check is first, but it would also be + * complain about the fact that we're trying to modify R10. + */ + .errstr = "!read_ok", +}, +{ + "Can't use ATM_FETCH_ADD on uninit dst reg", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD | BPF_FETCH, BPF_REG_2, BPF_REG_0, -8), + BPF_EXIT_INSN(), + }, + .result = REJECT, + /* It happens that the address leak check is first, but it would also be + * complain about the fact that we're trying to modify R10. + */ + .errstr = "!read_ok", +}, +{ + "Can't use ATM_FETCH_ADD on kernel memory", + .insns = { + /* This is an fentry prog, context is array of the args of the + * kernel function being called. Load first arg into R2. + */ + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 0), + /* First arg of bpf_fentry_test7 is a pointer to a struct. + * Attempt to modify that struct. Verifier shouldn't let us + * because it's kernel memory. + */ + BPF_MOV64_IMM(BPF_REG_3, 1), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD | BPF_FETCH, BPF_REG_2, BPF_REG_3, 0), + /* Done */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACING, + .expected_attach_type = BPF_TRACE_FENTRY, + .kfunc = "bpf_fentry_test7", + .result = REJECT, + .errstr = "only read is supported", +}, diff --git a/tools/testing/selftests/bpf/verifier/atomic_or.c b/tools/testing/selftests/bpf/verifier/atomic_or.c new file mode 100644 index 000000000000..ebe6e51455ba --- /dev/null +++ b/tools/testing/selftests/bpf/verifier/atomic_or.c @@ -0,0 +1,77 @@ +{ + "BPF_ATOMIC OR without fetch", + .insns = { + /* val = 0x110; */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0x110), + /* atomic_or(&val, 0x011); */ + BPF_MOV64_IMM(BPF_REG_1, 0x011), + BPF_ATOMIC_OP(BPF_DW, BPF_OR, BPF_REG_10, BPF_REG_1, -8), + /* if (val != 0x111) exit(2); */ + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0x111, 2), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + /* r1 should not be clobbered, no BPF_FETCH flag */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x011, 1), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, +{ + "BPF_ATOMIC OR with fetch", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 123), + /* val = 0x110; */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0x110), + /* old = atomic_fetch_or(&val, 0x011); */ + BPF_MOV64_IMM(BPF_REG_1, 0x011), + BPF_ATOMIC_OP(BPF_DW, BPF_OR | BPF_FETCH, BPF_REG_10, BPF_REG_1, -8), + /* if (old != 0x110) exit(3); */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x110, 2), + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_EXIT_INSN(), + /* if (val != 0x111) exit(2); */ + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x111, 2), + BPF_MOV64_IMM(BPF_REG_1, 2), + BPF_EXIT_INSN(), + /* Check R0 wasn't clobbered (for fear of x86 JIT bug) */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 123, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + /* exit(0); */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, +{ + "BPF_ATOMIC OR with fetch 32bit", + .insns = { + /* r0 = (s64) -1 */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 1), + /* val = 0x110; */ + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0x110), + /* old = atomic_fetch_or(&val, 0x011); */ + BPF_MOV32_IMM(BPF_REG_1, 0x011), + BPF_ATOMIC_OP(BPF_W, BPF_OR | BPF_FETCH, BPF_REG_10, BPF_REG_1, -4), + /* if (old != 0x110) exit(3); */ + BPF_JMP32_IMM(BPF_JEQ, BPF_REG_1, 0x110, 2), + BPF_MOV32_IMM(BPF_REG_0, 3), + BPF_EXIT_INSN(), + /* if (val != 0x111) exit(2); */ + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_10, -4), + BPF_JMP32_IMM(BPF_JEQ, BPF_REG_1, 0x111, 2), + BPF_MOV32_IMM(BPF_REG_1, 2), + BPF_EXIT_INSN(), + /* Check R0 wasn't clobbered (for fear of x86 JIT bug) + * It should be -1 so add 1 to get exit code. + */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, diff --git a/tools/testing/selftests/bpf/verifier/atomic_xchg.c b/tools/testing/selftests/bpf/verifier/atomic_xchg.c new file mode 100644 index 000000000000..33e2d6c973ee --- /dev/null +++ b/tools/testing/selftests/bpf/verifier/atomic_xchg.c @@ -0,0 +1,46 @@ +{ + "atomic exchange smoketest - 64bit", + .insns = { + /* val = 3; */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 3), + /* old = atomic_xchg(&val, 4); */ + BPF_MOV64_IMM(BPF_REG_1, 4), + BPF_ATOMIC_OP(BPF_DW, BPF_XCHG, BPF_REG_10, BPF_REG_1, -8), + /* if (old != 3) exit(1); */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 3, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + /* if (val != 4) exit(2); */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 4, 2), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + /* exit(0); */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, +{ + "atomic exchange smoketest - 32bit", + .insns = { + /* val = 3; */ + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 3), + /* old = atomic_xchg(&val, 4); */ + BPF_MOV32_IMM(BPF_REG_1, 4), + BPF_ATOMIC_OP(BPF_W, BPF_XCHG, BPF_REG_10, BPF_REG_1, -4), + /* if (old != 3) exit(1); */ + BPF_JMP32_IMM(BPF_JEQ, BPF_REG_1, 3, 2), + BPF_MOV32_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + /* if (val != 4) exit(2); */ + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -4), + BPF_JMP32_IMM(BPF_JEQ, BPF_REG_0, 4, 2), + BPF_MOV32_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + /* exit(0); */ + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, diff --git a/tools/testing/selftests/bpf/verifier/atomic_xor.c b/tools/testing/selftests/bpf/verifier/atomic_xor.c new file mode 100644 index 000000000000..eb791e547b47 --- /dev/null +++ b/tools/testing/selftests/bpf/verifier/atomic_xor.c @@ -0,0 +1,77 @@ +{ + "BPF_ATOMIC XOR without fetch", + .insns = { + /* val = 0x110; */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0x110), + /* atomic_xor(&val, 0x011); */ + BPF_MOV64_IMM(BPF_REG_1, 0x011), + BPF_ATOMIC_OP(BPF_DW, BPF_XOR, BPF_REG_10, BPF_REG_1, -8), + /* if (val != 0x101) exit(2); */ + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0x101, 2), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + /* r1 should not be clobbered, no BPF_FETCH flag */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x011, 1), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, +{ + "BPF_ATOMIC XOR with fetch", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 123), + /* val = 0x110; */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0x110), + /* old = atomic_fetch_xor(&val, 0x011); */ + BPF_MOV64_IMM(BPF_REG_1, 0x011), + BPF_ATOMIC_OP(BPF_DW, BPF_XOR | BPF_FETCH, BPF_REG_10, BPF_REG_1, -8), + /* if (old != 0x110) exit(3); */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x110, 2), + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_EXIT_INSN(), + /* if (val != 0x101) exit(2); */ + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x101, 2), + BPF_MOV64_IMM(BPF_REG_1, 2), + BPF_EXIT_INSN(), + /* Check R0 wasn't clobbered (fxor fear of x86 JIT bug) */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 123, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + /* exit(0); */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, +{ + "BPF_ATOMIC XOR with fetch 32bit", + .insns = { + /* r0 = (s64) -1 */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 1), + /* val = 0x110; */ + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0x110), + /* old = atomic_fetch_xor(&val, 0x011); */ + BPF_MOV32_IMM(BPF_REG_1, 0x011), + BPF_ATOMIC_OP(BPF_W, BPF_XOR | BPF_FETCH, BPF_REG_10, BPF_REG_1, -4), + /* if (old != 0x110) exit(3); */ + BPF_JMP32_IMM(BPF_JEQ, BPF_REG_1, 0x110, 2), + BPF_MOV32_IMM(BPF_REG_0, 3), + BPF_EXIT_INSN(), + /* if (val != 0x101) exit(2); */ + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_10, -4), + BPF_JMP32_IMM(BPF_JEQ, BPF_REG_1, 0x101, 2), + BPF_MOV32_IMM(BPF_REG_1, 2), + BPF_EXIT_INSN(), + /* Check R0 wasn't clobbered (fxor fear of x86 JIT bug) + * It should be -1 so add 1 to get exit code. + */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, -- cgit v1.2.3 From de948576f8e7d7fa1b5db04f56184ffe176177c5 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:51 +0000 Subject: bpf: Document new atomic instructions Document new atomic instructions. Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210114181751.768687-12-jackmanb@google.com --- Documentation/networking/filter.rst | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst index 1583d59d806d..f6d8f90e9a56 100644 --- a/Documentation/networking/filter.rst +++ b/Documentation/networking/filter.rst @@ -1053,8 +1053,39 @@ encoding. .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_W | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg +The basic atomic operations supported are: + + BPF_ADD + BPF_AND + BPF_OR + BPF_XOR + +Each having equivalent semantics with the ``BPF_ADD`` example, that is: the +memory location addresed by ``dst_reg + off`` is atomically modified, with +``src_reg`` as the other operand. If the ``BPF_FETCH`` flag is set in the +immediate, then these operations also overwrite ``src_reg`` with the +value that was in memory before it was modified. + +The more special operations are: + + BPF_XCHG + +This atomically exchanges ``src_reg`` with the value addressed by ``dst_reg + +off``. + + BPF_CMPXCHG + +This atomically compares the value addressed by ``dst_reg + off`` with +``R0``. If they match it is replaced with ``src_reg``, The value that was there +before is loaded back to ``R0``. + Note that 1 and 2 byte atomic operations are not supported. +Except ``BPF_ADD`` _without_ ``BPF_FETCH`` (for legacy reasons), all 4 byte +atomic operations require alu32 mode. Clang enables this mode by default in +architecture v3 (``-mcpu=v3``). For older versions it can be enabled with +``-Xclang -target-feature -Xclang +alu32``. + You may encounter BPF_XADD - this is a legacy name for BPF_ATOMIC, referring to the exclusive-add operation encoded when the immediate field is zero. -- cgit v1.2.3 From bd7525dacd7e204f8cae061941fb9001c89d6988 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 14 Jan 2021 14:40:42 +0100 Subject: bpf: Move stack_map_get_build_id into lib Moving stack_map_get_build_id into lib with declaration in linux/buildid.h header: int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id); This function returns build id for given struct vm_area_struct. There is no functional change to stack_map_get_build_id function. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210114134044.1418404-2-jolsa@kernel.org --- include/linux/buildid.h | 11 ++++ kernel/bpf/stackmap.c | 143 ++---------------------------------------------- lib/Makefile | 3 +- lib/buildid.c | 136 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 153 insertions(+), 140 deletions(-) create mode 100644 include/linux/buildid.h create mode 100644 lib/buildid.c diff --git a/include/linux/buildid.h b/include/linux/buildid.h new file mode 100644 index 000000000000..08028a212589 --- /dev/null +++ b/include/linux/buildid.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_BUILDID_H +#define _LINUX_BUILDID_H + +#include + +#define BUILD_ID_SIZE_MAX 20 + +int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id); + +#endif diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index aea96b638473..55d254a59f07 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -7,10 +7,9 @@ #include #include #include -#include -#include #include #include +#include #include "percpu_freelist.h" #define STACK_CREATE_FLAG_MASK \ @@ -143,140 +142,6 @@ free_smap: return ERR_PTR(err); } -#define BPF_BUILD_ID 3 -/* - * Parse build id from the note segment. This logic can be shared between - * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are - * identical. - */ -static inline int stack_map_parse_build_id(void *page_addr, - unsigned char *build_id, - void *note_start, - Elf32_Word note_size) -{ - Elf32_Word note_offs = 0, new_offs; - - /* check for overflow */ - if (note_start < page_addr || note_start + note_size < note_start) - return -EINVAL; - - /* only supports note that fits in the first page */ - if (note_start + note_size > page_addr + PAGE_SIZE) - return -EINVAL; - - while (note_offs + sizeof(Elf32_Nhdr) < note_size) { - Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); - - if (nhdr->n_type == BPF_BUILD_ID && - nhdr->n_namesz == sizeof("GNU") && - nhdr->n_descsz > 0 && - nhdr->n_descsz <= BPF_BUILD_ID_SIZE) { - memcpy(build_id, - note_start + note_offs + - ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), - nhdr->n_descsz); - memset(build_id + nhdr->n_descsz, 0, - BPF_BUILD_ID_SIZE - nhdr->n_descsz); - return 0; - } - new_offs = note_offs + sizeof(Elf32_Nhdr) + - ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); - if (new_offs <= note_offs) /* overflow */ - break; - note_offs = new_offs; - } - return -EINVAL; -} - -/* Parse build ID from 32-bit ELF */ -static int stack_map_get_build_id_32(void *page_addr, - unsigned char *build_id) -{ - Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; - Elf32_Phdr *phdr; - int i; - - /* only supports phdr that fits in one page */ - if (ehdr->e_phnum > - (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) - return -EINVAL; - - phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); - - for (i = 0; i < ehdr->e_phnum; ++i) { - if (phdr[i].p_type == PT_NOTE && - !stack_map_parse_build_id(page_addr, build_id, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz)) - return 0; - } - return -EINVAL; -} - -/* Parse build ID from 64-bit ELF */ -static int stack_map_get_build_id_64(void *page_addr, - unsigned char *build_id) -{ - Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; - Elf64_Phdr *phdr; - int i; - - /* only supports phdr that fits in one page */ - if (ehdr->e_phnum > - (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) - return -EINVAL; - - phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); - - for (i = 0; i < ehdr->e_phnum; ++i) { - if (phdr[i].p_type == PT_NOTE && - !stack_map_parse_build_id(page_addr, build_id, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz)) - return 0; - } - return -EINVAL; -} - -/* Parse build ID of ELF file mapped to vma */ -static int stack_map_get_build_id(struct vm_area_struct *vma, - unsigned char *build_id) -{ - Elf32_Ehdr *ehdr; - struct page *page; - void *page_addr; - int ret; - - /* only works for page backed storage */ - if (!vma->vm_file) - return -EINVAL; - - page = find_get_page(vma->vm_file->f_mapping, 0); - if (!page) - return -EFAULT; /* page not mapped */ - - ret = -EINVAL; - page_addr = kmap_atomic(page); - ehdr = (Elf32_Ehdr *)page_addr; - - /* compare magic x7f "ELF" */ - if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) - goto out; - - /* only support executable file and shared object file */ - if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) - goto out; - - if (ehdr->e_ident[EI_CLASS] == ELFCLASS32) - ret = stack_map_get_build_id_32(page_addr, build_id); - else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) - ret = stack_map_get_build_id_64(page_addr, build_id); -out: - kunmap_atomic(page_addr); - put_page(page); - return ret; -} - static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, u64 *ips, u32 trace_nr, bool user) { @@ -317,18 +182,18 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; - memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); + memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); } return; } for (i = 0; i < trace_nr; i++) { vma = find_vma(current->mm, ips[i]); - if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) { + if (!vma || build_id_parse(vma, id_offs[i].build_id)) { /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; - memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); + memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); continue; } id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] diff --git a/lib/Makefile b/lib/Makefile index afeff05fa8c5..a6b160c3a4fa 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -36,7 +36,8 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ - nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o + nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o \ + buildid.o lib-$(CONFIG_PRINTK) += dump_stack.o lib-$(CONFIG_SMP) += cpumask.o diff --git a/lib/buildid.c b/lib/buildid.c new file mode 100644 index 000000000000..4a4f520c0e29 --- /dev/null +++ b/lib/buildid.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +#define BUILD_ID 3 +/* + * Parse build id from the note segment. This logic can be shared between + * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are + * identical. + */ +static inline int parse_build_id(void *page_addr, + unsigned char *build_id, + void *note_start, + Elf32_Word note_size) +{ + Elf32_Word note_offs = 0, new_offs; + + /* check for overflow */ + if (note_start < page_addr || note_start + note_size < note_start) + return -EINVAL; + + /* only supports note that fits in the first page */ + if (note_start + note_size > page_addr + PAGE_SIZE) + return -EINVAL; + + while (note_offs + sizeof(Elf32_Nhdr) < note_size) { + Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); + + if (nhdr->n_type == BUILD_ID && + nhdr->n_namesz == sizeof("GNU") && + nhdr->n_descsz > 0 && + nhdr->n_descsz <= BUILD_ID_SIZE_MAX) { + memcpy(build_id, + note_start + note_offs + + ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), + nhdr->n_descsz); + memset(build_id + nhdr->n_descsz, 0, + BUILD_ID_SIZE_MAX - nhdr->n_descsz); + return 0; + } + new_offs = note_offs + sizeof(Elf32_Nhdr) + + ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); + if (new_offs <= note_offs) /* overflow */ + break; + note_offs = new_offs; + } + return -EINVAL; +} + +/* Parse build ID from 32-bit ELF */ +static int get_build_id_32(void *page_addr, unsigned char *build_id) +{ + Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; + Elf32_Phdr *phdr; + int i; + + /* only supports phdr that fits in one page */ + if (ehdr->e_phnum > + (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) + return -EINVAL; + + phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); + + for (i = 0; i < ehdr->e_phnum; ++i) { + if (phdr[i].p_type == PT_NOTE && + !parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz)) + return 0; + } + return -EINVAL; +} + +/* Parse build ID from 64-bit ELF */ +static int get_build_id_64(void *page_addr, unsigned char *build_id) +{ + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; + Elf64_Phdr *phdr; + int i; + + /* only supports phdr that fits in one page */ + if (ehdr->e_phnum > + (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) + return -EINVAL; + + phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); + + for (i = 0; i < ehdr->e_phnum; ++i) { + if (phdr[i].p_type == PT_NOTE && + !parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz)) + return 0; + } + return -EINVAL; +} + +/* Parse build ID of ELF file mapped to vma */ +int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id) +{ + Elf32_Ehdr *ehdr; + struct page *page; + void *page_addr; + int ret; + + /* only works for page backed storage */ + if (!vma->vm_file) + return -EINVAL; + + page = find_get_page(vma->vm_file->f_mapping, 0); + if (!page) + return -EFAULT; /* page not mapped */ + + ret = -EINVAL; + page_addr = kmap_atomic(page); + ehdr = (Elf32_Ehdr *)page_addr; + + /* compare magic x7f "ELF" */ + if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) + goto out; + + /* only support executable file and shared object file */ + if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) + goto out; + + if (ehdr->e_ident[EI_CLASS] == ELFCLASS32) + ret = get_build_id_32(page_addr, build_id); + else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) + ret = get_build_id_64(page_addr, build_id); +out: + kunmap_atomic(page_addr); + put_page(page); + return ret; +} -- cgit v1.2.3 From 921f88fc891922b325b3668cd026a386571ed602 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 14 Jan 2021 14:40:43 +0100 Subject: bpf: Add size arg to build_id_parse function It's possible to have other build id types (other than default SHA1). Currently there's also ld support for MD5 build id. Adding size argument to build_id_parse function, that returns (if defined) size of the parsed build id, so we can recognize the build id type. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210114134044.1418404-3-jolsa@kernel.org --- include/linux/buildid.h | 3 ++- kernel/bpf/stackmap.c | 2 +- lib/buildid.c | 29 +++++++++++++++++++++-------- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/include/linux/buildid.h b/include/linux/buildid.h index 08028a212589..40232f90db6e 100644 --- a/include/linux/buildid.h +++ b/include/linux/buildid.h @@ -6,6 +6,7 @@ #define BUILD_ID_SIZE_MAX 20 -int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id); +int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, + __u32 *size); #endif diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 55d254a59f07..cabaf7db8efc 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -189,7 +189,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, for (i = 0; i < trace_nr; i++) { vma = find_vma(current->mm, ips[i]); - if (!vma || build_id_parse(vma, id_offs[i].build_id)) { + if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) { /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; diff --git a/lib/buildid.c b/lib/buildid.c index 4a4f520c0e29..6156997c3895 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -12,6 +12,7 @@ */ static inline int parse_build_id(void *page_addr, unsigned char *build_id, + __u32 *size, void *note_start, Elf32_Word note_size) { @@ -38,6 +39,8 @@ static inline int parse_build_id(void *page_addr, nhdr->n_descsz); memset(build_id + nhdr->n_descsz, 0, BUILD_ID_SIZE_MAX - nhdr->n_descsz); + if (size) + *size = nhdr->n_descsz; return 0; } new_offs = note_offs + sizeof(Elf32_Nhdr) + @@ -50,7 +53,8 @@ static inline int parse_build_id(void *page_addr, } /* Parse build ID from 32-bit ELF */ -static int get_build_id_32(void *page_addr, unsigned char *build_id) +static int get_build_id_32(void *page_addr, unsigned char *build_id, + __u32 *size) { Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; Elf32_Phdr *phdr; @@ -65,7 +69,7 @@ static int get_build_id_32(void *page_addr, unsigned char *build_id) for (i = 0; i < ehdr->e_phnum; ++i) { if (phdr[i].p_type == PT_NOTE && - !parse_build_id(page_addr, build_id, + !parse_build_id(page_addr, build_id, size, page_addr + phdr[i].p_offset, phdr[i].p_filesz)) return 0; @@ -74,7 +78,8 @@ static int get_build_id_32(void *page_addr, unsigned char *build_id) } /* Parse build ID from 64-bit ELF */ -static int get_build_id_64(void *page_addr, unsigned char *build_id) +static int get_build_id_64(void *page_addr, unsigned char *build_id, + __u32 *size) { Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; Elf64_Phdr *phdr; @@ -89,7 +94,7 @@ static int get_build_id_64(void *page_addr, unsigned char *build_id) for (i = 0; i < ehdr->e_phnum; ++i) { if (phdr[i].p_type == PT_NOTE && - !parse_build_id(page_addr, build_id, + !parse_build_id(page_addr, build_id, size, page_addr + phdr[i].p_offset, phdr[i].p_filesz)) return 0; @@ -97,8 +102,16 @@ static int get_build_id_64(void *page_addr, unsigned char *build_id) return -EINVAL; } -/* Parse build ID of ELF file mapped to vma */ -int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id) +/* + * Parse build ID of ELF file mapped to vma + * @vma: vma object + * @build_id: buffer to store build id, at least BUILD_ID_SIZE long + * @size: returns actual build id size in case of success + * + * Returns 0 on success, otherwise error (< 0). + */ +int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, + __u32 *size) { Elf32_Ehdr *ehdr; struct page *page; @@ -126,9 +139,9 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id) goto out; if (ehdr->e_ident[EI_CLASS] == ELFCLASS32) - ret = get_build_id_32(page_addr, build_id); + ret = get_build_id_32(page_addr, build_id, size); else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) - ret = get_build_id_64(page_addr, build_id); + ret = get_build_id_64(page_addr, build_id, size); out: kunmap_atomic(page_addr); put_page(page); -- cgit v1.2.3 From 88a16a1309333e43d328621ece3e9fa37027e8eb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 14 Jan 2021 14:40:44 +0100 Subject: perf: Add build id data in mmap2 event Adding support to carry build id data in mmap2 event. The build id data replaces maj/min/ino/ino_generation fields, which are also used to identify map's binary, so it's ok to replace them with build id data: union { struct { u32 maj; u32 min; u64 ino; u64 ino_generation; }; struct { u8 build_id_size; u8 __reserved_1; u16 __reserved_2; u8 build_id[20]; }; }; Replaced maj/min/ino/ino_generation fields give us size of 24 bytes. We use 20 bytes for build id data, 1 byte for size and rest is unused. There's new misc bit for mmap2 to signal there's build id data in it: #define PERF_RECORD_MISC_MMAP_BUILD_ID (1 << 14) Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20210114134044.1418404-4-jolsa@kernel.org --- include/uapi/linux/perf_event.h | 42 ++++++++++++++++++++++++++++++++++++----- kernel/events/core.c | 32 +++++++++++++++++++++++++++---- 2 files changed, 65 insertions(+), 9 deletions(-) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index b15e3447cd9f..cb6f84103560 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -386,7 +386,8 @@ struct perf_event_attr { aux_output : 1, /* generate AUX records instead of events */ cgroup : 1, /* include cgroup events */ text_poke : 1, /* include text poke events */ - __reserved_1 : 30; + build_id : 1, /* use build id in mmap2 events */ + __reserved_1 : 29; union { __u32 wakeup_events; /* wakeup every n events */ @@ -659,6 +660,22 @@ struct perf_event_mmap_page { __u64 aux_size; }; +/* + * The current state of perf_event_header::misc bits usage: + * ('|' used bit, '-' unused bit) + * + * 012 CDEF + * |||---------|||| + * + * Where: + * 0-2 CPUMODE_MASK + * + * C PROC_MAP_PARSE_TIMEOUT + * D MMAP_DATA / COMM_EXEC / FORK_EXEC / SWITCH_OUT + * E MMAP_BUILD_ID / EXACT_IP / SCHED_OUT_PREEMPT + * F (reserved) + */ + #define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0) #define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0) #define PERF_RECORD_MISC_KERNEL (1 << 0) @@ -690,6 +707,7 @@ struct perf_event_mmap_page { * * PERF_RECORD_MISC_EXACT_IP - PERF_RECORD_SAMPLE of precise events * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT - PERF_RECORD_SWITCH* events + * PERF_RECORD_MISC_MMAP_BUILD_ID - PERF_RECORD_MMAP2 event * * * PERF_RECORD_MISC_EXACT_IP: @@ -699,9 +717,13 @@ struct perf_event_mmap_page { * * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT: * Indicates that thread was preempted in TASK_RUNNING state. + * + * PERF_RECORD_MISC_MMAP_BUILD_ID: + * Indicates that mmap2 event carries build id data. */ #define PERF_RECORD_MISC_EXACT_IP (1 << 14) #define PERF_RECORD_MISC_SWITCH_OUT_PREEMPT (1 << 14) +#define PERF_RECORD_MISC_MMAP_BUILD_ID (1 << 14) /* * Reserve the last bit to indicate some extended misc field */ @@ -915,10 +937,20 @@ enum perf_event_type { * u64 addr; * u64 len; * u64 pgoff; - * u32 maj; - * u32 min; - * u64 ino; - * u64 ino_generation; + * union { + * struct { + * u32 maj; + * u32 min; + * u64 ino; + * u64 ino_generation; + * }; + * struct { + * u8 build_id_size; + * u8 __reserved_1; + * u16 __reserved_2; + * u8 build_id[20]; + * }; + * }; * u32 prot, flags; * char filename[]; * struct sample_id sample_id; diff --git a/kernel/events/core.c b/kernel/events/core.c index 55d18791a72d..c37401e3e5f7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -53,6 +53,7 @@ #include #include #include +#include #include "internal.h" @@ -397,6 +398,7 @@ static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; static atomic_t nr_cgroup_events __read_mostly; static atomic_t nr_text_poke_events __read_mostly; +static atomic_t nr_build_id_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -4673,6 +4675,8 @@ static void unaccount_event(struct perf_event *event) dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); + if (event->attr.build_id) + atomic_dec(&nr_build_id_events); if (event->attr.comm) atomic_dec(&nr_comm_events); if (event->attr.namespaces) @@ -8046,6 +8050,8 @@ struct perf_mmap_event { u64 ino; u64 ino_generation; u32 prot, flags; + u8 build_id[BUILD_ID_SIZE_MAX]; + u32 build_id_size; struct { struct perf_event_header header; @@ -8077,6 +8083,7 @@ static void perf_event_mmap_output(struct perf_event *event, struct perf_sample_data sample; int size = mmap_event->event_id.header.size; u32 type = mmap_event->event_id.header.type; + bool use_build_id; int ret; if (!perf_event_mmap_match(event, data)) @@ -8101,13 +8108,25 @@ static void perf_event_mmap_output(struct perf_event *event, mmap_event->event_id.pid = perf_event_pid(event, current); mmap_event->event_id.tid = perf_event_tid(event, current); + use_build_id = event->attr.build_id && mmap_event->build_id_size; + + if (event->attr.mmap2 && use_build_id) + mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID; + perf_output_put(&handle, mmap_event->event_id); if (event->attr.mmap2) { - perf_output_put(&handle, mmap_event->maj); - perf_output_put(&handle, mmap_event->min); - perf_output_put(&handle, mmap_event->ino); - perf_output_put(&handle, mmap_event->ino_generation); + if (use_build_id) { + u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 }; + + __output_copy(&handle, size, 4); + __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX); + } else { + perf_output_put(&handle, mmap_event->maj); + perf_output_put(&handle, mmap_event->min); + perf_output_put(&handle, mmap_event->ino); + perf_output_put(&handle, mmap_event->ino_generation); + } perf_output_put(&handle, mmap_event->prot); perf_output_put(&handle, mmap_event->flags); } @@ -8236,6 +8255,9 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; + if (atomic_read(&nr_build_id_events)) + build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size); + perf_iterate_sb(perf_event_mmap_output, mmap_event, NULL); @@ -11172,6 +11194,8 @@ static void account_event(struct perf_event *event) inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); + if (event->attr.build_id) + atomic_inc(&nr_build_id_events); if (event->attr.comm) atomic_inc(&nr_comm_events); if (event->attr.namespaces) -- cgit v1.2.3