159 files changed, 5225 insertions, 1358 deletions
diff --git a/arch/mips/net/bpf_jit_comp32.c b/arch/mips/net/bpf_jit_comp32.c index 83c975d5cca2..ace5db3fbd17 100644 --- a/arch/mips/net/bpf_jit_comp32.c +++ b/arch/mips/net/bpf_jit_comp32.c @@ -1377,11 +1377,19 @@ void build_prologue(struct jit_context *ctx) int stack, saved, locals, reserved; /* + * In the unlikely event that the TCC limit is raised to more + * than 16 bits, it is clamped to the maximum value allowed for + * the generated code (0xffff). It is better fail to compile + * instead of degrading gracefully. + */ + BUILD_BUG_ON(MAX_TAIL_CALL_CNT > 0xffff); + + /* * The first two instructions initialize TCC in the reserved (for us) * 16-byte area in the parent's stack frame. On a tail call, the * calling function jumps into the prologue after these instructions. */ - emit(ctx, ori, MIPS_R_T9, MIPS_R_ZERO, min(MAX_TAIL_CALL_CNT, 0xffff)); + emit(ctx, ori, MIPS_R_T9, MIPS_R_ZERO, MAX_TAIL_CALL_CNT); emit(ctx, sw, MIPS_R_T9, 0, MIPS_R_SP); /* diff --git a/arch/mips/net/bpf_jit_comp64.c b/arch/mips/net/bpf_jit_comp64.c index 6475828ffb36..0e7c1bdcf914 100644 --- a/arch/mips/net/bpf_jit_comp64.c +++ b/arch/mips/net/bpf_jit_comp64.c @@ -548,11 +548,19 @@ void build_prologue(struct jit_context *ctx) int stack, saved, locals, reserved; /* + * In the unlikely event that the TCC limit is raised to more + * than 16 bits, it is clamped to the maximum value allowed for + * the generated code (0xffff). It is better fail to compile + * instead of degrading gracefully. + */ + BUILD_BUG_ON(MAX_TAIL_CALL_CNT > 0xffff); + + /* * The first instruction initializes the tail call count register. * On a tail call, the calling function jumps into the prologue * after this instruction. */ - emit(ctx, ori, tc, MIPS_R_ZERO, min(MAX_TAIL_CALL_CNT, 0xffff)); + emit(ctx, ori, tc, MIPS_R_ZERO, MAX_TAIL_CALL_CNT); /* === Entry-point for tail calls === */ diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 2bd1b5f8de9b..57e9e109257e 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -414,6 +414,11 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); + +const struct bpf_func_proto * +cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); +const struct bpf_func_proto * +cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); #else static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } @@ -444,6 +449,18 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } +static inline const struct bpf_func_proto * +cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return NULL; +} + +static inline const struct bpf_func_proto * +cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return NULL; +} + static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a627a02cf8ab..9c1674973e03 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -48,6 +48,7 @@ struct mem_cgroup; struct module; struct bpf_func_state; struct ftrace_ops; +struct cgroup; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -1730,7 +1731,14 @@ int bpf_obj_get_user(const char __user *pathname, 
int flags); int __init bpf_iter_ ## target(args) { return 0; } struct bpf_iter_aux_info { + /* for map_elem iter */ struct bpf_map *map; + + /* for cgroup iter */ + struct { + struct cgroup *start; /* starting cgroup */ + enum bpf_cgroup_iter_order order; + } cgroup; }; typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, @@ -1966,6 +1974,15 @@ static inline bool unprivileged_ebpf_enabled(void) return !sysctl_unprivileged_bpf_disabled; } +/* Not all bpf prog type has the bpf_ctx. + * For the bpf prog type that has initialized the bpf_ctx, + * this function can be used to decide if a kernel function + * is called by a bpf program. + */ +static inline bool has_current_bpf_ctx(void) +{ + return !!current->bpf_ctx; +} #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -2175,6 +2192,10 @@ static inline bool unprivileged_ebpf_enabled(void) return false; } +static inline bool has_current_bpf_ctx(void) +{ + return false; +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, @@ -2362,6 +2383,7 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; extern const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto; +extern const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto; extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; extern const struct bpf_func_proto bpf_msg_redirect_map_proto; extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h new file mode 100644 index 000000000000..3e164b8efaa9 --- /dev/null +++ b/include/linux/bpf_mem_alloc.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#ifndef _BPF_MEM_ALLOC_H +#define _BPF_MEM_ALLOC_H +#include <linux/compiler_types.h> +#include <linux/workqueue.h> + +struct bpf_mem_cache; +struct bpf_mem_caches; + +struct bpf_mem_alloc { + struct bpf_mem_caches __percpu *caches; + struct bpf_mem_cache __percpu *cache; + struct work_struct work; +}; + +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu); +void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); + +/* kmalloc/kfree equivalent: */ +void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size); +void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr); + +/* kmem_cache_alloc/free equivalent: */ +void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma); +void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr); + +#endif /* _BPF_MEM_ALLOC_H */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 2e3bad8640dc..1fdddbf3546b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -212,6 +212,17 @@ struct bpf_reference_state { * is used purely to inform the user of a reference leak. */ int insn_idx; + /* There can be a case like: + * main (frame 0) + * cb (frame 1) + * func (frame 3) + * cb (frame 4) + * Hence for frame 4, if callback_ref just stored boolean, it would be + * impossible to distinguish nested callback refs. Hence store the + * frameno and compare that to callback_ref in check_reference_leak when + * exiting a callback function. 
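The has_current_bpf_ctx() helper added above lets core kernel code detect whether it is being invoked from a BPF program. A minimal sketch of the kind of caller it is meant for, modeled on the sockopt_lock_sock() declaration that appears later in this diff; the body below is an illustrative assumption, not necessarily the committed implementation:

#include <linux/bpf.h>
#include <net/sock.h>

void sockopt_lock_sock(struct sock *sk)
{
        /* A BPF caller (current->bpf_ctx set) is assumed to enter with
         * the socket already locked, so only take the lock for ordinary
         * syscall callers.
         */
        if (has_current_bpf_ctx())
                return;

        lock_sock(sk);
}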
+ */ + int callback_ref; }; /* state of the program: diff --git a/include/linux/filter.h b/include/linux/filter.h index a5f21dc3c432..527ae1d64e27 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -900,8 +900,7 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); void sk_reuseport_prog_free(struct bpf_prog *prog); int sk_detach_filter(struct sock *sk); -int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, - unsigned int len); +int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len); bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 93c262ecbdc9..78890143f079 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -118,9 +118,9 @@ extern int ip_mc_source(int add, int omode, struct sock *sk, struct ip_mreq_source *mreqs, int ifindex); extern int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf,int ifindex); extern int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, - struct ip_msfilter __user *optval, int __user *optlen); + sockptr_t optval, sockptr_t optlen); extern int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, - struct sockaddr_storage __user *p); + sockptr_t optval, size_t offset); extern int ip_mc_sf_allow(struct sock *sk, __be32 local, __be32 rmt, int dif, int sdif); extern void ip_mc_init_dev(struct in_device *); diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 6cbbfe94348c..80b8400ab8b2 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -17,7 +17,7 @@ static inline int ip_mroute_opt(int opt) } int ip_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int); -int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *); +int ip_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t); int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); int ip_mr_init(void); @@ -29,8 +29,8 @@ static inline int ip_mroute_setsockopt(struct sock *sock, int optname, return -ENOPROTOOPT; } -static inline int ip_mroute_getsockopt(struct sock *sock, int optname, - char __user *optval, int __user *optlen) +static inline int ip_mroute_getsockopt(struct sock *sk, int optname, + sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index bc351a85ce9b..8f2b307fb124 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -27,7 +27,7 @@ struct sock; #ifdef CONFIG_IPV6_MROUTE extern int ip6_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int); -extern int ip6_mroute_getsockopt(struct sock *, int, char __user *, int __user *); +extern int ip6_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t); extern int ip6_mr_input(struct sk_buff *skb); extern int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg); extern int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); @@ -42,7 +42,7 @@ static inline int ip6_mroute_setsockopt(struct sock *sock, int optname, static inline int ip6_mroute_getsockopt(struct sock *sock, - int optname, char __user *optval, int __user *optlen) + int optname, sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b51d07a727c9..43c37385f1e9 100644 --- 
a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1461,8 +1461,8 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, unsigned int key_count); struct bpf_flow_dissector; -bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, - __be16 proto, int nhoff, int hlen, unsigned int flags); +u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, + __be16 proto, int nhoff, int hlen, unsigned int flags); bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index d45902fb4cad..bae5e2369b4f 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -64,6 +64,11 @@ static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset, return 0; } +static inline int copy_to_sockptr(sockptr_t dst, const void *src, size_t size) +{ + return copy_to_sockptr_offset(dst, 0, src, size); +} + static inline void *memdup_sockptr(sockptr_t src, size_t len) { void *p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN); diff --git a/include/linux/tnum.h b/include/linux/tnum.h index 498dbcedb451..1c3948a1d6ad 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -21,7 +21,12 @@ struct tnum { struct tnum tnum_const(u64 value); /* A completely unknown value */ extern const struct tnum tnum_unknown; -/* A value that's unknown except that @min <= value <= @max */ +/* An unknown value that is a superset of @min <= value <= @max. + * + * Could include values outside the range of [@min, @max]. + * For example tnum_range(0, 2) is represented by {0, 1, 2, *3*}, + * rather than the intended set of {0, 1, 2}. + */ struct tnum tnum_range(u64 min, u64 max); /* Arithmetic and logical ops */ @@ -73,7 +78,18 @@ static inline bool tnum_is_unknown(struct tnum a) */ bool tnum_is_aligned(struct tnum a, u64 size); -/* Returns true if @b represents a subset of @a. */ +/* Returns true if @b represents a subset of @a. + * + * Note that using tnum_range() as @a requires extra cautions as tnum_in() may + * return true unexpectedly due to tnum limited ability to represent tight + * range, e.g. + * + * tnum_in(tnum_range(0, 2), tnum_const(3)) == true + * + * As a rule of thumb, if @a is explicitly coded rather than coming from + * reg->var_off, it should be in form of tnum_const(), tnum_range(0, 2**n - 1), + * or tnum_range(2**n, 2**(n+1) - 1). + */ bool tnum_in(struct tnum a, struct tnum b); /* Formatting functions. 
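A short worked example of the tnum_range() caveat documented above, following the arithmetic used by tnum_range() in kernel/bpf/tnum.c (shown only to illustrate why the range gets widened):

/*
 * tnum_range(0, 2):
 *   chi   = 0 ^ 2            = 2
 *   bits  = fls64(chi)       = 2
 *   delta = (1 << bits) - 1  = 3
 *   result: TNUM(value = 0 & ~delta = 0, mask = 3)
 *
 * A tnum with value 0 and mask 3 covers every combination of the two
 * low bits, i.e. {0, 1, 2, 3}, which is why
 * tnum_in(tnum_range(0, 2), tnum_const(3)) evaluates to true even
 * though 3 lies outside the intended [0, 2] range.
 */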
These have snprintf-like semantics: they will write diff --git a/include/net/ip.h b/include/net/ip.h index 1c979fd1904c..038097c2a152 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -743,8 +743,12 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk, int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, bool allow_ipv6); DECLARE_STATIC_KEY_FALSE(ip4_min_ttl); +int do_ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen); int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); +int do_ip_getsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, sockptr_t optlen); int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int ip_ra_control(struct sock *sk, unsigned char on, diff --git a/include/net/ipv6.h b/include/net/ipv6.h index de9dcc5652c4..d664ba5812d8 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1156,8 +1156,12 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6, */ DECLARE_STATIC_KEY_FALSE(ip6_min_hopcount); +int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen); int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); +int do_ipv6_getsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, sockptr_t optlen); int ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); @@ -1207,7 +1211,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, struct sockaddr_storage *list); int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, - struct sockaddr_storage __user *p); + sockptr_t optval, size_t ss_offset); #ifdef CONFIG_PROC_FS int ac6_proc_init(struct net *net); diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 45e0339be6fa..c48186bf4737 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -81,6 +81,10 @@ struct ipv6_bpf_stub { const struct in6_addr *daddr, __be16 dport, int dif, int sdif, struct udp_table *tbl, struct sk_buff *skb); + int (*ipv6_setsockopt)(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen); + int (*ipv6_getsockopt)(struct sock *sk, int level, int optname, + sockptr_t optval, sockptr_t optlen); }; extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; diff --git a/include/net/sock.h b/include/net/sock.h index ca469980e006..96a31026e35d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1788,6 +1788,11 @@ static inline void unlock_sock_fast(struct sock *sk, bool slow) } } +void sockopt_lock_sock(struct sock *sk); +void sockopt_release_sock(struct sock *sk); +bool sockopt_ns_capable(struct user_namespace *ns, int cap); +bool sockopt_capable(int cap); + /* Used by processes to "lock" a socket state, so that * interrupts and bottom half handlers won't change it * from under us. 
It essentially blocks any incoming @@ -1862,9 +1867,13 @@ void sock_pfree(struct sk_buff *skb); #define sock_edemux sock_efree #endif +int sk_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen); int sock_setsockopt(struct socket *sock, int level, int op, sockptr_t optval, unsigned int optlen); +int sk_getsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, sockptr_t optlen); int sock_getsockopt(struct socket *sock, int level, int op, char __user *optval, int __user *optlen); int sock_gettstamp(struct socket *sock, void __user *userstamp, diff --git a/include/net/tcp.h b/include/net/tcp.h index d10962b9f0d0..735e957f7f4b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -402,9 +402,13 @@ void tcp_init_sock(struct sock *sk); void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb); __poll_t tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); +int do_tcp_getsockopt(struct sock *sk, int level, + int optname, sockptr_t optval, sockptr_t optlen); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); bool tcp_bpf_bypass_getsockopt(int level, int optname); +int do_tcp_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen); int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); void tcp_set_keepalive(struct sock *sk, int val); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1d6085e15fc8..793103b10eab 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -87,10 +87,29 @@ struct bpf_cgroup_storage_key { __u32 attach_type; /* program attach type (enum bpf_attach_type) */ }; +enum bpf_cgroup_iter_order { + BPF_CGROUP_ITER_ORDER_UNSPEC = 0, + BPF_CGROUP_ITER_SELF_ONLY, /* process only a single object. */ + BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ + BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ +}; + union bpf_iter_link_info { struct { __u32 map_fd; } map; + struct { + enum bpf_cgroup_iter_order order; + + /* At most one of cgroup_fd and cgroup_id can be non-zero. If + * both are zero, the walk starts from the default cgroup v2 + * root. For walking v1 hierarchy, one should always explicitly + * specify cgroup_fd. + */ + __u32 cgroup_fd; + __u64 cgroup_id; + } cgroup; }; /* BPF syscall commands, see bpf(2) man-page for more details. */ @@ -4437,7 +4456,7 @@ union bpf_attr { * * **-EEXIST** if the option already exists. * - * **-EFAULT** on failrue to parse the existing header options. + * **-EFAULT** on failure to parse the existing header options. * * **-EPERM** if the helper cannot be used under the current * *skops*\ **->op**. @@ -4646,7 +4665,7 @@ union bpf_attr { * a *map* with *task* as the **key**. From this * perspective, the usage is not much different from * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this - * helper enforces the key must be an task_struct and the map must also + * helper enforces the key must be a task_struct and the map must also * be a **BPF_MAP_TYPE_TASK_STORAGE**. * * Underneath, the value is stored locally at *task* instead of @@ -4704,7 +4723,7 @@ union bpf_attr { * * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size) * Description - * Returns the stored IMA hash of the *inode* (if it's avaialable). 
+ * Returns the stored IMA hash of the *inode* (if it's available). * If the hash is larger than *size*, then only *size* * bytes will be copied to *dst* * Return @@ -4728,12 +4747,12 @@ union bpf_attr { * * The argument *len_diff* can be used for querying with a planned * size change. This allows to check MTU prior to changing packet - * ctx. Providing an *len_diff* adjustment that is larger than the + * ctx. Providing a *len_diff* adjustment that is larger than the * actual packet size (resulting in negative packet size) will in - * principle not exceed the MTU, why it is not considered a - * failure. Other BPF-helpers are needed for performing the - * planned size change, why the responsability for catch a negative - * packet size belong in those helpers. + * principle not exceed the MTU, which is why it is not considered + * a failure. Other BPF helpers are needed for performing the + * planned size change; therefore the responsibility for catching + * a negative packet size belongs in those helpers. * * Specifying *ifindex* zero means the MTU check is performed * against the current net device. This is practical if this isn't @@ -5085,17 +5104,29 @@ union bpf_attr { * * int bpf_get_retval(void) * Description - * Get the syscall's return value that will be returned to userspace. + * Get the BPF program's return value that will be returned to the upper layers. * - * This helper is currently supported by cgroup programs only. + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. * Return - * The syscall's return value. + * The BPF program's return value. * * int bpf_set_retval(int retval) * Description - * Set the syscall's return value that will be returned to userspace. + * Set the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. + * + * Note that there is the following corner case where the program exports an error + * via bpf_set_retval but signals success via 'return 1': + * + * bpf_set_retval(-EPERM); + * return 1; + * + * In this case, the BPF program's return value will use helper's -EPERM. This + * still holds true for cgroup/bind{4,6} which supports extra 'return 3' success case. * - * This helper is currently supported by cgroup programs only. * Return * 0 on success, or a negative error in case of failure. * @@ -5628,6 +5659,11 @@ enum { BPF_F_SEQ_NUMBER = (1ULL << 3), }; +/* BPF_FUNC_skb_get_tunnel_key flags. */ +enum { + BPF_F_TUNINFO_FLAGS = (1ULL << 4), +}; + /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and * BPF_FUNC_perf_event_read_value flags. */ @@ -5817,7 +5853,10 @@ struct bpf_tunnel_key { }; __u8 tunnel_tos; __u8 tunnel_ttl; - __u16 tunnel_ext; /* Padding, future use. */ + union { + __u16 tunnel_ext; /* compat */ + __be16 tunnel_flags; + }; __u32 tunnel_label; union { __u32 local_ipv4; @@ -5861,6 +5900,11 @@ enum bpf_ret_code { * represented by BPF_REDIRECT above). */ BPF_LWT_REROUTE = 128, + /* BPF_FLOW_DISSECTOR_CONTINUE: used by BPF_PROG_TYPE_FLOW_DISSECTOR + * to indicate that no custom dissection was performed, and + * fallback to standard dissector is requested. 
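To illustrate the new BPF_FLOW_DISSECTOR_CONTINUE return code described just above, a hypothetical flow-dissector program might punt everything it does not handle back to the in-kernel dissector. Program and section names below are illustrative and not part of this series:

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("flow_dissector")
int ipv4_only_dissector(struct __sk_buff *skb)
{
        struct bpf_flow_keys *keys = skb->flow_keys;

        /* Not IPv4: no custom dissection was performed, ask the kernel
         * to fall back to the standard dissector.
         */
        if (keys->n_proto != bpf_htons(ETH_P_IP))
                return BPF_FLOW_DISSECTOR_CONTINUE;

        /* ... custom IPv4 dissection would go here ... */
        return BPF_OK;
}

char _license[] SEC("license") = "GPL";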
+ */ + BPF_FLOW_DISSECTOR_CONTINUE = 129, }; struct bpf_sock { @@ -6159,11 +6203,22 @@ struct bpf_link_info { struct { __aligned_u64 target_name; /* in/out: target_name buffer ptr */ __u32 target_name_len; /* in/out: target_name buffer len */ + + /* If the iter specific field is 32 bits, it can be put + * in the first or second union. Otherwise it should be + * put in the second union. + */ union { struct { __u32 map_id; } map; }; + union { + struct { + __u64 cgroup_id; + __u32 order; + } cgroup; + }; } iter; struct { __u32 netns_ino; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 057ba8e01e70..341c94f208f4 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o -obj-$(CONFIG_BPF_SYSCALL) += btf.o +obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o @@ -24,6 +24,9 @@ endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif +ifeq ($(CONFIG_CGROUPS),y) +obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o +endif obj-$(CONFIG_CGROUP_BPF) += cgroup.o ifeq ($(CONFIG_INET),y) obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 97bb57493ed5..5dc307bdeaeb 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -694,19 +694,24 @@ struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop) int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) { + struct bpf_run_ctx run_ctx, *old_run_ctx; int ret; if (prog->aux->sleepable) { rcu_read_lock_trace(); migrate_disable(); might_fault(); + old_run_ctx = bpf_set_run_ctx(&run_ctx); ret = bpf_prog_run(prog, ctx); + bpf_reset_run_ctx(old_run_ctx); migrate_enable(); rcu_read_unlock_trace(); } else { rcu_read_lock(); migrate_disable(); + old_run_ctx = bpf_set_run_ctx(&run_ctx); ret = bpf_prog_run(prog, ctx); + bpf_reset_run_ctx(old_run_ctx); migrate_enable(); rcu_read_unlock(); } diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 4ee2e7286c23..802fc15b0d73 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -555,11 +555,11 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem, map_node))) { if (busy_counter) { migrate_disable(); - __this_cpu_inc(*busy_counter); + this_cpu_inc(*busy_counter); } bpf_selem_unlink(selem, false); if (busy_counter) { - __this_cpu_dec(*busy_counter); + this_cpu_dec(*busy_counter); migrate_enable(); } cond_resched_rcu(); diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index fa71d58b7ded..4fd845bc5a12 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -41,17 +41,21 @@ BTF_SET_END(bpf_lsm_hooks) */ BTF_SET_START(bpf_lsm_current_hooks) /* operate on freshly allocated sk without any cgroup association */ +#ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_sk_alloc_security) BTF_ID(func, bpf_lsm_sk_free_security) +#endif BTF_SET_END(bpf_lsm_current_hooks) /* List of LSM hooks that trigger while the socket is properly locked. 
*/ BTF_SET_START(bpf_lsm_locked_sockopt_hooks) +#ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_socket_sock_rcv_skb) BTF_ID(func, bpf_lsm_sock_graft) BTF_ID(func, bpf_lsm_inet_csk_clone) BTF_ID(func, bpf_lsm_inet_conn_established) +#endif BTF_SET_END(bpf_lsm_locked_sockopt_hooks) /* List of LSM hooks that trigger while the socket is _not_ locked, @@ -59,8 +63,10 @@ BTF_SET_END(bpf_lsm_locked_sockopt_hooks) * in the early init phase. */ BTF_SET_START(bpf_lsm_unlocked_sockopt_hooks) +#ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_socket_post_create) BTF_ID(func, bpf_lsm_socket_socketpair) +#endif BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks) #ifdef CONFIG_CGROUP_BPF @@ -189,6 +195,14 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto = { static const struct bpf_func_proto * bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + if (prog->expected_attach_type == BPF_LSM_CGROUP) { + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + } + switch (func_id) { case BPF_FUNC_inode_storage_get: return &bpf_inode_storage_get_proto; @@ -212,15 +226,6 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL; case BPF_FUNC_get_attach_cookie: return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL; - case BPF_FUNC_get_local_storage: - return prog->expected_attach_type == BPF_LSM_CGROUP ? - &bpf_get_local_storage_proto : NULL; - case BPF_FUNC_set_retval: - return prog->expected_attach_type == BPF_LSM_CGROUP ? - &bpf_set_retval_proto : NULL; - case BPF_FUNC_get_retval: - return prog->expected_attach_type == BPF_LSM_CGROUP ? - &bpf_get_retval_proto : NULL; #ifdef CONFIG_NET case BPF_FUNC_setsockopt: if (prog->expected_attach_type != BPF_LSM_CGROUP) diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index e9014dc62682..6f290623347e 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -26,20 +26,20 @@ static DEFINE_PER_CPU(int, bpf_task_storage_busy); static void bpf_task_storage_lock(void) { migrate_disable(); - __this_cpu_inc(bpf_task_storage_busy); + this_cpu_inc(bpf_task_storage_busy); } static void bpf_task_storage_unlock(void) { - __this_cpu_dec(bpf_task_storage_busy); + this_cpu_dec(bpf_task_storage_busy); migrate_enable(); } static bool bpf_task_storage_trylock(void) { migrate_disable(); - if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) { - __this_cpu_dec(bpf_task_storage_busy); + if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) { + this_cpu_dec(bpf_task_storage_busy); migrate_enable(); return false; } diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4a400cd63731..00c7f864900e 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1529,6 +1529,37 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, return ret; } +BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) +{ + /* flags argument is not used now, + * but provides an ability to extend the API. + * verifier checks that its value is correct. 
+ */ + enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); + struct bpf_cgroup_storage *storage; + struct bpf_cg_run_ctx *ctx; + void *ptr; + + /* get current cgroup storage from BPF run context */ + ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + storage = ctx->prog_item->cgroup_storage[stype]; + + if (stype == BPF_CGROUP_STORAGE_SHARED) + ptr = &READ_ONCE(storage->buf)->data[0]; + else + ptr = this_cpu_ptr(storage->percpu_buf); + + return (unsigned long)ptr; +} + +const struct bpf_func_proto bpf_get_local_storage_proto = { + .func = bpf_get_local_storage, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_0(bpf_get_retval) { struct bpf_cg_run_ctx *ctx = @@ -1560,32 +1591,26 @@ const struct bpf_func_proto bpf_set_retval_proto = { }; static const struct bpf_func_proto * -cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; - case BPF_FUNC_get_retval: - return &bpf_get_retval_proto; - case BPF_FUNC_set_retval: - return &bpf_set_retval_proto; default: return bpf_base_func_proto(func_id); } } -static const struct bpf_func_proto * -cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - return cgroup_base_func_proto(func_id, prog); -} - static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -2098,11 +2123,17 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - case BPF_FUNC_strtol: - return &bpf_strtol_proto; - case BPF_FUNC_strtoul: - return &bpf_strtoul_proto; case BPF_FUNC_sysctl_get_name: return &bpf_sysctl_get_name_proto; case BPF_FUNC_sysctl_get_current_value: @@ -2113,8 +2144,10 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sysctl_set_new_value_proto; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; default: - return cgroup_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } } @@ -2235,6 +2268,16 @@ static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = { static const struct bpf_func_proto * cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + 
if (func_proto) + return func_proto; + switch (func_id) { #ifdef CONFIG_NET case BPF_FUNC_get_netns_cookie: @@ -2256,8 +2299,10 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; #endif + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; default: - return cgroup_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } } @@ -2422,3 +2467,69 @@ const struct bpf_verifier_ops cg_sockopt_verifier_ops = { const struct bpf_prog_ops cg_sockopt_prog_ops = { }; + +/* Common helpers for cgroup hooks. */ +const struct bpf_func_proto * +cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; + case BPF_FUNC_get_retval: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_SOCK_OPS: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: + return NULL; + default: + return &bpf_get_retval_proto; + } + case BPF_FUNC_set_retval: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_SOCK_OPS: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: + return NULL; + default: + return &bpf_set_retval_proto; + } + default: + return NULL; + } +} + +/* Common helpers for cgroup hooks with valid process context. */ +const struct bpf_func_proto * +cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_current_comm: + return &bpf_get_current_comm_proto; + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_curr_proto; +#endif + default: + return NULL; + } +} diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c new file mode 100644 index 000000000000..0d200a993489 --- /dev/null +++ b/kernel/bpf/cgroup_iter.c @@ -0,0 +1,282 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2022 Google */ +#include <linux/bpf.h> +#include <linux/btf_ids.h> +#include <linux/cgroup.h> +#include <linux/kernel.h> +#include <linux/seq_file.h> + +#include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */ + +/* cgroup_iter provides four modes of traversal to the cgroup hierarchy. + * + * 1. Walk the descendants of a cgroup in pre-order. + * 2. Walk the descendants of a cgroup in post-order. + * 3. Walk the ancestors of a cgroup. + * 4. Show the given cgroup only. + * + * For walking descendants, cgroup_iter can walk in either pre-order or + * post-order. For walking ancestors, the iter walks up from a cgroup to + * the root. + * + * The iter program can terminate the walk early by returning 1. Walk + * continues if prog returns 0. 
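As an illustration of the iteration contract described above (return 1 to stop, seq_num == 0 marks the first element, a NULL cgroup marks the epilogue), a hypothetical cgroup_iter program could look roughly like this, assuming a vmlinux.h that carries the bpf_iter__cgroup context type; the program name and the 100-element cutoff are made up for the example:

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("iter/cgroup")
int dump_cgroup_ids(struct bpf_iter__cgroup *ctx)
{
        struct seq_file *seq = ctx->meta->seq;
        struct cgroup *cgrp = ctx->cgroup;

        /* The walk has completed: cgrp is NULL, emit an epilogue. */
        if (!cgrp) {
                BPF_SEQ_PRINTF(seq, "epilogue\n");
                return 0;
        }

        /* First element: emit a prologue. */
        if (ctx->meta->seq_num == 0)
                BPF_SEQ_PRINTF(seq, "prologue\n");

        BPF_SEQ_PRINTF(seq, "cgroup id %llu\n", cgrp->kn->id);

        /* Returning 1 terminates the walk after this element. */
        return ctx->meta->seq_num < 100 ? 0 : 1;
}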
+ * + * The prog can check (seq->num == 0) to determine whether this is + * the first element. The prog may also be passed a NULL cgroup, + * which means the walk has completed and the prog has a chance to + * do post-processing, such as outputting an epilogue. + * + * Note: the iter_prog is called with cgroup_mutex held. + * + * Currently only one session is supported, which means, depending on the + * volume of data bpf program intends to send to user space, the number + * of cgroups that can be walked is limited. For example, given the current + * buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each + * cgroup, assuming PAGE_SIZE is 4kb, the total number of cgroups that can + * be walked is 512. This is a limitation of cgroup_iter. If the output data + * is larger than the kernel buffer size, after all data in the kernel buffer + * is consumed by user space, the subsequent read() syscall will signal + * EOPNOTSUPP. In order to work around, the user may have to update their + * program to reduce the volume of data sent to output. For example, skip + * some uninteresting cgroups. + */ + +struct bpf_iter__cgroup { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct cgroup *, cgroup); +}; + +struct cgroup_iter_priv { + struct cgroup_subsys_state *start_css; + bool visited_all; + bool terminate; + int order; +}; + +static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct cgroup_iter_priv *p = seq->private; + + mutex_lock(&cgroup_mutex); + + /* cgroup_iter doesn't support read across multiple sessions. */ + if (*pos > 0) { + if (p->visited_all) + return NULL; + + /* Haven't visited all, but because cgroup_mutex has dropped, + * return -EOPNOTSUPP to indicate incomplete iteration. + */ + return ERR_PTR(-EOPNOTSUPP); + } + + ++*pos; + p->terminate = false; + p->visited_all = false; + if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE) + return css_next_descendant_pre(NULL, p->start_css); + else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) + return css_next_descendant_post(NULL, p->start_css); + else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */ + return p->start_css; +} + +static int __cgroup_iter_seq_show(struct seq_file *seq, + struct cgroup_subsys_state *css, int in_stop); + +static void cgroup_iter_seq_stop(struct seq_file *seq, void *v) +{ + struct cgroup_iter_priv *p = seq->private; + + mutex_unlock(&cgroup_mutex); + + /* pass NULL to the prog for post-processing */ + if (!v) { + __cgroup_iter_seq_show(seq, NULL, true); + p->visited_all = true; + } +} + +static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v; + struct cgroup_iter_priv *p = seq->private; + + ++*pos; + if (p->terminate) + return NULL; + + if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE) + return css_next_descendant_pre(curr, p->start_css); + else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) + return css_next_descendant_post(curr, p->start_css); + else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) + return curr->parent; + else /* BPF_CGROUP_ITER_SELF_ONLY */ + return NULL; +} + +static int __cgroup_iter_seq_show(struct seq_file *seq, + struct cgroup_subsys_state *css, int in_stop) +{ + struct cgroup_iter_priv *p = seq->private; + struct bpf_iter__cgroup ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + /* cgroup is dead, skip this element */ + if (css && cgroup_is_dead(css->cgroup)) + return 0; + + ctx.meta = &meta; + 
ctx.cgroup = css ? css->cgroup : NULL; + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + /* if prog returns > 0, terminate after this element. */ + if (ret != 0) + p->terminate = true; + + return 0; +} + +static int cgroup_iter_seq_show(struct seq_file *seq, void *v) +{ + return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v, + false); +} + +static const struct seq_operations cgroup_iter_seq_ops = { + .start = cgroup_iter_seq_start, + .next = cgroup_iter_seq_next, + .stop = cgroup_iter_seq_stop, + .show = cgroup_iter_seq_show, +}; + +BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup) + +static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux) +{ + struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv; + struct cgroup *cgrp = aux->cgroup.start; + + p->start_css = &cgrp->self; + p->terminate = false; + p->visited_all = false; + p->order = aux->cgroup.order; + return 0; +} + +static const struct bpf_iter_seq_info cgroup_iter_seq_info = { + .seq_ops = &cgroup_iter_seq_ops, + .init_seq_private = cgroup_iter_seq_init, + .seq_priv_size = sizeof(struct cgroup_iter_priv), +}; + +static int bpf_iter_attach_cgroup(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + int fd = linfo->cgroup.cgroup_fd; + u64 id = linfo->cgroup.cgroup_id; + int order = linfo->cgroup.order; + struct cgroup *cgrp; + + if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE && + order != BPF_CGROUP_ITER_DESCENDANTS_POST && + order != BPF_CGROUP_ITER_ANCESTORS_UP && + order != BPF_CGROUP_ITER_SELF_ONLY) + return -EINVAL; + + if (fd && id) + return -EINVAL; + + if (fd) + cgrp = cgroup_get_from_fd(fd); + else if (id) + cgrp = cgroup_get_from_id(id); + else /* walk the entire hierarchy by default. */ + cgrp = cgroup_get_from_path("/"); + + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + aux->cgroup.start = cgrp; + aux->cgroup.order = order; + return 0; +} + +static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux) +{ + cgroup_put(aux->cgroup.start); +} + +static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux, + struct seq_file *seq) +{ + char *buf; + + buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!buf) { + seq_puts(seq, "cgroup_path:\t<unknown>\n"); + goto show_order; + } + + /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path + * will print nothing. + * + * Path is in the calling process's cgroup namespace. 
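On the userspace side, the iterator would presumably be parameterized through the new bpf_iter_link_info.cgroup fields shown earlier and read like any other BPF iterator. A rough libbpf-based sketch, where the function name and error handling are illustrative only:

#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

static int dump_cgroup_iter(struct bpf_program *prog, int cgroup_fd)
{
        DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
        union bpf_iter_link_info linfo = {};
        struct bpf_link *link;
        char buf[4096];
        int iter_fd, len;

        /* At most one of cgroup_fd and cgroup_id may be non-zero. */
        linfo.cgroup.cgroup_fd = cgroup_fd;
        linfo.cgroup.order = BPF_CGROUP_ITER_DESCENDANTS_PRE;
        opts.link_info = &linfo;
        opts.link_info_len = sizeof(linfo);

        link = bpf_program__attach_iter(prog, &opts);
        if (!link)
                return -errno;

        iter_fd = bpf_iter_create(bpf_link__fd(link));
        if (iter_fd < 0) {
                bpf_link__destroy(link);
                return iter_fd;
        }

        while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
                fwrite(buf, 1, len, stdout);

        close(iter_fd);
        bpf_link__destroy(link);
        return len < 0 ? -errno : 0;
}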
+ */ + cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + seq_printf(seq, "cgroup_path:\t%s\n", buf); + kfree(buf); + +show_order: + if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_PRE) + seq_puts(seq, "order: descendants_pre\n"); + else if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_POST) + seq_puts(seq, "order: descendants_post\n"); + else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP) + seq_puts(seq, "order: ancestors_up\n"); + else /* BPF_CGROUP_ITER_SELF_ONLY */ + seq_puts(seq, "order: self_only\n"); +} + +static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux, + struct bpf_link_info *info) +{ + info->iter.cgroup.order = aux->cgroup.order; + info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start); + return 0; +} + +DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta, + struct cgroup *cgroup) + +static struct bpf_iter_reg bpf_cgroup_reg_info = { + .target = "cgroup", + .feature = BPF_ITER_RESCHED, + .attach_target = bpf_iter_attach_cgroup, + .detach_target = bpf_iter_detach_cgroup, + .show_fdinfo = bpf_iter_cgroup_show_fdinfo, + .fill_link_info = bpf_iter_cgroup_fill_link_info, + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__cgroup, cgroup), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &cgroup_iter_seq_info, +}; + +static int __init bpf_cgroup_iter_init(void) +{ + bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0]; + return bpf_iter_reg_target(&bpf_cgroup_reg_info); +} + +late_initcall(bpf_cgroup_iter_init); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b301a63afa2f..0fe3f136cbbe 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -14,6 +14,7 @@ #include "percpu_freelist.h" #include "bpf_lru_list.h" #include "map_in_map.h" +#include <linux/bpf_mem_alloc.h> #define HTAB_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ @@ -92,6 +93,8 @@ struct bucket { struct bpf_htab { struct bpf_map map; + struct bpf_mem_alloc ma; + struct bpf_mem_alloc pcpu_ma; struct bucket *buckets; void *elems; union { @@ -99,7 +102,12 @@ struct bpf_htab { struct bpf_lru lru; }; struct htab_elem *__percpu *extra_elems; - atomic_t count; /* number of elements in this hashtable */ + /* number of elements in non-preallocated hashtable are kept + * in either pcount or count + */ + struct percpu_counter pcount; + atomic_t count; + bool use_percpu_counter; u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ u32 hashrnd; @@ -114,14 +122,14 @@ struct htab_elem { struct { void *padding; union { - struct bpf_htab *htab; struct pcpu_freelist_node fnode; struct htab_elem *batch_flink; }; }; }; union { - struct rcu_head rcu; + /* pointer to per-cpu pointer */ + void *ptr_to_pptr; struct bpf_lru_node lru_node; }; u32 hash; @@ -162,17 +170,25 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab, unsigned long *pflags) { unsigned long flags; + bool use_raw_lock; hash = hash & HASHTAB_MAP_LOCK_MASK; - migrate_disable(); + use_raw_lock = htab_use_raw_lock(htab); + if (use_raw_lock) + preempt_disable(); + else + migrate_disable(); if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { __this_cpu_dec(*(htab->map_locked[hash])); - migrate_enable(); + if (use_raw_lock) + preempt_enable(); + else + migrate_enable(); return -EBUSY; } - if (htab_use_raw_lock(htab)) + if (use_raw_lock) raw_spin_lock_irqsave(&b->raw_lock, flags); else spin_lock_irqsave(&b->lock, flags); @@ -185,13 
+201,18 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab, struct bucket *b, u32 hash, unsigned long flags) { + bool use_raw_lock = htab_use_raw_lock(htab); + hash = hash & HASHTAB_MAP_LOCK_MASK; - if (htab_use_raw_lock(htab)) + if (use_raw_lock) raw_spin_unlock_irqrestore(&b->raw_lock, flags); else spin_unlock_irqrestore(&b->lock, flags); __this_cpu_dec(*(htab->map_locked[hash])); - migrate_enable(); + if (use_raw_lock) + preempt_enable(); + else + migrate_enable(); } static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); @@ -428,8 +449,6 @@ static int htab_map_alloc_check(union bpf_attr *attr) bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED); int numa_node = bpf_map_attr_numa_node(attr); - BUILD_BUG_ON(offsetof(struct htab_elem, htab) != - offsetof(struct htab_elem, hash_node.pprev)); BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != offsetof(struct htab_elem, hash_node.pprev)); @@ -550,6 +569,29 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab_init_buckets(htab); +/* compute_batch_value() computes batch value as num_online_cpus() * 2 + * and __percpu_counter_compare() needs + * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus() + * for percpu_counter to be faster than atomic_t. In practice the average bpf + * hash map size is 10k, which means that a system with 64 cpus will fill + * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore + * define our own batch count as 32 then 10k hash map can be filled up to 80%: + * 10k - 8k > 32 _batch_ * 64 _cpus_ + * and __percpu_counter_compare() will still be fast. At that point hash map + * collisions will dominate its performance anyway. Assume that hash map filled + * to 50+% isn't going to be O(1) and use the following formula to choose + * between percpu_counter and atomic_t. 
+ */ +#define PERCPU_COUNTER_BATCH 32 + if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH) + htab->use_percpu_counter = true; + + if (htab->use_percpu_counter) { + err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL); + if (err) + goto free_map_locked; + } + if (prealloc) { err = prealloc_init(htab); if (err) @@ -563,6 +605,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (err) goto free_prealloc; } + } else { + err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false); + if (err) + goto free_map_locked; + if (percpu) { + err = bpf_mem_alloc_init(&htab->pcpu_ma, + round_up(htab->map.value_size, 8), true); + if (err) + goto free_map_locked; + } } return &htab->map; @@ -573,6 +625,8 @@ free_map_locked: for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) free_percpu(htab->map_locked[i]); bpf_map_area_free(htab->buckets); + bpf_mem_alloc_destroy(&htab->pcpu_ma); + bpf_mem_alloc_destroy(&htab->ma); free_htab: lockdep_unregister_key(&htab->lockdep_key); bpf_map_area_free(htab); @@ -847,17 +901,9 @@ find_first_elem: static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) - free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); + bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); check_and_free_fields(htab, l); - kfree(l); -} - -static void htab_elem_free_rcu(struct rcu_head *head) -{ - struct htab_elem *l = container_of(head, struct htab_elem, rcu); - struct bpf_htab *htab = l->htab; - - htab_elem_free(htab, l); + bpf_mem_cache_free(&htab->ma, l); } static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) @@ -871,6 +917,31 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) } } +static bool is_map_full(struct bpf_htab *htab) +{ + if (htab->use_percpu_counter) + return __percpu_counter_compare(&htab->pcount, htab->map.max_entries, + PERCPU_COUNTER_BATCH) >= 0; + return atomic_read(&htab->count) >= htab->map.max_entries; +} + +static void inc_elem_count(struct bpf_htab *htab) +{ + if (htab->use_percpu_counter) + percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH); + else + atomic_inc(&htab->count); +} + +static void dec_elem_count(struct bpf_htab *htab) +{ + if (htab->use_percpu_counter) + percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH); + else + atomic_dec(&htab->count); +} + + static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { htab_put_fd_value(htab, l); @@ -879,9 +950,8 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) check_and_free_fields(htab, l); __pcpu_freelist_push(&htab->freelist, &l->fnode); } else { - atomic_dec(&htab->count); - l->htab = htab; - call_rcu(&l->rcu, htab_elem_free_rcu); + dec_elem_count(htab); + htab_elem_free(htab, l); } } @@ -906,13 +976,12 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, void *value, bool onallcpus) { - /* When using prealloc and not setting the initial value on all cpus, - * zero-fill element values for other cpus (just as what happens when - * not using prealloc). Otherwise, bpf program has no way to ensure + /* When not setting the initial value on all cpus, zero-fill element + * values for other cpus. Otherwise, bpf program has no way to ensure * known initial values for cpus other than current one * (onallcpus=false always when coming from bpf prog). 
*/ - if (htab_is_prealloc(htab) && !onallcpus) { + if (!onallcpus) { u32 size = round_up(htab->map.value_size, 8); int current_cpu = raw_smp_processor_id(); int cpu; @@ -963,19 +1032,16 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new = container_of(l, struct htab_elem, fnode); } } else { - if (atomic_inc_return(&htab->count) > htab->map.max_entries) - if (!old_elem) { + if (is_map_full(htab)) + if (!old_elem) /* when map is full and update() is replacing * old element, it's ok to allocate, since * old element will be freed immediately. * Otherwise return an error */ - l_new = ERR_PTR(-E2BIG); - goto dec_count; - } - l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, - GFP_NOWAIT | __GFP_NOWARN, - htab->map.numa_node); + return ERR_PTR(-E2BIG); + inc_elem_count(htab); + l_new = bpf_mem_cache_alloc(&htab->ma); if (!l_new) { l_new = ERR_PTR(-ENOMEM); goto dec_count; @@ -986,18 +1052,18 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, memcpy(l_new->key, key, key_size); if (percpu) { - size = round_up(size, 8); if (prealloc) { pptr = htab_elem_get_ptr(l_new, key_size); } else { /* alloc_percpu zero-fills */ - pptr = bpf_map_alloc_percpu(&htab->map, size, 8, - GFP_NOWAIT | __GFP_NOWARN); + pptr = bpf_mem_cache_alloc(&htab->pcpu_ma); if (!pptr) { - kfree(l_new); + bpf_mem_cache_free(&htab->ma, l_new); l_new = ERR_PTR(-ENOMEM); goto dec_count; } + l_new->ptr_to_pptr = pptr; + pptr = *(void **)pptr; } pcpu_init_value(htab, pptr, value, onallcpus); @@ -1016,7 +1082,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new->hash = hash; return l_new; dec_count: - atomic_dec(&htab->count); + dec_elem_count(htab); return l_new; } @@ -1416,6 +1482,10 @@ static void delete_all_elements(struct bpf_htab *htab) { int i; + /* It's called from a worker thread, so disable migration here, + * since bpf_mem_cache_free() relies on that. + */ + migrate_disable(); for (i = 0; i < htab->n_buckets; i++) { struct hlist_nulls_head *head = select_bucket(htab, i); struct hlist_nulls_node *n; @@ -1426,6 +1496,7 @@ static void delete_all_elements(struct bpf_htab *htab) htab_elem_free(htab, l); } } + migrate_enable(); } static void htab_free_malloced_timers(struct bpf_htab *htab) @@ -1475,10 +1546,10 @@ static void htab_map_free(struct bpf_map *map) * There is no need to synchronize_rcu() here to protect map elements. */ - /* some of free_htab_elem() callbacks for elements of this map may - * not have executed. Wait for them. + /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it + * underneath and is reponsible for waiting for callbacks to finish + * during bpf_mem_alloc_destroy(). */ - rcu_barrier(); if (!htab_is_prealloc(htab)) { delete_all_elements(htab); } else { @@ -1489,6 +1560,10 @@ static void htab_map_free(struct bpf_map *map) bpf_map_free_kptr_off_tab(map); free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); + bpf_mem_alloc_destroy(&htab->pcpu_ma); + bpf_mem_alloc_destroy(&htab->ma); + if (htab->use_percpu_counter) + percpu_counter_destroy(&htab->pcount); for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) free_percpu(htab->map_locked[i]); lockdep_unregister_key(&htab->lockdep_key); @@ -1691,8 +1766,11 @@ again_nocopy: /* do not grab the lock unless need it (bucket_cnt > 0). 
*/ if (locked) { ret = htab_lock_bucket(htab, b, batch, &flags); - if (ret) - goto next_batch; + if (ret) { + rcu_read_unlock(); + bpf_enable_instrumentation(); + goto after_loop; + } } bucket_cnt = 0; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 3c1b9bbcf971..fc08035f14ed 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -427,40 +427,7 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, }; - -#ifdef CONFIG_CGROUP_BPF - -BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) -{ - /* flags argument is not used now, - * but provides an ability to extend the API. - * verifier checks that its value is correct. - */ - enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); - struct bpf_cgroup_storage *storage; - struct bpf_cg_run_ctx *ctx; - void *ptr; - - /* get current cgroup storage from BPF run context */ - ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); - storage = ctx->prog_item->cgroup_storage[stype]; - - if (stype == BPF_CGROUP_STORAGE_SHARED) - ptr = &READ_ONCE(storage->buf)->data[0]; - else - ptr = this_cpu_ptr(storage->percpu_buf); - - return (unsigned long)ptr; -} - -const struct bpf_func_proto bpf_get_local_storage_proto = { - .func = bpf_get_local_storage, - .gpl_only = false, - .ret_type = RET_PTR_TO_MAP_VALUE, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_ANYTHING, -}; -#endif +#endif /* CONFIG_CGROUPS */ #define BPF_STRTOX_BASE_MASK 0x1F @@ -589,7 +556,6 @@ const struct bpf_func_proto bpf_strtoul_proto = { .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, }; -#endif BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2) { @@ -1647,12 +1613,12 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_submit_dynptr_proto; case BPF_FUNC_ringbuf_discard_dynptr: return &bpf_ringbuf_discard_dynptr_proto; - case BPF_FUNC_for_each_map_elem: - return &bpf_for_each_map_elem_proto; - case BPF_FUNC_loop: - return &bpf_loop_proto; case BPF_FUNC_strncmp: return &bpf_strncmp_proto; + case BPF_FUNC_strtol: + return &bpf_strtol_proto; + case BPF_FUNC_strtoul: + return &bpf_strtoul_proto; case BPF_FUNC_dynptr_from_mem: return &bpf_dynptr_from_mem_proto; case BPF_FUNC_dynptr_read: @@ -1689,6 +1655,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_timer_cancel_proto; case BPF_FUNC_kptr_xchg: return &bpf_kptr_xchg_proto; + case BPF_FUNC_for_each_map_elem: + return &bpf_for_each_map_elem_proto; + case BPF_FUNC_loop: + return &bpf_loop_proto; default: break; } diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c new file mode 100644 index 000000000000..5cc952da7d41 --- /dev/null +++ b/kernel/bpf/memalloc.c @@ -0,0 +1,634 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#include <linux/mm.h> +#include <linux/llist.h> +#include <linux/bpf.h> +#include <linux/irq_work.h> +#include <linux/bpf_mem_alloc.h> +#include <linux/memcontrol.h> +#include <asm/local.h> + +/* Any context (including NMI) BPF specific memory allocator. + * + * Tracing BPF programs can attach to kprobe and fentry. Hence they + * run in unknown context where calling plain kmalloc() might not be safe. + * + * Front-end kmalloc() with per-cpu per-bucket cache of free elements. + * Refill this cache asynchronously from irq_work. + * + * CPU_0 buckets + * 16 32 64 96 128 196 256 512 1024 2048 4096 + * ... 
+ * CPU_N buckets + * 16 32 64 96 128 196 256 512 1024 2048 4096 + * + * The buckets are prefilled at the start. + * BPF programs always run with migration disabled. + * It's safe to allocate from cache of the current cpu with irqs disabled. + * Free-ing is always done into bucket of the current cpu as well. + * irq_work trims extra free elements from buckets with kfree + * and refills them with kmalloc, so global kmalloc logic takes care + * of freeing objects allocated by one cpu and freed on another. + * + * Every allocated objected is padded with extra 8 bytes that contains + * struct llist_node. + */ +#define LLIST_NODE_SZ sizeof(struct llist_node) + +/* similar to kmalloc, but sizeof == 8 bucket is gone */ +static u8 size_index[24] __ro_after_init = { + 3, /* 8 */ + 3, /* 16 */ + 4, /* 24 */ + 4, /* 32 */ + 5, /* 40 */ + 5, /* 48 */ + 5, /* 56 */ + 5, /* 64 */ + 1, /* 72 */ + 1, /* 80 */ + 1, /* 88 */ + 1, /* 96 */ + 6, /* 104 */ + 6, /* 112 */ + 6, /* 120 */ + 6, /* 128 */ + 2, /* 136 */ + 2, /* 144 */ + 2, /* 152 */ + 2, /* 160 */ + 2, /* 168 */ + 2, /* 176 */ + 2, /* 184 */ + 2 /* 192 */ +}; + +static int bpf_mem_cache_idx(size_t size) +{ + if (!size || size > 4096) + return -1; + + if (size <= 192) + return size_index[(size - 1) / 8] - 1; + + return fls(size - 1) - 1; +} + +#define NUM_CACHES 11 + +struct bpf_mem_cache { + /* per-cpu list of free objects of size 'unit_size'. + * All accesses are done with interrupts disabled and 'active' counter + * protection with __llist_add() and __llist_del_first(). + */ + struct llist_head free_llist; + local_t active; + + /* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill + * are sequenced by per-cpu 'active' counter. But unit_free() cannot + * fail. When 'active' is busy the unit_free() will add an object to + * free_llist_extra. + */ + struct llist_head free_llist_extra; + + struct irq_work refill_work; + struct obj_cgroup *objcg; + int unit_size; + /* count of objects in free_llist */ + int free_cnt; + int low_watermark, high_watermark, batch; + int percpu_size; + + struct rcu_head rcu; + struct llist_head free_by_rcu; + struct llist_head waiting_for_gp; + atomic_t call_rcu_in_progress; +}; + +struct bpf_mem_caches { + struct bpf_mem_cache cache[NUM_CACHES]; +}; + +static struct llist_node notrace *__llist_del_first(struct llist_head *head) +{ + struct llist_node *entry, *next; + + entry = head->first; + if (!entry) + return NULL; + next = entry->next; + head->first = next; + return entry; +} + +static void *__alloc(struct bpf_mem_cache *c, int node) +{ + /* Allocate, but don't deplete atomic reserves that typical + * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc + * will allocate from the current numa node which is what we + * want here. + */ + gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT; + + if (c->percpu_size) { + void **obj = kmalloc_node(c->percpu_size, flags, node); + void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags); + + if (!obj || !pptr) { + free_percpu(pptr); + kfree(obj); + return NULL; + } + obj[1] = pptr; + return obj; + } + + return kmalloc_node(c->unit_size, flags, node); +} + +static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c) +{ +#ifdef CONFIG_MEMCG_KMEM + if (c->objcg) + return get_mem_cgroup_from_objcg(c->objcg); +#endif + +#ifdef CONFIG_MEMCG + return root_mem_cgroup; +#else + return NULL; +#endif +} + +/* Mostly runs from irq_work except __init phase. 
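A short worked example of the bucket lookup above (the 92-byte request size is purely illustrative): a 92-byte allocation is padded with the 8-byte LLIST_NODE_SZ header, so the index is computed for 100 bytes. Since 100 <= 192, size_index[(100 - 1) / 8] = size_index[12] = 6, and 6 - 1 = 5 selects cache index 5, whose unit size is 128 bytes per the sizes[] table in bpf_mem_alloc_init() further down. Requests up to 192 bytes always go through the table; larger requests fall back to the fls()-based path, up to the 4096-byte limit checked at the top of bpf_mem_cache_idx().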
*/ +static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node) +{ + struct mem_cgroup *memcg = NULL, *old_memcg; + unsigned long flags; + void *obj; + int i; + + memcg = get_memcg(c); + old_memcg = set_active_memcg(memcg); + for (i = 0; i < cnt; i++) { + obj = __alloc(c, node); + if (!obj) + break; + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + /* In RT irq_work runs in per-cpu kthread, so disable + * interrupts to avoid preemption and interrupts and + * reduce the chance of bpf prog executing on this cpu + * when active counter is busy. + */ + local_irq_save(flags); + /* alloc_bulk runs from irq_work which will not preempt a bpf + * program that does unit_alloc/unit_free since IRQs are + * disabled there. There is no race to increment 'active' + * counter. It protects free_llist from corruption in case NMI + * bpf prog preempted this loop. + */ + WARN_ON_ONCE(local_inc_return(&c->active) != 1); + __llist_add(obj, &c->free_llist); + c->free_cnt++; + local_dec(&c->active); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_restore(flags); + } + set_active_memcg(old_memcg); + mem_cgroup_put(memcg); +} + +static void free_one(struct bpf_mem_cache *c, void *obj) +{ + if (c->percpu_size) { + free_percpu(((void **)obj)[1]); + kfree(obj); + return; + } + + kfree(obj); +} + +static void __free_rcu(struct rcu_head *head) +{ + struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); + struct llist_node *llnode = llist_del_all(&c->waiting_for_gp); + struct llist_node *pos, *t; + + llist_for_each_safe(pos, t, llnode) + free_one(c, pos); + atomic_set(&c->call_rcu_in_progress, 0); +} + +static void __free_rcu_tasks_trace(struct rcu_head *head) +{ + struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); + + call_rcu(&c->rcu, __free_rcu); +} + +static void enque_to_free(struct bpf_mem_cache *c, void *obj) +{ + struct llist_node *llnode = obj; + + /* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work. + * Nothing races to add to free_by_rcu list. + */ + __llist_add(llnode, &c->free_by_rcu); +} + +static void do_call_rcu(struct bpf_mem_cache *c) +{ + struct llist_node *llnode, *t; + + if (atomic_xchg(&c->call_rcu_in_progress, 1)) + return; + + WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); + llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu)) + /* There is no concurrent __llist_add(waiting_for_gp) access. + * It doesn't race with llist_del_all either. + * But there could be two concurrent llist_del_all(waiting_for_gp): + * from __free_rcu() and from drain_mem_cache(). + */ + __llist_add(llnode, &c->waiting_for_gp); + /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish. + * Then use call_rcu() to wait for normal progs to finish + * and finally do free_one() on each element. 
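The wait-for-both-RCU-flavors pattern used by do_call_rcu() can be sketched in isolation. This is a minimal, hypothetical example (assuming CONFIG_TASKS_TRACE_RCU; struct demo_obj and the demo_* functions are illustrative, not part of this patch): chaining call_rcu_tasks_trace() into call_rcu() ensures that both sleepable programs (rcu_read_lock_trace readers) and normal RCU readers have finished before the object is reclaimed.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_obj {
	struct rcu_head rcu;
	/* payload ... */
};

static void demo_free_rcu(struct rcu_head *head)
{
	/* runs after a normal RCU grace period, which itself only started
	 * once the tasks-trace grace period below had elapsed
	 */
	kfree(container_of(head, struct demo_obj, rcu));
}

static void demo_free_tasks_trace_rcu(struct rcu_head *head)
{
	call_rcu(head, demo_free_rcu);
}

static void demo_free(struct demo_obj *obj)
{
	/* first wait for sleepable readers (RCU tasks trace) */
	call_rcu_tasks_trace(&obj->rcu, demo_free_tasks_trace_rcu);
}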
+ */ + call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace); +} + +static void free_bulk(struct bpf_mem_cache *c) +{ + struct llist_node *llnode, *t; + unsigned long flags; + int cnt; + + do { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_save(flags); + WARN_ON_ONCE(local_inc_return(&c->active) != 1); + llnode = __llist_del_first(&c->free_llist); + if (llnode) + cnt = --c->free_cnt; + else + cnt = 0; + local_dec(&c->active); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_restore(flags); + enque_to_free(c, llnode); + } while (cnt > (c->high_watermark + c->low_watermark) / 2); + + /* and drain free_llist_extra */ + llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra)) + enque_to_free(c, llnode); + do_call_rcu(c); +} + +static void bpf_mem_refill(struct irq_work *work) +{ + struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work); + int cnt; + + /* Racy access to free_cnt. It doesn't need to be 100% accurate */ + cnt = c->free_cnt; + if (cnt < c->low_watermark) + /* irq_work runs on this cpu and kmalloc will allocate + * from the current numa node which is what we want here. + */ + alloc_bulk(c, c->batch, NUMA_NO_NODE); + else if (cnt > c->high_watermark) + free_bulk(c); +} + +static void notrace irq_work_raise(struct bpf_mem_cache *c) +{ + irq_work_queue(&c->refill_work); +} + +/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket + * the freelist cache will be elem_size * 64 (or less) on each cpu. + * + * For bpf programs that don't have statically known allocation sizes and + * assuming (low_mark + high_mark) / 2 as an average number of elements per + * bucket and all buckets are used the total amount of memory in freelists + * on each cpu will be: + * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096 + * == ~ 116 Kbyte using below heuristic. + * Initialized, but unused bpf allocator (not bpf map specific one) will + * consume ~ 11 Kbyte per cpu. + * Typical case will be between 11K and 116K closer to 11K. + * bpf progs can and should share bpf_mem_cache when possible. + */ + +static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) +{ + init_irq_work(&c->refill_work, bpf_mem_refill); + if (c->unit_size <= 256) { + c->low_watermark = 32; + c->high_watermark = 96; + } else { + /* When page_size == 4k, order-0 cache will have low_mark == 2 + * and high_mark == 6 with batch alloc of 3 individual pages at + * a time. + * 8k allocs and above low == 1, high == 3, batch == 1. + */ + c->low_watermark = max(32 * 256 / c->unit_size, 1); + c->high_watermark = max(96 * 256 / c->unit_size, 3); + } + c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1); + + /* To avoid consuming memory assume that 1st run of bpf + * prog won't be doing more than 4 map_update_elem from + * irq disabled region + */ + alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu)); +} + +/* When size != 0 bpf_mem_cache for each cpu. + * This is typical bpf hash map use case when all elements have equal size. + * + * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on + * kmalloc/kfree. Max allocation size is 4096 in this case. + * This is bpf_dynptr and bpf_kptr use case. 
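The figures quoted in the comment above can be checked directly against prefill_mem_cache(). With (low_watermark + high_watermark) / 2 elements per bucket, the seven buckets up to 256 bytes hold (32 + 96) / 2 = 64 elements each, and the 512/1024/2048/4096 buckets hold 32/16/8/4 elements respectively, so the steady-state worst case per cpu is 64 * (16 + 32 + 64 + 96 + 128 + 196 + 256) + 32 * 512 + 16 * 1024 + 8 * 2048 + 4 * 4096 = 50,432 + 65,536 = 115,968 bytes, i.e. the quoted ~116 Kbyte. The initial prefill allocates only 4 objects for buckets up to 256 bytes and 1 object above that: 4 * 788 + (512 + 1024 + 2048 + 4096) = 10,832 bytes, the quoted ~11 Kbyte per cpu.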
+ */ +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) +{ + static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; + struct bpf_mem_caches *cc, __percpu *pcc; + struct bpf_mem_cache *c, __percpu *pc; + struct obj_cgroup *objcg = NULL; + int cpu, i, unit_size, percpu_size = 0; + + if (size) { + pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); + if (!pc) + return -ENOMEM; + + if (percpu) + /* room for llist_node and per-cpu pointer */ + percpu_size = LLIST_NODE_SZ + sizeof(void *); + else + size += LLIST_NODE_SZ; /* room for llist_node */ + unit_size = size; + +#ifdef CONFIG_MEMCG_KMEM + objcg = get_obj_cgroup_from_current(); +#endif + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(pc, cpu); + c->unit_size = unit_size; + c->objcg = objcg; + c->percpu_size = percpu_size; + prefill_mem_cache(c, cpu); + } + ma->cache = pc; + return 0; + } + + /* size == 0 && percpu is an invalid combination */ + if (WARN_ON_ONCE(percpu)) + return -EINVAL; + + pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); + if (!pcc) + return -ENOMEM; +#ifdef CONFIG_MEMCG_KMEM + objcg = get_obj_cgroup_from_current(); +#endif + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(pcc, cpu); + for (i = 0; i < NUM_CACHES; i++) { + c = &cc->cache[i]; + c->unit_size = sizes[i]; + c->objcg = objcg; + prefill_mem_cache(c, cpu); + } + } + ma->caches = pcc; + return 0; +} + +static void drain_mem_cache(struct bpf_mem_cache *c) +{ + struct llist_node *llnode, *t; + + /* No progs are using this bpf_mem_cache, but htab_map_free() called + * bpf_mem_cache_free() for all remaining elements and they can be in + * free_by_rcu or in waiting_for_gp lists, so drain those lists now. + */ + llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu)) + free_one(c, llnode); + llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp)) + free_one(c, llnode); + llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist)) + free_one(c, llnode); + llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra)) + free_one(c, llnode); +} + +static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) +{ + free_percpu(ma->cache); + free_percpu(ma->caches); + ma->cache = NULL; + ma->caches = NULL; +} + +static void free_mem_alloc(struct bpf_mem_alloc *ma) +{ + /* waiting_for_gp lists was drained, but __free_rcu might + * still execute. Wait for it now before we freeing percpu caches. + */ + rcu_barrier_tasks_trace(); + rcu_barrier(); + free_mem_alloc_no_barrier(ma); +} + +static void free_mem_alloc_deferred(struct work_struct *work) +{ + struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work); + + free_mem_alloc(ma); + kfree(ma); +} + +static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress) +{ + struct bpf_mem_alloc *copy; + + if (!rcu_in_progress) { + /* Fast path. No callbacks are pending, hence no need to do + * rcu_barrier-s. 
+ */ + free_mem_alloc_no_barrier(ma); + return; + } + + copy = kmalloc(sizeof(*ma), GFP_KERNEL); + if (!copy) { + /* Slow path with inline barrier-s */ + free_mem_alloc(ma); + return; + } + + /* Defer barriers into worker to let the rest of map memory to be freed */ + copy->cache = ma->cache; + ma->cache = NULL; + copy->caches = ma->caches; + ma->caches = NULL; + INIT_WORK(©->work, free_mem_alloc_deferred); + queue_work(system_unbound_wq, ©->work); +} + +void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) +{ + struct bpf_mem_caches *cc; + struct bpf_mem_cache *c; + int cpu, i, rcu_in_progress; + + if (ma->cache) { + rcu_in_progress = 0; + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(ma->cache, cpu); + drain_mem_cache(c); + rcu_in_progress += atomic_read(&c->call_rcu_in_progress); + } + /* objcg is the same across cpus */ + if (c->objcg) + obj_cgroup_put(c->objcg); + destroy_mem_alloc(ma, rcu_in_progress); + } + if (ma->caches) { + rcu_in_progress = 0; + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(ma->caches, cpu); + for (i = 0; i < NUM_CACHES; i++) { + c = &cc->cache[i]; + drain_mem_cache(c); + rcu_in_progress += atomic_read(&c->call_rcu_in_progress); + } + } + if (c->objcg) + obj_cgroup_put(c->objcg); + destroy_mem_alloc(ma, rcu_in_progress); + } +} + +/* notrace is necessary here and in other functions to make sure + * bpf programs cannot attach to them and cause llist corruptions. + */ +static void notrace *unit_alloc(struct bpf_mem_cache *c) +{ + struct llist_node *llnode = NULL; + unsigned long flags; + int cnt = 0; + + /* Disable irqs to prevent the following race for majority of prog types: + * prog_A + * bpf_mem_alloc + * preemption or irq -> prog_B + * bpf_mem_alloc + * + * but prog_B could be a perf_event NMI prog. + * Use per-cpu 'active' counter to order free_list access between + * unit_alloc/unit_free/bpf_mem_refill. + */ + local_irq_save(flags); + if (local_inc_return(&c->active) == 1) { + llnode = __llist_del_first(&c->free_llist); + if (llnode) + cnt = --c->free_cnt; + } + local_dec(&c->active); + local_irq_restore(flags); + + WARN_ON(cnt < 0); + + if (cnt < c->low_watermark) + irq_work_raise(c); + return llnode; +} + +/* Though 'ptr' object could have been allocated on a different cpu + * add it to the free_llist of the current cpu. + * Let kfree() logic deal with it when it's later called from irq_work. + */ +static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) +{ + struct llist_node *llnode = ptr - LLIST_NODE_SZ; + unsigned long flags; + int cnt = 0; + + BUILD_BUG_ON(LLIST_NODE_SZ > 8); + + local_irq_save(flags); + if (local_inc_return(&c->active) == 1) { + __llist_add(llnode, &c->free_llist); + cnt = ++c->free_cnt; + } else { + /* unit_free() cannot fail. Therefore add an object to atomic + * llist. free_bulk() will drain it. Though free_llist_extra is + * a per-cpu list we have to use atomic llist_add here, since + * it also can be interrupted by bpf nmi prog that does another + * unit_free() into the same free_llist_extra. + */ + llist_add(llnode, &c->free_llist_extra); + } + local_dec(&c->active); + local_irq_restore(flags); + + if (cnt > c->high_watermark) + /* free few objects from current cpu into global kmalloc pool */ + irq_work_raise(c); +} + +/* Called from BPF program or from sys_bpf syscall. + * In both cases migration is disabled. 
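Taken together, a fixed-size consumer of this API ends up looking roughly like the sketch below. It is modeled on the hashtab conversion earlier in this diff; the demo_* names are illustrative, and the caller is assumed to run with migration disabled, as the comment above requires.

#include <linux/bpf_mem_alloc.h>
#include <linux/preempt.h>

static struct bpf_mem_alloc demo_ma;	/* illustrative, not part of the patch */

static int demo_init(int elem_size)
{
	/* one fixed-size per-cpu cache; 'false' = plain, not per-cpu, objects */
	return bpf_mem_alloc_init(&demo_ma, elem_size, false);
}

static void demo_use(void)
{
	void *obj;

	migrate_disable();			/* callers keep migration off */
	obj = bpf_mem_cache_alloc(&demo_ma);	/* may return NULL, never sleeps */
	if (obj)
		bpf_mem_cache_free(&demo_ma, obj);	/* back to this cpu's free list */
	migrate_enable();
}

static void demo_exit(void)
{
	/* drains the per-cpu lists and waits for any pending RCU callbacks */
	bpf_mem_alloc_destroy(&demo_ma);
}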
+ */ +void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) +{ + int idx; + void *ret; + + if (!size) + return ZERO_SIZE_PTR; + + idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ); + if (idx < 0) + return NULL; + + ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx); + return !ret ? NULL : ret + LLIST_NODE_SZ; +} + +void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) +{ + int idx; + + if (!ptr) + return; + + idx = bpf_mem_cache_idx(__ksize(ptr - LLIST_NODE_SZ)); + if (idx < 0) + return; + + unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr); +} + +void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma) +{ + void *ret; + + ret = unit_alloc(this_cpu_ptr(ma->cache)); + return !ret ? NULL : ret + LLIST_NODE_SZ; +} + +void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr) +{ + if (!ptr) + return; + + unit_free(this_cpu_ptr(ma->cache), ptr); +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 27760627370d..4fb08c43420d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -638,7 +638,10 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) bpf_map_free_id(map, do_idr_lock); btf_put(map->btf); INIT_WORK(&map->work, bpf_map_free_deferred); - schedule_work(&map->work); + /* Avoid spawning kworkers, since they all might contend + * for the same mutex like slab_mutex. + */ + queue_work(system_unbound_wq, &map->work); } } @@ -1437,9 +1440,9 @@ err_put: #define BPF_MAP_DELETE_ELEM_LAST_FIELD key -static int map_delete_elem(union bpf_attr *attr) +static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr) { - void __user *ukey = u64_to_user_ptr(attr->key); + bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); int ufd = attr->map_fd; struct bpf_map *map; struct fd f; @@ -1459,7 +1462,7 @@ static int map_delete_elem(union bpf_attr *attr) goto err_put; } - key = __bpf_copy_key(ukey, map->key_size); + key = ___bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -4941,7 +4944,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) err = map_update_elem(&attr, uattr); break; case BPF_MAP_DELETE_ELEM: - err = map_delete_elem(&attr); + err = map_delete_elem(&attr, uattr); break; case BPF_MAP_GET_NEXT_KEY: err = map_get_next_key(&attr); @@ -5073,8 +5076,10 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) { switch (cmd) { case BPF_MAP_CREATE: + case BPF_MAP_DELETE_ELEM: case BPF_MAP_UPDATE_ELEM: case BPF_MAP_FREEZE: + case BPF_MAP_GET_FD_BY_ID: case BPF_PROG_LOAD: case BPF_BTF_LOAD: case BPF_LINK_CREATE: diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index ff87e38af8a7..ad76940b02cc 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -895,7 +895,7 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *ru run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); - if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { + if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { inc_misses_counter(prog); return 0; } @@ -930,7 +930,7 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_ bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); - __this_cpu_dec(*(prog->active)); + this_cpu_dec(*(prog->active)); migrate_enable(); rcu_read_unlock(); } @@ -966,7 +966,7 @@ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_r migrate_disable(); might_fault(); - if 
(unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { + if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { inc_misses_counter(prog); return 0; } @@ -982,7 +982,7 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); - __this_cpu_dec(*(prog->active)); + this_cpu_dec(*(prog->active)); migrate_enable(); rcu_read_unlock_trace(); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8d07493a477c..003f7ba19558 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1092,6 +1092,7 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) id = ++env->id_gen; state->refs[new_ofs].id = id; state->refs[new_ofs].insn_idx = insn_idx; + state->refs[new_ofs].callback_ref = state->in_callback_fn ? state->frameno : 0; return id; } @@ -1104,6 +1105,9 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id) last_idx = state->acquired_refs - 1; for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].id == ptr_id) { + /* Cannot release caller references in callbacks */ + if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno) + return -EINVAL; if (last_idx && i != last_idx) memcpy(&state->refs[i], &state->refs[last_idx], sizeof(*state->refs)); @@ -6918,10 +6922,17 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) caller->regs[BPF_REG_0] = *r0; } - /* Transfer references to the caller */ - err = copy_reference_state(caller, callee); - if (err) - return err; + /* callback_fn frame should have released its own additions to parent's + * reference state at this point, or check_reference_leak would + * complain, hence it must be the same as the caller. There is no need + * to copy it back. + */ + if (!callee->in_callback_fn) { + /* Transfer references to the caller */ + err = copy_reference_state(caller, callee); + if (err) + return err; + } *insn_idx = callee->callsite + 1; if (env->log.level & BPF_LOG_LEVEL) { @@ -7043,13 +7054,20 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, static int check_reference_leak(struct bpf_verifier_env *env) { struct bpf_func_state *state = cur_func(env); + bool refs_lingering = false; int i; + if (state->frameno && !state->in_callback_fn) + return 0; + for (i = 0; i < state->acquired_refs; i++) { + if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno) + continue; verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", state->refs[i].id, state->refs[i].insn_idx); + refs_lingering = true; } - return state->acquired_refs ? -EINVAL : 0; + return refs_lingering ? -EINVAL : 0; } static int check_bpf_snprintf_call(struct bpf_verifier_env *env, @@ -12338,6 +12356,16 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } + /* We must do check_reference_leak here before + * prepare_func_exit to handle the case when + * state->curframe > 0, it may be a callback + * function, for which reference_state must + * match caller reference state when it exits. 
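In BPF program terms the rule enforced here is: a reference acquired inside a callback must be released before that callback returns, and a callback may not release references owned by its caller. A minimal sketch of the accepted pattern, using a ring buffer reservation as the acquired reference (map, program and section names are illustrative):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_RINGBUF);
	__uint(max_entries, 4096);
} demo_rb SEC(".maps");

static long demo_cb(u32 index, void *ctx)
{
	u64 *slot = bpf_ringbuf_reserve(&demo_rb, sizeof(*slot), 0);

	if (!slot)
		return 0;
	*slot = index;
	/* must submit (or discard) before returning; leaking the reservation
	 * out of the callback is rejected by check_reference_leak()
	 */
	bpf_ringbuf_submit(slot, 0);
	return 0;
}

SEC("tc")
int demo_prog(struct __sk_buff *skb)
{
	bpf_loop(4, demo_cb, NULL, 0);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";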
+ */ + err = check_reference_leak(env); + if (err) + return err; + if (state->curframe) { /* exit from nested function */ err = prepare_func_exit(env, &env->insn_idx); @@ -12347,10 +12375,6 @@ static int do_check(struct bpf_verifier_env *env) continue; } - err = check_reference_leak(env); - if (err) - return err; - err = check_return_code(env); if (err) return err; @@ -12563,14 +12587,6 @@ err_put: return err; } -static int check_map_prealloc(struct bpf_map *map) -{ - return (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_PERCPU_HASH && - map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) || - !(map->map_flags & BPF_F_NO_PREALLOC); -} - static bool is_tracing_prog_type(enum bpf_prog_type type) { switch (type) { @@ -12585,50 +12601,12 @@ static bool is_tracing_prog_type(enum bpf_prog_type type) } } -static bool is_preallocated_map(struct bpf_map *map) -{ - if (!check_map_prealloc(map)) - return false; - if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta)) - return false; - return true; -} - static int check_map_prog_compatibility(struct bpf_verifier_env *env, struct bpf_map *map, struct bpf_prog *prog) { enum bpf_prog_type prog_type = resolve_prog_type(prog); - /* - * Validate that trace type programs use preallocated hash maps. - * - * For programs attached to PERF events this is mandatory as the - * perf NMI can hit any arbitrary code sequence. - * - * All other trace types using preallocated hash maps are unsafe as - * well because tracepoint or kprobes can be inside locked regions - * of the memory allocator or at a place where a recursion into the - * memory allocator would see inconsistent state. - * - * On RT enabled kernels run-time allocation of all trace type - * programs is strictly prohibited due to lock type constraints. On - * !RT kernels it is allowed for backwards compatibility reasons for - * now, but warnings are emitted so developers are made aware of - * the unsafety and can fix their programs before this is enforced. - */ - if (is_tracing_prog_type(prog_type) && !is_preallocated_map(map)) { - if (prog_type == BPF_PROG_TYPE_PERF_EVENT) { - verbose(env, "perf_event programs can only use preallocated hash map\n"); - return -EINVAL; - } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) { - verbose(env, "trace type programs can only use preallocated hash map\n"); - return -EINVAL; - } - WARN_ONCE(1, "trace type BPF program uses run-time allocation\n"); - verbose(env, "trace type programs with run-time allocated hash maps are unsafe. 
Switch to preallocated hash maps.\n"); - } if (map_value_has_spin_lock(map)) { if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { @@ -12675,12 +12653,6 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_LRU_PERCPU_HASH: case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: - if (!is_preallocated_map(map)) { - verbose(env, - "Sleepable programs can only use preallocated maps\n"); - return -EINVAL; - } - break; case BPF_MAP_TYPE_RINGBUF: case BPF_MAP_TYPE_INODE_STORAGE: case BPF_MAP_TYPE_SK_STORAGE: diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index feb59380c896..793ecff29038 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -3,6 +3,10 @@ #include <linux/sched/cputime.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/btf_ids.h> + static DEFINE_SPINLOCK(cgroup_rstat_lock); static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); @@ -141,6 +145,31 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, return pos; } +/* + * A hook for bpf stat collectors to attach to and flush their stats. + * Together with providing bpf kfuncs for cgroup_rstat_updated() and + * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that + * collect cgroup stats can integrate with rstat for efficient flushing. + * + * A static noinline declaration here could cause the compiler to optimize away + * the function. A global noinline declaration will keep the definition, but may + * optimize away the callsite. Therefore, __weak is needed to ensure that the + * call is still emitted, by telling the compiler that we don't know what the + * function might eventually be. + * + * __diag_* below are needed to dismiss the missing prototype warning. 
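For reference, the intended consumer of this hook is a tracing program attached with fentry, combined with the rstat kfuncs registered further down in this file. A minimal selftest-style sketch (program and section names are illustrative; the actual per-cpu aggregation is elided):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* update-side kfunc, registered for BPF_PROG_TYPE_TRACING below */
extern void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) __ksym;

SEC("fentry/bpf_rstat_flush")
int BPF_PROG(demo_flush, struct cgroup *cgrp, struct cgroup *parent, int cpu)
{
	/* fold this program's per-cpu stats for (cgrp, cpu) into the parent's
	 * totals here; called once per pending (cgroup, cpu) pair on flush
	 */
	return 0;
}

char LICENSE[] SEC("license") = "GPL";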
+ */ +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "kfuncs which will be used in BPF programs"); + +__weak noinline void bpf_rstat_flush(struct cgroup *cgrp, + struct cgroup *parent, int cpu) +{ +} + +__diag_pop(); + /* see cgroup_rstat_flush() */ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) @@ -168,6 +197,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) struct cgroup_subsys_state *css; cgroup_base_stat_flush(pos, cpu); + bpf_rstat_flush(pos, cgroup_parent(pos), cpu); rcu_read_lock(); list_for_each_entry_rcu(css, &pos->rstat_css_list, @@ -501,3 +531,21 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time); #endif } + +/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */ +BTF_SET8_START(bpf_rstat_kfunc_ids) +BTF_ID_FLAGS(func, cgroup_rstat_updated) +BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE) +BTF_SET8_END(bpf_rstat_kfunc_ids) + +static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = { + .owner = THIS_MODULE, + .set = &bpf_rstat_kfunc_ids, +}; + +static int __init bpf_rstat_kfunc_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, + &bpf_rstat_kfunc_set); +} +late_initcall(bpf_rstat_kfunc_init); diff --git a/net/core/filter.c b/net/core/filter.c index c191db80ce93..e872f45399b0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3010,7 +3010,7 @@ BPF_CALL_0(bpf_get_cgroup_classid_curr) return __task_get_classid(current); } -static const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { +const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { .func = bpf_get_cgroup_classid_curr, .gpl_only = false, .ret_type = RET_INTEGER, @@ -4489,7 +4489,8 @@ BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key void *to_orig = to; int err; - if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) { + if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 | + BPF_F_TUNINFO_FLAGS)))) { err = -EINVAL; goto err_clear; } @@ -4521,7 +4522,10 @@ set_compat: to->tunnel_id = be64_to_cpu(info->key.tun_id); to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; - to->tunnel_ext = 0; + if (flags & BPF_F_TUNINFO_FLAGS) + to->tunnel_flags = info->key.tun_flags; + else + to->tunnel_ext = 0; if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, @@ -5014,359 +5018,259 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static int __bpf_setsockopt(struct sock *sk, int level, int optname, - char *optval, int optlen) -{ - char devname[IFNAMSIZ]; - int val, valbool; - struct net *net; - int ifindex; - int ret = 0; - - if (!sk_fullsock(sk)) +static int sol_socket_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) +{ + switch (optname) { + case SO_REUSEADDR: + case SO_SNDBUF: + case SO_RCVBUF: + case SO_KEEPALIVE: + case SO_PRIORITY: + case SO_REUSEPORT: + case SO_RCVLOWAT: + case SO_MARK: + case SO_MAX_PACING_RATE: + case SO_BINDTOIFINDEX: + case SO_TXREHASH: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + case SO_BINDTODEVICE: + break; + default: return -EINVAL; + } - if (level == SOL_SOCKET) { - if (optlen != sizeof(int) && optname != SO_BINDTODEVICE) + if (getopt) { + if (optname == SO_BINDTODEVICE) return -EINVAL; - val = *((int *)optval); - valbool = val ? 
1 : 0; - - /* Only some socketops are supported */ - switch (optname) { - case SO_RCVBUF: - val = min_t(u32, val, READ_ONCE(sysctl_rmem_max)); - val = min_t(int, val, INT_MAX / 2); - sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - WRITE_ONCE(sk->sk_rcvbuf, - max_t(int, val * 2, SOCK_MIN_RCVBUF)); - break; - case SO_SNDBUF: - val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); - val = min_t(int, val, INT_MAX / 2); - sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - WRITE_ONCE(sk->sk_sndbuf, - max_t(int, val * 2, SOCK_MIN_SNDBUF)); - break; - case SO_MAX_PACING_RATE: /* 32bit version */ - if (val != ~0U) - cmpxchg(&sk->sk_pacing_status, - SK_PACING_NONE, - SK_PACING_NEEDED); - sk->sk_max_pacing_rate = (val == ~0U) ? - ~0UL : (unsigned int)val; - sk->sk_pacing_rate = min(sk->sk_pacing_rate, - sk->sk_max_pacing_rate); - break; - case SO_PRIORITY: - sk->sk_priority = val; - break; - case SO_RCVLOWAT: - if (val < 0) - val = INT_MAX; - if (sk->sk_socket && sk->sk_socket->ops->set_rcvlowat) - ret = sk->sk_socket->ops->set_rcvlowat(sk, val); - else - WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); - break; - case SO_MARK: - if (sk->sk_mark != val) { - sk->sk_mark = val; - sk_dst_reset(sk); - } - break; - case SO_BINDTODEVICE: - optlen = min_t(long, optlen, IFNAMSIZ - 1); - strncpy(devname, optval, optlen); - devname[optlen] = 0; + return sk_getsockopt(sk, SOL_SOCKET, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + } - ifindex = 0; - if (devname[0] != '\0') { - struct net_device *dev; + return sk_setsockopt(sk, SOL_SOCKET, optname, + KERNEL_SOCKPTR(optval), *optlen); +} - ret = -ENODEV; +static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, + char *optval, int optlen) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned long timeout; + int val; - net = sock_net(sk); - dev = dev_get_by_name(net, devname); - if (!dev) - break; - ifindex = dev->ifindex; - dev_put(dev); - } - fallthrough; - case SO_BINDTOIFINDEX: - if (optname == SO_BINDTOIFINDEX) - ifindex = val; - ret = sock_bindtoindex(sk, ifindex, false); - break; - case SO_KEEPALIVE: - if (sk->sk_prot->keepalive) - sk->sk_prot->keepalive(sk, valbool); - sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); - break; - case SO_REUSEPORT: - sk->sk_reuseport = valbool; - break; - case SO_TXREHASH: - if (val < -1 || val > 1) { - ret = -EINVAL; - break; - } - sk->sk_txrehash = (u8)val; - break; - default: - ret = -EINVAL; - } -#ifdef CONFIG_INET - } else if (level == SOL_IP) { - if (optlen != sizeof(int) || sk->sk_family != AF_INET) - return -EINVAL; + if (optlen != sizeof(int)) + return -EINVAL; - val = *((int *)optval); - /* Only some options are supported */ - switch (optname) { - case IP_TOS: - if (val < -1 || val > 0xff) { - ret = -EINVAL; - } else { - struct inet_sock *inet = inet_sk(sk); + val = *(int *)optval; - if (val == -1) - val = 0; - inet->tos = val; - } - break; - default: - ret = -EINVAL; - } -#if IS_ENABLED(CONFIG_IPV6) - } else if (level == SOL_IPV6) { - if (optlen != sizeof(int) || sk->sk_family != AF_INET6) + /* Only some options are supported */ + switch (optname) { + case TCP_BPF_IW: + if (val <= 0 || tp->data_segs_out > tp->syn_data) + return -EINVAL; + tcp_snd_cwnd_set(tp, val); + break; + case TCP_BPF_SNDCWND_CLAMP: + if (val <= 0) + return -EINVAL; + tp->snd_cwnd_clamp = val; + tp->snd_ssthresh = val; + break; + case TCP_BPF_DELACK_MAX: + timeout = usecs_to_jiffies(val); + if (timeout > TCP_DELACK_MAX || + timeout < TCP_TIMEOUT_MIN) + return -EINVAL; + inet_csk(sk)->icsk_delack_max = timeout; + break; + case TCP_BPF_RTO_MIN: + timeout = 
usecs_to_jiffies(val); + if (timeout > TCP_RTO_MIN || + timeout < TCP_TIMEOUT_MIN) return -EINVAL; + inet_csk(sk)->icsk_rto_min = timeout; + break; + default: + return -EINVAL; + } - val = *((int *)optval); - /* Only some options are supported */ - switch (optname) { - case IPV6_TCLASS: - if (val < -1 || val > 0xff) { - ret = -EINVAL; - } else { - struct ipv6_pinfo *np = inet6_sk(sk); + return 0; +} - if (val == -1) - val = 0; - np->tclass = val; - } - break; - default: - ret = -EINVAL; - } -#endif - } else if (level == SOL_TCP && - sk->sk_prot->setsockopt == tcp_setsockopt) { - if (optname == TCP_CONGESTION) { - char name[TCP_CA_NAME_MAX]; +static int sol_tcp_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) +{ + if (sk->sk_prot->setsockopt != tcp_setsockopt) + return -EINVAL; - strncpy(name, optval, min_t(long, optlen, - TCP_CA_NAME_MAX-1)); - name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false, true); - } else { - struct inet_connection_sock *icsk = inet_csk(sk); + switch (optname) { + case TCP_NODELAY: + case TCP_MAXSEG: + case TCP_KEEPIDLE: + case TCP_KEEPINTVL: + case TCP_KEEPCNT: + case TCP_SYNCNT: + case TCP_WINDOW_CLAMP: + case TCP_THIN_LINEAR_TIMEOUTS: + case TCP_USER_TIMEOUT: + case TCP_NOTSENT_LOWAT: + case TCP_SAVE_SYN: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + case TCP_CONGESTION: + if (*optlen < 2) + return -EINVAL; + break; + case TCP_SAVED_SYN: + if (*optlen < 1) + return -EINVAL; + break; + default: + if (getopt) + return -EINVAL; + return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen); + } + + if (getopt) { + if (optname == TCP_SAVED_SYN) { struct tcp_sock *tp = tcp_sk(sk); - unsigned long timeout; - if (optlen != sizeof(int)) + if (!tp->saved_syn || + *optlen > tcp_saved_syn_len(tp->saved_syn)) return -EINVAL; + memcpy(optval, tp->saved_syn->data, *optlen); + /* It cannot free tp->saved_syn here because it + * does not know if the user space still needs it. 
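From the BPF side these functions sit behind bpf_setsockopt()/bpf_getsockopt(). The snippet below is a minimal sockops sketch reading the NUL-terminated congestion-control name mentioned above; SOL_TCP and TCP_CONGESTION are defined locally from their UAPI values, as is common in BPF objects, and the program name is illustrative.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

#define SOL_TCP		6	/* IPPROTO_TCP */
#define TCP_CONGESTION	13

SEC("sockops")
int demo_sockops(struct bpf_sock_ops *skops)
{
	char cc[16] = {};

	if (skops->op != BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
		return 1;

	/* sol_tcp_sockopt() reserves the last byte so cc comes back
	 * NUL-terminated
	 */
	if (!bpf_getsockopt(skops, SOL_TCP, TCP_CONGESTION, cc, sizeof(cc)))
		bpf_printk("cc: %s", cc);

	return 1;
}

char LICENSE[] SEC("license") = "GPL";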
+ */ + return 0; + } - val = *((int *)optval); - /* Only some options are supported */ - switch (optname) { - case TCP_BPF_IW: - if (val <= 0 || tp->data_segs_out > tp->syn_data) - ret = -EINVAL; - else - tcp_snd_cwnd_set(tp, val); - break; - case TCP_BPF_SNDCWND_CLAMP: - if (val <= 0) { - ret = -EINVAL; - } else { - tp->snd_cwnd_clamp = val; - tp->snd_ssthresh = val; - } - break; - case TCP_BPF_DELACK_MAX: - timeout = usecs_to_jiffies(val); - if (timeout > TCP_DELACK_MAX || - timeout < TCP_TIMEOUT_MIN) - return -EINVAL; - inet_csk(sk)->icsk_delack_max = timeout; - break; - case TCP_BPF_RTO_MIN: - timeout = usecs_to_jiffies(val); - if (timeout > TCP_RTO_MIN || - timeout < TCP_TIMEOUT_MIN) - return -EINVAL; - inet_csk(sk)->icsk_rto_min = timeout; - break; - case TCP_SAVE_SYN: - if (val < 0 || val > 1) - ret = -EINVAL; - else - tp->save_syn = val; - break; - case TCP_KEEPIDLE: - ret = tcp_sock_set_keepidle_locked(sk, val); - break; - case TCP_KEEPINTVL: - if (val < 1 || val > MAX_TCP_KEEPINTVL) - ret = -EINVAL; - else - tp->keepalive_intvl = val * HZ; - break; - case TCP_KEEPCNT: - if (val < 1 || val > MAX_TCP_KEEPCNT) - ret = -EINVAL; - else - tp->keepalive_probes = val; - break; - case TCP_SYNCNT: - if (val < 1 || val > MAX_TCP_SYNCNT) - ret = -EINVAL; - else - icsk->icsk_syn_retries = val; - break; - case TCP_USER_TIMEOUT: - if (val < 0) - ret = -EINVAL; - else - icsk->icsk_user_timeout = val; - break; - case TCP_NOTSENT_LOWAT: - tp->notsent_lowat = val; - sk->sk_write_space(sk); - break; - case TCP_WINDOW_CLAMP: - ret = tcp_set_window_clamp(sk, val); - break; - default: - ret = -EINVAL; - } + if (optname == TCP_CONGESTION) { + if (!inet_csk(sk)->icsk_ca_ops) + return -EINVAL; + /* BPF expects NULL-terminated tcp-cc string */ + optval[--(*optlen)] = '\0'; } -#endif - } else { - ret = -EINVAL; + + return do_tcp_getsockopt(sk, SOL_TCP, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); } - return ret; + + return do_tcp_setsockopt(sk, SOL_TCP, optname, + KERNEL_SOCKPTR(optval), *optlen); } -static int _bpf_setsockopt(struct sock *sk, int level, int optname, - char *optval, int optlen) +static int sol_ip_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) { - if (sk_fullsock(sk)) - sock_owned_by_me(sk); - return __bpf_setsockopt(sk, level, optname, optval, optlen); + if (sk->sk_family != AF_INET) + return -EINVAL; + + switch (optname) { + case IP_TOS: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (getopt) + return do_ip_getsockopt(sk, SOL_IP, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + + return do_ip_setsockopt(sk, SOL_IP, optname, + KERNEL_SOCKPTR(optval), *optlen); } -static int __bpf_getsockopt(struct sock *sk, int level, int optname, - char *optval, int optlen) +static int sol_ipv6_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) { - if (!sk_fullsock(sk)) - goto err_clear; + if (sk->sk_family != AF_INET6) + return -EINVAL; - if (level == SOL_SOCKET) { - if (optlen != sizeof(int)) - goto err_clear; + switch (optname) { + case IPV6_TCLASS: + case IPV6_AUTOFLOWLABEL: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + default: + return -EINVAL; + } - switch (optname) { - case SO_RCVBUF: - *((int *)optval) = sk->sk_rcvbuf; - break; - case SO_SNDBUF: - *((int *)optval) = sk->sk_sndbuf; - break; - case SO_MARK: - *((int *)optval) = sk->sk_mark; - break; - case SO_PRIORITY: - *((int *)optval) = sk->sk_priority; - break; - case SO_BINDTOIFINDEX: - 
*((int *)optval) = sk->sk_bound_dev_if; - break; - case SO_REUSEPORT: - *((int *)optval) = sk->sk_reuseport; - break; - case SO_TXREHASH: - *((int *)optval) = sk->sk_txrehash; - break; - default: - goto err_clear; - } -#ifdef CONFIG_INET - } else if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { - struct inet_connection_sock *icsk; - struct tcp_sock *tp; + if (getopt) + return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); - switch (optname) { - case TCP_CONGESTION: - icsk = inet_csk(sk); + return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname, + KERNEL_SOCKPTR(optval), *optlen); +} - if (!icsk->icsk_ca_ops || optlen <= 1) - goto err_clear; - strncpy(optval, icsk->icsk_ca_ops->name, optlen); - optval[optlen - 1] = 0; - break; - case TCP_SAVED_SYN: - tp = tcp_sk(sk); +static int __bpf_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + if (!sk_fullsock(sk)) + return -EINVAL; - if (optlen <= 0 || !tp->saved_syn || - optlen > tcp_saved_syn_len(tp->saved_syn)) - goto err_clear; - memcpy(optval, tp->saved_syn->data, optlen); - break; - default: - goto err_clear; - } - } else if (level == SOL_IP) { - struct inet_sock *inet = inet_sk(sk); + if (level == SOL_SOCKET) + return sol_socket_sockopt(sk, optname, optval, &optlen, false); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) + return sol_ip_sockopt(sk, optname, optval, &optlen, false); + else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) + return sol_ipv6_sockopt(sk, optname, optval, &optlen, false); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) + return sol_tcp_sockopt(sk, optname, optval, &optlen, false); - if (optlen != sizeof(int) || sk->sk_family != AF_INET) - goto err_clear; + return -EINVAL; +} - /* Only some options are supported */ - switch (optname) { - case IP_TOS: - *((int *)optval) = (int)inet->tos; - break; - default: - goto err_clear; - } -#if IS_ENABLED(CONFIG_IPV6) - } else if (level == SOL_IPV6) { - struct ipv6_pinfo *np = inet6_sk(sk); +static int _bpf_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + if (sk_fullsock(sk)) + sock_owned_by_me(sk); + return __bpf_setsockopt(sk, level, optname, optval, optlen); +} - if (optlen != sizeof(int) || sk->sk_family != AF_INET6) - goto err_clear; +static int __bpf_getsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + int err, saved_optlen = optlen; - /* Only some options are supported */ - switch (optname) { - case IPV6_TCLASS: - *((int *)optval) = (int)np->tclass; - break; - default: - goto err_clear; - } -#endif -#endif - } else { - goto err_clear; + if (!sk_fullsock(sk)) { + err = -EINVAL; + goto done; } - return 0; -err_clear: - memset(optval, 0, optlen); - return -EINVAL; + + if (level == SOL_SOCKET) + err = sol_socket_sockopt(sk, optname, optval, &optlen, true); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) + err = sol_tcp_sockopt(sk, optname, optval, &optlen, true); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) + err = sol_ip_sockopt(sk, optname, optval, &optlen, true); + else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) + err = sol_ipv6_sockopt(sk, optname, optval, &optlen, true); + else + err = -EINVAL; + +done: + if (err) + optlen = 0; + if (optlen < saved_optlen) + memset(optval + optlen, 0, saved_optlen - optlen); + return err; } static int _bpf_getsockopt(struct sock *sk, int level, int optname, @@ -7667,34 +7571,23 @@ const struct bpf_func_proto 
bpf_sk_storage_get_cg_sock_proto __weak; static const struct bpf_func_proto * sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - /* inet and inet6 sockets are created in a process - * context so there is always a valid uid/gid - */ - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; -#ifdef CONFIG_CGROUPS - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_get_current_ancestor_cgroup_id: - return &bpf_get_current_ancestor_cgroup_id_proto; -#endif -#ifdef CONFIG_CGROUP_NET_CLASSID - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_curr_proto; -#endif case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_cg_sock_proto; case BPF_FUNC_ktime_get_coarse_ns: @@ -7707,12 +7600,17 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) static const struct bpf_func_proto * sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - /* inet and inet6 sockets are created in a process - * context so there is always a valid uid/gid - */ - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; case BPF_FUNC_bind: switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_CONNECT: @@ -7725,24 +7623,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_sock_addr_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_addr_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; -#ifdef CONFIG_CGROUPS - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_get_current_ancestor_cgroup_id: - return &bpf_get_current_ancestor_cgroup_id_proto; -#endif -#ifdef CONFIG_CGROUP_NET_CLASSID - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_curr_proto; -#endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sock_addr_sk_lookup_tcp_proto; @@ -7823,9 +7705,13 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto __weak; static const struct bpf_func_proto * cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - case 
BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: @@ -8065,6 +7951,12 @@ const struct bpf_func_proto bpf_sock_hash_update_proto __weak; static const struct bpf_func_proto * sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_sock_ops_setsockopt_proto; @@ -8078,8 +7970,6 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_hash_update_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_ops_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_storage_get: @@ -10812,14 +10702,13 @@ int sk_detach_filter(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_detach_filter); -int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, - unsigned int len) +int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len) { struct sock_fprog_kern *fprog; struct sk_filter *filter; int ret = 0; - lock_sock(sk); + sockopt_lock_sock(sk); filter = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); if (!filter) @@ -10844,7 +10733,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, goto out; ret = -EFAULT; - if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog))) + if (copy_to_sockptr(optval, fprog->filter, bpf_classic_proglen(fprog))) goto out; /* Instead of bytes, the API requests to return the number @@ -10852,7 +10741,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, */ ret = fprog->len; out: - release_sock(sk); + sockopt_release_sock(sk); return ret; } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 764c4cb3fe8f..990429c69ccd 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -866,8 +866,8 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, } } -bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, - __be16 proto, int nhoff, int hlen, unsigned int flags) +u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, + __be16 proto, int nhoff, int hlen, unsigned int flags) { struct bpf_flow_keys *flow_keys = ctx->flow_keys; u32 result; @@ -892,7 +892,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, flow_keys->thoff = clamp_t(u16, flow_keys->thoff, flow_keys->nhoff, hlen); - return result == BPF_OK; + return result; } static bool is_pppoe_ses_hdr_valid(const struct pppoe_hdr *hdr) @@ -1008,6 +1008,7 @@ bool __skb_flow_dissect(const struct net *net, }; __be16 n_proto = proto; struct bpf_prog *prog; + u32 result; if (skb) { ctx.skb = skb; @@ -1019,13 +1020,16 @@ bool __skb_flow_dissect(const struct net *net, } prog = READ_ONCE(run_array->items[0].prog); - ret = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, - hlen, flags); + result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, + hlen, flags); + if (result == BPF_FLOW_DISSECTOR_CONTINUE) + goto dissect_continue; __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); rcu_read_unlock(); - return ret; + return result == BPF_OK; } +dissect_continue: rcu_read_unlock(); } diff --git a/net/core/sock.c b/net/core/sock.c index 
788c1372663c..eeb6cbac6f49 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -703,15 +703,17 @@ static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) goto out; } - return sock_bindtoindex(sk, index, true); + sockopt_lock_sock(sk); + ret = sock_bindtoindex_locked(sk, index); + sockopt_release_sock(sk); out: #endif return ret; } -static int sock_getbindtodevice(struct sock *sk, char __user *optval, - int __user *optlen, int len) +static int sock_getbindtodevice(struct sock *sk, sockptr_t optval, + sockptr_t optlen, int len) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES @@ -735,12 +737,12 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval, len = strlen(devname) + 1; ret = -EFAULT; - if (copy_to_user(optval, devname, len)) + if (copy_to_sockptr(optval, devname, len)) goto out; zero: ret = -EFAULT; - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) goto out; ret = 0; @@ -1036,17 +1038,51 @@ static int sock_reserve_memory(struct sock *sk, int bytes) return 0; } +void sockopt_lock_sock(struct sock *sk) +{ + /* When current->bpf_ctx is set, the setsockopt is called from + * a bpf prog. bpf has ensured the sk lock has been + * acquired before calling setsockopt(). + */ + if (has_current_bpf_ctx()) + return; + + lock_sock(sk); +} +EXPORT_SYMBOL(sockopt_lock_sock); + +void sockopt_release_sock(struct sock *sk) +{ + if (has_current_bpf_ctx()) + return; + + release_sock(sk); +} +EXPORT_SYMBOL(sockopt_release_sock); + +bool sockopt_ns_capable(struct user_namespace *ns, int cap) +{ + return has_current_bpf_ctx() || ns_capable(ns, cap); +} +EXPORT_SYMBOL(sockopt_ns_capable); + +bool sockopt_capable(int cap) +{ + return has_current_bpf_ctx() || capable(cap); +} +EXPORT_SYMBOL(sockopt_capable); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. */ -int sock_setsockopt(struct socket *sock, int level, int optname, - sockptr_t optval, unsigned int optlen) +int sk_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct so_timestamping timestamping; + struct socket *sock = sk->sk_socket; struct sock_txtime sk_txtime; - struct sock *sk = sock->sk; int val; int valbool; struct linger ling; @@ -1067,11 +1103,11 @@ int sock_setsockopt(struct socket *sock, int level, int optname, valbool = val ? 1 : 0; - lock_sock(sk); + sockopt_lock_sock(sk); switch (optname) { case SO_DEBUG: - if (val && !capable(CAP_NET_ADMIN)) + if (val && !sockopt_capable(CAP_NET_ADMIN)) ret = -EACCES; else sock_valbool_flag(sk, SOCK_DBG, valbool); @@ -1115,7 +1151,7 @@ set_sndbuf: break; case SO_SNDBUFFORCE: - if (!capable(CAP_NET_ADMIN)) { + if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } @@ -1137,7 +1173,7 @@ set_sndbuf: break; case SO_RCVBUFFORCE: - if (!capable(CAP_NET_ADMIN)) { + if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } @@ -1164,8 +1200,8 @@ set_sndbuf: case SO_PRIORITY: if ((val >= 0 && val <= 6) || - ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || - ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) sk->sk_priority = val; else ret = -EPERM; @@ -1228,7 +1264,7 @@ set_sndbuf: case SO_RCVLOWAT: if (val < 0) val = INT_MAX; - if (sock->ops->set_rcvlowat) + if (sock && sock->ops->set_rcvlowat) ret = sock->ops->set_rcvlowat(sk, val); else WRITE_ONCE(sk->sk_rcvlowat, val ? 
: 1); @@ -1310,8 +1346,8 @@ set_sndbuf: clear_bit(SOCK_PASSSEC, &sock->flags); break; case SO_MARK: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && - !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } @@ -1319,8 +1355,8 @@ set_sndbuf: __sock_set_mark(sk, val); break; case SO_RCVMARK: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && - !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } @@ -1354,7 +1390,7 @@ set_sndbuf: #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: /* allow unprivileged users to decrease the value */ - if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) + if ((val > sk->sk_ll_usec) && !sockopt_capable(CAP_NET_ADMIN)) ret = -EPERM; else { if (val < 0) @@ -1364,13 +1400,13 @@ set_sndbuf: } break; case SO_PREFER_BUSY_POLL: - if (valbool && !capable(CAP_NET_ADMIN)) + if (valbool && !sockopt_capable(CAP_NET_ADMIN)) ret = -EPERM; else WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); break; case SO_BUSY_POLL_BUDGET: - if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) { + if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; } else { if (val < 0 || val > U16_MAX) @@ -1441,7 +1477,7 @@ set_sndbuf: * scheduler has enough safe guards. */ if (sk_txtime.clockid != CLOCK_MONOTONIC && - !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } @@ -1496,9 +1532,16 @@ set_sndbuf: ret = -ENOPROTOOPT; break; } - release_sock(sk); + sockopt_release_sock(sk); return ret; } + +int sock_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + return sk_setsockopt(sock->sk, level, optname, + optval, optlen); +} EXPORT_SYMBOL(sock_setsockopt); static const struct cred *sk_get_peer_cred(struct sock *sk) @@ -1525,22 +1568,25 @@ static void cred_to_ucred(struct pid *pid, const struct cred *cred, } } -static int groups_to_user(gid_t __user *dst, const struct group_info *src) +static int groups_to_user(sockptr_t dst, const struct group_info *src) { struct user_namespace *user_ns = current_user_ns(); int i; - for (i = 0; i < src->ngroups; i++) - if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) + for (i = 0; i < src->ngroups; i++) { + gid_t gid = from_kgid_munged(user_ns, src->gid[i]); + + if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid))) return -EFAULT; + } return 0; } -int sock_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) +int sk_getsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, sockptr_t optlen) { - struct sock *sk = sock->sk; + struct socket *sock = sk->sk_socket; union { int val; @@ -1557,7 +1603,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, int lv = sizeof(int); int len; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0) return -EINVAL; @@ -1692,7 +1738,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); spin_unlock(&sk->sk_peer_lock); - if (copy_to_user(optval, &peercred, len)) + if (copy_to_sockptr(optval, &peercred, len)) 
return -EFAULT; goto lenout; } @@ -1710,11 +1756,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname, if (len < n * sizeof(gid_t)) { len = n * sizeof(gid_t); put_cred(cred); - return put_user(len, optlen) ? -EFAULT : -ERANGE; + return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE; } len = n * sizeof(gid_t); - ret = groups_to_user((gid_t __user *)optval, cred->group_info); + ret = groups_to_user(optval, cred->group_info); put_cred(cred); if (ret) return ret; @@ -1730,7 +1776,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, return -ENOTCONN; if (lv < len) return -EINVAL; - if (copy_to_user(optval, address, len)) + if (copy_to_sockptr(optval, address, len)) return -EFAULT; goto lenout; } @@ -1747,7 +1793,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_PEERSEC: - return security_socket_getpeersec_stream(sock, optval, optlen, len); + return security_socket_getpeersec_stream(sock, optval.user, optlen.user, len); case SO_MARK: v.val = sk->sk_mark; @@ -1779,7 +1825,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, return sock_getbindtodevice(sk, optval, optlen, len); case SO_GET_FILTER: - len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); + len = sk_get_filter(sk, optval, len); if (len < 0) return len; @@ -1827,7 +1873,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, sk_get_meminfo(sk, meminfo); len = min_t(unsigned int, len, sizeof(meminfo)); - if (copy_to_user(optval, &meminfo, len)) + if (copy_to_sockptr(optval, &meminfo, len)) return -EFAULT; goto lenout; @@ -1896,14 +1942,22 @@ int sock_getsockopt(struct socket *sock, int level, int optname, if (len > lv) len = lv; - if (copy_to_user(optval, &v, len)) + if (copy_to_sockptr(optval, &v, len)) return -EFAULT; lenout: - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; return 0; } +int sock_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + return sk_getsockopt(sock->sk, level, optname, + USER_SOCKPTR(optval), + USER_SOCKPTR(optlen)); +} + /* * Initialize an sk_lock. * diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index e3ab0cb61624..df0660d818ac 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2529,11 +2529,10 @@ done: err = ip_mc_leave_group(sk, &imr); return err; } - int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, - struct ip_msfilter __user *optval, int __user *optlen) + sockptr_t optval, sockptr_t optlen) { - int err, len, count, copycount; + int err, len, count, copycount, msf_size; struct ip_mreqn imr; __be32 addr = msf->imsf_multiaddr; struct ip_mc_socklist *pmc; @@ -2575,12 +2574,15 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, copycount = count < msf->imsf_numsrc ? 
count : msf->imsf_numsrc; len = flex_array_size(psl, sl_addr, copycount); msf->imsf_numsrc = count; - if (put_user(IP_MSFILTER_SIZE(copycount), optlen) || - copy_to_user(optval, msf, IP_MSFILTER_SIZE(0))) { + msf_size = IP_MSFILTER_SIZE(copycount); + if (copy_to_sockptr(optlen, &msf_size, sizeof(int)) || + copy_to_sockptr(optval, msf, IP_MSFILTER_SIZE(0))) { return -EFAULT; } if (len && - copy_to_user(&optval->imsf_slist_flex[0], psl->sl_addr, len)) + copy_to_sockptr_offset(optval, + offsetof(struct ip_msfilter, imsf_slist_flex), + psl->sl_addr, len)) return -EFAULT; return 0; done: @@ -2588,7 +2590,7 @@ done: } int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, - struct sockaddr_storage __user *p) + sockptr_t optval, size_t ss_offset) { int i, count, copycount; struct sockaddr_in *psin; @@ -2618,15 +2620,17 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, count = psl ? psl->sl_count : 0; copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; gsf->gf_numsrc = count; - for (i = 0; i < copycount; i++, p++) { + for (i = 0; i < copycount; i++) { struct sockaddr_storage ss; psin = (struct sockaddr_in *)&ss; memset(&ss, 0, sizeof(ss)); psin->sin_family = AF_INET; psin->sin_addr.s_addr = psl->sl_addr[i]; - if (copy_to_user(p, &ss, sizeof(ss))) + if (copy_to_sockptr_offset(optval, ss_offset, + &ss, sizeof(ss))) return -EFAULT; + ss_offset += sizeof(ss); } return 0; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index e49a61a053a6..6e19cad154f5 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -888,8 +888,8 @@ static int compat_ip_mcast_join_leave(struct sock *sk, int optname, DEFINE_STATIC_KEY_FALSE(ip4_min_ttl); -static int do_ip_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen) +int do_ip_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); @@ -944,7 +944,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, err = 0; if (needs_rtnl) rtnl_lock(); - lock_sock(sk); + sockopt_lock_sock(sk); switch (optname) { case IP_OPTIONS: @@ -1333,14 +1333,14 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, case IP_IPSEC_POLICY: case IP_XFRM_POLICY: err = -EPERM; - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) break; err = xfrm_user_policy(sk, optname, optval, optlen); break; case IP_TRANSPARENT: - if (!!val && !ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && - !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + if (!!val && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { err = -EPERM; break; } @@ -1368,13 +1368,13 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, err = -ENOPROTOOPT; break; } - release_sock(sk); + sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return err; e_inval: - release_sock(sk); + sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return -EINVAL; @@ -1462,37 +1462,37 @@ static bool getsockopt_needs_rtnl(int optname) return false; } -static int ip_get_mcast_msfilter(struct sock *sk, void __user *optval, - int __user *optlen, int len) +static int ip_get_mcast_msfilter(struct sock *sk, sockptr_t optval, + sockptr_t optlen, int len) { const int size0 = offsetof(struct group_filter, gf_slist_flex); - struct group_filter __user *p = optval; struct 
group_filter gsf; - int num; + int num, gsf_size; int err; if (len < size0) return -EINVAL; - if (copy_from_user(&gsf, p, size0)) + if (copy_from_sockptr(&gsf, optval, size0)) return -EFAULT; num = gsf.gf_numsrc; - err = ip_mc_gsfget(sk, &gsf, p->gf_slist_flex); + err = ip_mc_gsfget(sk, &gsf, optval, + offsetof(struct group_filter, gf_slist_flex)); if (err) return err; if (gsf.gf_numsrc < num) num = gsf.gf_numsrc; - if (put_user(GROUP_FILTER_SIZE(num), optlen) || - copy_to_user(p, &gsf, size0)) + gsf_size = GROUP_FILTER_SIZE(num); + if (copy_to_sockptr(optlen, &gsf_size, sizeof(int)) || + copy_to_sockptr(optval, &gsf, size0)) return -EFAULT; return 0; } -static int compat_ip_get_mcast_msfilter(struct sock *sk, void __user *optval, - int __user *optlen, int len) +static int compat_ip_get_mcast_msfilter(struct sock *sk, sockptr_t optval, + sockptr_t optlen, int len) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); - struct compat_group_filter __user *p = optval; struct compat_group_filter gf32; struct group_filter gf; int num; @@ -1500,7 +1500,7 @@ static int compat_ip_get_mcast_msfilter(struct sock *sk, void __user *optval, if (len < size0) return -EINVAL; - if (copy_from_user(&gf32, p, size0)) + if (copy_from_sockptr(&gf32, optval, size0)) return -EFAULT; gf.gf_interface = gf32.gf_interface; @@ -1508,21 +1508,24 @@ static int compat_ip_get_mcast_msfilter(struct sock *sk, void __user *optval, num = gf.gf_numsrc = gf32.gf_numsrc; gf.gf_group = gf32.gf_group; - err = ip_mc_gsfget(sk, &gf, p->gf_slist_flex); + err = ip_mc_gsfget(sk, &gf, optval, + offsetof(struct compat_group_filter, gf_slist_flex)); if (err) return err; if (gf.gf_numsrc < num) num = gf.gf_numsrc; len = GROUP_FILTER_SIZE(num) - (sizeof(gf) - sizeof(gf32)); - if (put_user(len, optlen) || - put_user(gf.gf_fmode, &p->gf_fmode) || - put_user(gf.gf_numsrc, &p->gf_numsrc)) + if (copy_to_sockptr(optlen, &len, sizeof(int)) || + copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode), + &gf.gf_fmode, sizeof(gf.gf_fmode)) || + copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc), + &gf.gf_numsrc, sizeof(gf.gf_numsrc))) return -EFAULT; return 0; } -static int do_ip_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) +int do_ip_getsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, sockptr_t optlen) { struct inet_sock *inet = inet_sk(sk); bool needs_rtnl = getsockopt_needs_rtnl(optname); @@ -1535,14 +1538,14 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, if (ip_mroute_opt(optname)) return ip_mroute_getsockopt(sk, optname, optval, optlen); - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0) return -EINVAL; if (needs_rtnl) rtnl_lock(); - lock_sock(sk); + sockopt_lock_sock(sk); switch (optname) { case IP_OPTIONS: @@ -1558,17 +1561,19 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, memcpy(optbuf, &inet_opt->opt, sizeof(struct ip_options) + inet_opt->opt.optlen); - release_sock(sk); + sockopt_release_sock(sk); - if (opt->optlen == 0) - return put_user(0, optlen); + if (opt->optlen == 0) { + len = 0; + return copy_to_sockptr(optlen, &len, sizeof(int)); + } ip_options_undo(opt); len = min_t(unsigned int, len, opt->optlen); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, opt->__data, len)) + if (copy_to_sockptr(optval, opt->__data, len)) 
return -EFAULT; return 0; } @@ -1632,7 +1637,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, dst_release(dst); } if (!val) { - release_sock(sk); + sockopt_release_sock(sk); return -ENOTCONN; } break; @@ -1657,11 +1662,11 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, struct in_addr addr; len = min_t(unsigned int, len, sizeof(struct in_addr)); addr.s_addr = inet->mc_addr; - release_sock(sk); + sockopt_release_sock(sk); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &addr, len)) + if (copy_to_sockptr(optval, &addr, len)) return -EFAULT; return 0; } @@ -1673,12 +1678,11 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, err = -EINVAL; goto out; } - if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) { + if (copy_from_sockptr(&msf, optval, IP_MSFILTER_SIZE(0))) { err = -EFAULT; goto out; } - err = ip_mc_msfget(sk, &msf, - (struct ip_msfilter __user *)optval, optlen); + err = ip_mc_msfget(sk, &msf, optval, optlen); goto out; } case MCAST_MSFILTER: @@ -1695,13 +1699,18 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, { struct msghdr msg; - release_sock(sk); + sockopt_release_sock(sk); if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; - msg.msg_control_is_user = true; - msg.msg_control_user = optval; + if (optval.is_kernel) { + msg.msg_control_is_user = false; + msg.msg_control = optval.kernel; + } else { + msg.msg_control_is_user = true; + msg.msg_control_user = optval.user; + } msg.msg_controllen = len; msg.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0; @@ -1722,7 +1731,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos); } len -= msg.msg_controllen; - return put_user(len, optlen); + return copy_to_sockptr(optlen, &len, sizeof(int)); } case IP_FREEBIND: val = inet->freebind; @@ -1734,29 +1743,29 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, val = inet->min_ttl; break; default: - release_sock(sk); + sockopt_release_sock(sk); return -ENOPROTOOPT; } - release_sock(sk); + sockopt_release_sock(sk); if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) { unsigned char ucval = (unsigned char)val; len = 1; - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &ucval, 1)) + if (copy_to_sockptr(optval, &ucval, 1)) return -EFAULT; } else { len = min_t(unsigned int, sizeof(int), len); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &val, len)) + if (copy_to_sockptr(optval, &val, len)) return -EFAULT; } return 0; out: - release_sock(sk); + sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return err; @@ -1767,7 +1776,8 @@ int ip_getsockopt(struct sock *sk, int level, { int err; - err = do_ip_getsockopt(sk, level, optname, optval, optlen); + err = do_ip_getsockopt(sk, level, optname, + USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); #if IS_ENABLED(CONFIG_BPFILTER_UMH) if (optname >= BPFILTER_IPT_SO_GET_INFO && diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 73651d17e51f..95eefbe2e142 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1546,7 +1546,8 @@ out: } /* Getsock opt support for the multicast routing system. 
*/ -int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen) +int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, + sockptr_t optlen) { int olr; int val; @@ -1577,14 +1578,14 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int return -ENOPROTOOPT; } - if (get_user(olr, optlen)) + if (copy_from_sockptr(&olr, optlen, sizeof(int))) return -EFAULT; olr = min_t(unsigned int, olr, sizeof(int)); if (olr < 0) return -EINVAL; - if (put_user(olr, optlen)) + if (copy_to_sockptr(optlen, &olr, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &val, olr)) + if (copy_to_sockptr(optval, &val, olr)) return -EFAULT; return 0; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 306b94dedc8d..52b8879e7d20 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3199,7 +3199,7 @@ EXPORT_SYMBOL(tcp_disconnect); static inline bool tcp_can_repair_sock(const struct sock *sk) { - return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && + return sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && (sk->sk_state != TCP_LISTEN); } @@ -3476,8 +3476,8 @@ int tcp_set_window_clamp(struct sock *sk, int val) /* * Socket option code for TCP. */ -static int do_tcp_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen) +int do_tcp_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -3499,11 +3499,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, return -EFAULT; name[val] = 0; - lock_sock(sk); - err = tcp_set_congestion_control(sk, name, true, - ns_capable(sock_net(sk)->user_ns, - CAP_NET_ADMIN)); - release_sock(sk); + sockopt_lock_sock(sk); + err = tcp_set_congestion_control(sk, name, !has_current_bpf_ctx(), + sockopt_ns_capable(sock_net(sk)->user_ns, + CAP_NET_ADMIN)); + sockopt_release_sock(sk); return err; } case TCP_ULP: { @@ -3519,9 +3519,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, return -EFAULT; name[val] = 0; - lock_sock(sk); + sockopt_lock_sock(sk); err = tcp_set_ulp(sk, name); - release_sock(sk); + sockopt_release_sock(sk); return err; } case TCP_FASTOPEN_KEY: { @@ -3554,7 +3554,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; - lock_sock(sk); + sockopt_lock_sock(sk); switch (optname) { case TCP_MAXSEG: @@ -3776,7 +3776,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, break; } - release_sock(sk); + sockopt_release_sock(sk); return err; } @@ -4040,15 +4040,15 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, return stats; } -static int do_tcp_getsockopt(struct sock *sk, int level, - int optname, char __user *optval, int __user *optlen) +int do_tcp_getsockopt(struct sock *sk, int level, + int optname, sockptr_t optval, sockptr_t optlen) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); int val, len; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; len = min_t(unsigned int, len, sizeof(int)); @@ -4098,15 +4098,15 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_INFO: { struct tcp_info info; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; tcp_get_info(sk, &info); len = 
min_t(unsigned int, len, sizeof(info)); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &info, len)) + if (copy_to_sockptr(optval, &info, len)) return -EFAULT; return 0; } @@ -4116,7 +4116,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, size_t sz = 0; int attr; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; ca_ops = icsk->icsk_ca_ops; @@ -4124,9 +4124,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, sz = ca_ops->get_info(sk, ~0U, &attr, &info); len = min_t(unsigned int, len, sz); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &info, len)) + if (copy_to_sockptr(optval, &info, len)) return -EFAULT; return 0; } @@ -4135,27 +4135,28 @@ static int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_CONGESTION: - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; len = min_t(unsigned int, len, TCP_CA_NAME_MAX); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) + if (copy_to_sockptr(optval, icsk->icsk_ca_ops->name, len)) return -EFAULT; return 0; case TCP_ULP: - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; len = min_t(unsigned int, len, TCP_ULP_NAME_MAX); if (!icsk->icsk_ulp_ops) { - if (put_user(0, optlen)) + len = 0; + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; return 0; } - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len)) + if (copy_to_sockptr(optval, icsk->icsk_ulp_ops->name, len)) return -EFAULT; return 0; @@ -4163,15 +4164,15 @@ static int do_tcp_getsockopt(struct sock *sk, int level, u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)]; unsigned int key_len; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; key_len = tcp_fastopen_get_cipher(net, icsk, key) * TCP_FASTOPEN_KEY_LENGTH; len = min_t(unsigned int, len, key_len); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, key, len)) + if (copy_to_sockptr(optval, key, len)) return -EFAULT; return 0; } @@ -4197,7 +4198,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_REPAIR_WINDOW: { struct tcp_repair_window opt; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len != sizeof(opt)) @@ -4212,7 +4213,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, opt.rcv_wnd = tp->rcv_wnd; opt.rcv_wup = tp->rcv_wup; - if (copy_to_user(optval, &opt, len)) + if (copy_to_sockptr(optval, &opt, len)) return -EFAULT; return 0; } @@ -4258,35 +4259,35 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = tp->save_syn; break; case TCP_SAVED_SYN: { - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; - lock_sock(sk); + sockopt_lock_sock(sk); if (tp->saved_syn) { if (len < tcp_saved_syn_len(tp->saved_syn)) { - if (put_user(tcp_saved_syn_len(tp->saved_syn), - optlen)) { - release_sock(sk); + len = tcp_saved_syn_len(tp->saved_syn); + if (copy_to_sockptr(optlen, &len, sizeof(int))) { + sockopt_release_sock(sk); return -EFAULT; } - release_sock(sk); + 
sockopt_release_sock(sk); return -EINVAL; } len = tcp_saved_syn_len(tp->saved_syn); - if (put_user(len, optlen)) { - release_sock(sk); + if (copy_to_sockptr(optlen, &len, sizeof(int))) { + sockopt_release_sock(sk); return -EFAULT; } - if (copy_to_user(optval, tp->saved_syn->data, len)) { - release_sock(sk); + if (copy_to_sockptr(optval, tp->saved_syn->data, len)) { + sockopt_release_sock(sk); return -EFAULT; } tcp_saved_syn_free(tp); - release_sock(sk); + sockopt_release_sock(sk); } else { - release_sock(sk); + sockopt_release_sock(sk); len = 0; - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; } return 0; @@ -4297,31 +4298,31 @@ static int do_tcp_getsockopt(struct sock *sk, int level, struct tcp_zerocopy_receive zc = {}; int err; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0 || len < offsetofend(struct tcp_zerocopy_receive, length)) return -EINVAL; if (unlikely(len > sizeof(zc))) { - err = check_zeroed_user(optval + sizeof(zc), - len - sizeof(zc)); + err = check_zeroed_sockptr(optval, sizeof(zc), + len - sizeof(zc)); if (err < 1) return err == 0 ? -EINVAL : err; len = sizeof(zc); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; } - if (copy_from_user(&zc, optval, len)) + if (copy_from_sockptr(&zc, optval, len)) return -EFAULT; if (zc.reserved) return -EINVAL; if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS)) return -EINVAL; - lock_sock(sk); + sockopt_lock_sock(sk); err = tcp_zerocopy_receive(sk, &zc, &tss); err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, &zc, &len, err); - release_sock(sk); + sockopt_release_sock(sk); if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) goto zerocopy_rcv_cmsg; switch (len) { @@ -4351,7 +4352,7 @@ zerocopy_rcv_sk_err: zerocopy_rcv_inq: zc.inq = tcp_inq_hint(sk); zerocopy_rcv_out: - if (!err && copy_to_user(optval, &zc, len)) + if (!err && copy_to_sockptr(optval, &zc, len)) err = -EFAULT; return err; } @@ -4360,9 +4361,9 @@ zerocopy_rcv_out: return -ENOPROTOOPT; } - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &val, len)) + if (copy_to_sockptr(optval, &val, len)) return -EFAULT; return 0; } @@ -4387,7 +4388,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, if (level != SOL_TCP) return icsk->icsk_af_ops->getsockopt(sk, level, optname, optval, optlen); - return do_tcp_getsockopt(sk, level, optname, optval, optlen); + return do_tcp_getsockopt(sk, level, optname, USER_SOCKPTR(optval), + USER_SOCKPTR(optlen)); } EXPORT_SYMBOL(tcp_getsockopt); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 2ce0c44d0081..19732b5dce23 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -1057,6 +1057,8 @@ static const struct ipv6_stub ipv6_stub_impl = { static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { .inet6_bind = __inet6_bind, .udp6_lib_lookup = __udp6_lib_lookup, + .ipv6_setsockopt = do_ipv6_setsockopt, + .ipv6_getsockopt = do_ipv6_getsockopt, }; static int __init inet6_init(void) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index a9ba41648e36..516e83b52f26 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1827,8 +1827,8 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, * Getsock opt support for the multicast routing system. 
*/ -int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, - int __user *optlen) +int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, + sockptr_t optlen) { int olr; int val; @@ -1859,16 +1859,16 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, return -ENOPROTOOPT; } - if (get_user(olr, optlen)) + if (copy_from_sockptr(&olr, optlen, sizeof(int))) return -EFAULT; olr = min_t(int, olr, sizeof(int)); if (olr < 0) return -EINVAL; - if (put_user(olr, optlen)) + if (copy_to_sockptr(optlen, &olr, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &val, olr)) + if (copy_to_sockptr(optval, &val, olr)) return -EFAULT; return 0; } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index e0dcc7a193df..2d2f4dd9e5df 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -327,7 +327,7 @@ static int ipv6_set_opt_hdr(struct sock *sk, int optname, sockptr_t optval, int err; /* hop-by-hop / destination options are privileged option */ - if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) + if (optname != IPV6_RTHDR && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; /* remove any sticky options header with a zero option @@ -391,8 +391,8 @@ sticky_done: return err; } -static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen) +int do_ipv6_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); @@ -417,7 +417,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (needs_rtnl) rtnl_lock(); - lock_sock(sk); + sockopt_lock_sock(sk); switch (optname) { @@ -634,8 +634,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, break; case IPV6_TRANSPARENT: - if (valbool && !ns_capable(net->user_ns, CAP_NET_RAW) && - !ns_capable(net->user_ns, CAP_NET_ADMIN)) { + if (valbool && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW) && + !sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) { retv = -EPERM; break; } @@ -946,7 +946,7 @@ done: case IPV6_IPSEC_POLICY: case IPV6_XFRM_POLICY: retv = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + if (!sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) break; retv = xfrm_user_policy(sk, optname, optval, optlen); break; @@ -994,14 +994,14 @@ done: break; } - release_sock(sk); + sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return retv; e_inval: - release_sock(sk); + sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return -EINVAL; @@ -1030,7 +1030,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, EXPORT_SYMBOL(ipv6_setsockopt); static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, - int optname, char __user *optval, int len) + int optname, sockptr_t optval, int len) { struct ipv6_opt_hdr *hdr; @@ -1058,56 +1058,53 @@ static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, return 0; len = min_t(unsigned int, len, ipv6_optlen(hdr)); - if (copy_to_user(optval, hdr, len)) + if (copy_to_sockptr(optval, hdr, len)) return -EFAULT; return len; } -static int ipv6_get_msfilter(struct sock *sk, void __user *optval, - int __user *optlen, int len) +static int ipv6_get_msfilter(struct sock *sk, sockptr_t optval, + sockptr_t optlen, int len) { const int size0 = offsetof(struct group_filter, gf_slist_flex); - struct group_filter __user *p = optval; struct 
group_filter gsf; int num; int err; if (len < size0) return -EINVAL; - if (copy_from_user(&gsf, p, size0)) + if (copy_from_sockptr(&gsf, optval, size0)) return -EFAULT; if (gsf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; num = gsf.gf_numsrc; - lock_sock(sk); - err = ip6_mc_msfget(sk, &gsf, p->gf_slist_flex); + sockopt_lock_sock(sk); + err = ip6_mc_msfget(sk, &gsf, optval, size0); if (!err) { if (num > gsf.gf_numsrc) num = gsf.gf_numsrc; - if (put_user(GROUP_FILTER_SIZE(num), optlen) || - copy_to_user(p, &gsf, size0)) + len = GROUP_FILTER_SIZE(num); + if (copy_to_sockptr(optlen, &len, sizeof(int)) || + copy_to_sockptr(optval, &gsf, size0)) err = -EFAULT; } - release_sock(sk); + sockopt_release_sock(sk); return err; } -static int compat_ipv6_get_msfilter(struct sock *sk, void __user *optval, - int __user *optlen) +static int compat_ipv6_get_msfilter(struct sock *sk, sockptr_t optval, + sockptr_t optlen, int len) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); - struct compat_group_filter __user *p = optval; struct compat_group_filter gf32; struct group_filter gf; - int len, err; + int err; int num; - if (get_user(len, optlen)) - return -EFAULT; if (len < size0) return -EINVAL; - if (copy_from_user(&gf32, p, size0)) + if (copy_from_sockptr(&gf32, optval, size0)) return -EFAULT; gf.gf_interface = gf32.gf_interface; gf.gf_fmode = gf32.gf_fmode; @@ -1117,23 +1114,25 @@ static int compat_ipv6_get_msfilter(struct sock *sk, void __user *optval, if (gf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; - lock_sock(sk); - err = ip6_mc_msfget(sk, &gf, p->gf_slist_flex); - release_sock(sk); + sockopt_lock_sock(sk); + err = ip6_mc_msfget(sk, &gf, optval, size0); + sockopt_release_sock(sk); if (err) return err; if (num > gf.gf_numsrc) num = gf.gf_numsrc; len = GROUP_FILTER_SIZE(num) - (sizeof(gf)-sizeof(gf32)); - if (put_user(len, optlen) || - put_user(gf.gf_fmode, &p->gf_fmode) || - put_user(gf.gf_numsrc, &p->gf_numsrc)) + if (copy_to_sockptr(optlen, &len, sizeof(int)) || + copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode), + &gf.gf_fmode, sizeof(gf32.gf_fmode)) || + copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc), + &gf.gf_numsrc, sizeof(gf32.gf_numsrc))) return -EFAULT; return 0; } -static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen, unsigned int flags) +int do_ipv6_getsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, sockptr_t optlen) { struct ipv6_pinfo *np = inet6_sk(sk); int len; @@ -1142,7 +1141,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, if (ip6_mroute_opt(optname)) return ip6_mroute_getsockopt(sk, optname, optval, optlen); - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; switch (optname) { case IPV6_ADDRFORM: @@ -1156,7 +1155,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case MCAST_MSFILTER: if (in_compat_syscall()) - return compat_ipv6_get_msfilter(sk, optval, optlen); + return compat_ipv6_get_msfilter(sk, optval, optlen, len); return ipv6_get_msfilter(sk, optval, optlen, len); case IPV6_2292PKTOPTIONS: { @@ -1166,16 +1165,21 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; - msg.msg_control_user = optval; + if (optval.is_kernel) { + msg.msg_control_is_user = false; + msg.msg_control = optval.kernel; + } else { + 
msg.msg_control_is_user = true; + msg.msg_control_user = optval.user; + } msg.msg_controllen = len; - msg.msg_flags = flags; - msg.msg_control_is_user = true; + msg.msg_flags = 0; - lock_sock(sk); + sockopt_lock_sock(sk); skb = np->pktoptions; if (skb) ip6_datagram_recv_ctl(sk, &msg, skb); - release_sock(sk); + sockopt_release_sock(sk); if (!skb) { if (np->rxopt.bits.rxinfo) { struct in6_pktinfo src_info; @@ -1212,7 +1216,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, } } len -= msg.msg_controllen; - return put_user(len, optlen); + return copy_to_sockptr(optlen, &len, sizeof(int)); } case IPV6_MTU: { @@ -1264,15 +1268,15 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, { struct ipv6_txoptions *opt; - lock_sock(sk); + sockopt_lock_sock(sk); opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len); - release_sock(sk); + sockopt_release_sock(sk); /* check if ipv6_getsockopt_sticky() returns err code */ if (len < 0) return len; - return put_user(len, optlen); + return copy_to_sockptr(optlen, &len, sizeof(int)); } case IPV6_RECVHOPOPTS: @@ -1326,9 +1330,9 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, if (!mtuinfo.ip6m_mtu) return -ENOTCONN; - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &mtuinfo, len)) + if (copy_to_sockptr(optval, &mtuinfo, len)) return -EFAULT; return 0; @@ -1405,7 +1409,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, if (len < sizeof(freq)) return -EINVAL; - if (copy_from_user(&freq, optval, sizeof(freq))) + if (copy_from_sockptr(&freq, optval, sizeof(freq))) return -EFAULT; if (freq.flr_action != IPV6_FL_A_GET) @@ -1420,9 +1424,9 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, if (val < 0) return val; - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &freq, len)) + if (copy_to_sockptr(optval, &freq, len)) return -EFAULT; return 0; @@ -1474,9 +1478,9 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, return -ENOPROTOOPT; } len = min_t(unsigned int, sizeof(int), len); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &val, len)) + if (copy_to_sockptr(optval, &val, len)) return -EFAULT; return 0; } @@ -1492,7 +1496,8 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname, if (level != SOL_IPV6) return -ENOPROTOOPT; - err = do_ipv6_getsockopt(sk, level, optname, optval, optlen, 0); + err = do_ipv6_getsockopt(sk, level, optname, + USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) { diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 87c699d57b36..0566ab03ddbe 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -580,7 +580,7 @@ done: } int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, - struct sockaddr_storage __user *p) + sockptr_t optval, size_t ss_offset) { struct ipv6_pinfo *inet6 = inet6_sk(sk); const struct in6_addr *group; @@ -612,8 +612,7 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, copycount = count < gsf->gf_numsrc ? 
count : gsf->gf_numsrc; gsf->gf_numsrc = count; - - for (i = 0; i < copycount; i++, p++) { + for (i = 0; i < copycount; i++) { struct sockaddr_in6 *psin6; struct sockaddr_storage ss; @@ -621,8 +620,9 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, memset(&ss, 0, sizeof(ss)); psin6->sin6_family = AF_INET6; psin6->sin6_addr = psl->sl_addr[i]; - if (copy_to_user(p, &ss, sizeof(ss))) + if (copy_to_sockptr_offset(optval, ss_offset, &ss, sizeof(ss))) return -EFAULT; + ss_offset += sizeof(ss); } return 0; } diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c index 8773f22b6a98..7342c5b2f278 100644 --- a/samples/bpf/map_perf_test_kern.c +++ b/samples/bpf/map_perf_test_kern.c @@ -108,11 +108,14 @@ int stress_hmap(struct pt_regs *ctx) u32 key = bpf_get_current_pid_tgid(); long init_val = 1; long *value; + int i; - bpf_map_update_elem(&hash_map, &key, &init_val, BPF_ANY); - value = bpf_map_lookup_elem(&hash_map, &key); - if (value) - bpf_map_delete_elem(&hash_map, &key); + for (i = 0; i < 10; i++) { + bpf_map_update_elem(&hash_map, &key, &init_val, BPF_ANY); + value = bpf_map_lookup_elem(&hash_map, &key); + if (value) + bpf_map_delete_elem(&hash_map, &key); + } return 0; } @@ -123,11 +126,14 @@ int stress_percpu_hmap(struct pt_regs *ctx) u32 key = bpf_get_current_pid_tgid(); long init_val = 1; long *value; + int i; - bpf_map_update_elem(&percpu_hash_map, &key, &init_val, BPF_ANY); - value = bpf_map_lookup_elem(&percpu_hash_map, &key); - if (value) - bpf_map_delete_elem(&percpu_hash_map, &key); + for (i = 0; i < 10; i++) { + bpf_map_update_elem(&percpu_hash_map, &key, &init_val, BPF_ANY); + value = bpf_map_lookup_elem(&percpu_hash_map, &key); + if (value) + bpf_map_delete_elem(&percpu_hash_map, &key); + } return 0; } @@ -137,11 +143,14 @@ int stress_hmap_alloc(struct pt_regs *ctx) u32 key = bpf_get_current_pid_tgid(); long init_val = 1; long *value; + int i; - bpf_map_update_elem(&hash_map_alloc, &key, &init_val, BPF_ANY); - value = bpf_map_lookup_elem(&hash_map_alloc, &key); - if (value) - bpf_map_delete_elem(&hash_map_alloc, &key); + for (i = 0; i < 10; i++) { + bpf_map_update_elem(&hash_map_alloc, &key, &init_val, BPF_ANY); + value = bpf_map_lookup_elem(&hash_map_alloc, &key); + if (value) + bpf_map_delete_elem(&hash_map_alloc, &key); + } return 0; } @@ -151,11 +160,14 @@ int stress_percpu_hmap_alloc(struct pt_regs *ctx) u32 key = bpf_get_current_pid_tgid(); long init_val = 1; long *value; + int i; - bpf_map_update_elem(&percpu_hash_map_alloc, &key, &init_val, BPF_ANY); - value = bpf_map_lookup_elem(&percpu_hash_map_alloc, &key); - if (value) - bpf_map_delete_elem(&percpu_hash_map_alloc, &key); + for (i = 0; i < 10; i++) { + bpf_map_update_elem(&percpu_hash_map_alloc, &key, &init_val, BPF_ANY); + value = bpf_map_lookup_elem(&percpu_hash_map_alloc, &key); + if (value) + bpf_map_delete_elem(&percpu_hash_map_alloc, &key); + } return 0; } diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index b6fc174ab1f2..1bb53f4b29e1 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -72,7 +72,7 @@ static int test_flags = ~0; static uint32_t num_map_entries; static uint32_t inner_lru_hash_size; static int lru_hash_lookup_test_entries = 32; -static uint32_t max_cnt = 1000000; +static uint32_t max_cnt = 10000; static int check_test_flags(enum test_type t) { diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index dfb260de17a8..d5c389df6045 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -10,6 
+10,9 @@ from __future__ import print_function import argparse import re import sys, os +import subprocess + +helpersDocStart = 'Start of BPF helper function descriptions:' class NoHelperFound(BaseException): pass @@ -47,6 +50,10 @@ class Helper(APIElement): @desc: textual description of the helper function @ret: description of the return value of the helper function """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.enum_val = None + def proto_break_down(self): """ Break down helper function protocol into smaller chunks: return type, @@ -89,6 +96,7 @@ class HeaderParser(object): self.commands = [] self.desc_unique_helpers = set() self.define_unique_helpers = [] + self.helper_enum_vals = {} self.desc_syscalls = [] self.enum_syscalls = [] @@ -233,7 +241,7 @@ class HeaderParser(object): self.enum_syscalls = re.findall('(BPF\w+)+', bpf_cmd_str) def parse_desc_helpers(self): - self.seek_to('* Start of BPF helper function descriptions:', + self.seek_to(helpersDocStart, 'Could not find start of eBPF helper descriptions list') while True: try: @@ -245,30 +253,54 @@ class HeaderParser(object): break def parse_define_helpers(self): - # Parse the number of FN(...) in #define __BPF_FUNC_MAPPER to compare - # later with the number of unique function names present in description. + # Parse FN(...) in #define __BPF_FUNC_MAPPER to compare later with the + # number of unique function names present in description and use the + # correct enumeration value. # Note: seek_to(..) discards the first line below the target search text, # resulting in FN(unspec) being skipped and not added to self.define_unique_helpers. self.seek_to('#define __BPF_FUNC_MAPPER(FN)', 'Could not find start of eBPF helper definition list') - # Searches for either one or more FN(\w+) defines or a backslash for newline - p = re.compile('\s*(FN\(\w+\))+|\\\\') + # Searches for one FN(\w+) define or a backslash for newline + p = re.compile('\s*FN\((\w+)\)|\\\\') fn_defines_str = '' + i = 1 # 'unspec' is skipped as mentioned above while True: capture = p.match(self.line) if capture: fn_defines_str += self.line + self.helper_enum_vals[capture.expand(r'bpf_\1')] = i + i += 1 else: break self.line = self.reader.readline() # Find the number of occurences of FN(\w+) self.define_unique_helpers = re.findall('FN\(\w+\)', fn_defines_str) + def assign_helper_values(self): + seen_helpers = set() + for helper in self.helpers: + proto = helper.proto_break_down() + name = proto['name'] + try: + enum_val = self.helper_enum_vals[name] + except KeyError: + raise Exception("Helper %s is missing from enum bpf_func_id" % name) + + # Enforce current practice of having the descriptions ordered + # by enum value. 
+ seen_helpers.add(name) + desc_val = len(seen_helpers) + if desc_val != enum_val: + raise Exception("Helper %s comment order (#%d) must be aligned with its position (#%d) in enum bpf_func_id" % (name, desc_val, enum_val)) + + helper.enum_val = enum_val + def run(self): self.parse_desc_syscall() self.parse_enum_syscall() self.parse_desc_helpers() self.parse_define_helpers() + self.assign_helper_values() self.reader.close() ############################################################################### @@ -357,6 +389,31 @@ class PrinterRST(Printer): print('') + def get_kernel_version(self): + try: + version = subprocess.run(['git', 'describe'], cwd=linuxRoot, + capture_output=True, check=True) + version = version.stdout.decode().rstrip() + except: + try: + version = subprocess.run(['make', 'kernelversion'], cwd=linuxRoot, + capture_output=True, check=True) + version = version.stdout.decode().rstrip() + except: + return 'Linux' + return 'Linux {version}'.format(version=version) + + def get_last_doc_update(self, delimiter): + try: + cmd = ['git', 'log', '-1', '--pretty=format:%cs', '--no-patch', + '-L', + '/{}/,/\*\//:include/uapi/linux/bpf.h'.format(delimiter)] + date = subprocess.run(cmd, cwd=linuxRoot, + capture_output=True, check=True) + return date.stdout.decode().rstrip() + except: + return '' + class PrinterHelpersRST(PrinterRST): """ A printer for dumping collected information about helpers as a ReStructured @@ -378,6 +435,8 @@ list of eBPF helper functions ------------------------------------------------------------------------------- :Manual section: 7 +:Version: {version} +{date_field}{date} DESCRIPTION =========== @@ -410,8 +469,13 @@ kernel at the top). HELPERS ======= ''' + kernelVersion = self.get_kernel_version() + lastUpdate = self.get_last_doc_update(helpersDocStart) + PrinterRST.print_license(self) - print(header) + print(header.format(version=kernelVersion, + date_field = ':Date: ' if lastUpdate else '', + date=lastUpdate)) def print_footer(self): footer = ''' @@ -761,7 +825,7 @@ class PrinterHelpers(Printer): comma = ', ' print(one_arg, end='') - print(') = (void *) %d;' % len(self.seen_helpers)) + print(') = (void *) %d;' % helper.enum_val) print('') ############################################################################### diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index 125798b0bc5d..19924b6ce796 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -452,7 +452,7 @@ static int btf_dumper_int(const struct btf_type *t, __u8 bit_offset, *(char *)data); break; case BTF_INT_BOOL: - jsonw_bool(jw, *(int *)data); + jsonw_bool(jw, *(bool *)data); break; default: /* shouldn't happen */ diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index 7a20931c3250..ef0dc2f8d5a2 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -83,6 +83,29 @@ static bool is_iter_map_target(const char *target_name) strcmp(target_name, "bpf_sk_storage_map") == 0; } +static bool is_iter_cgroup_target(const char *target_name) +{ + return strcmp(target_name, "cgroup") == 0; +} + +static const char *cgroup_order_string(__u32 order) +{ + switch (order) { + case BPF_CGROUP_ITER_ORDER_UNSPEC: + return "order_unspec"; + case BPF_CGROUP_ITER_SELF_ONLY: + return "self_only"; + case BPF_CGROUP_ITER_DESCENDANTS_PRE: + return "descendants_pre"; + case BPF_CGROUP_ITER_DESCENDANTS_POST: + return "descendants_post"; + case BPF_CGROUP_ITER_ANCESTORS_UP: + return "ancestors_up"; + default: /* won't happen */ + return 
"unknown"; + } +} + static void show_iter_json(struct bpf_link_info *info, json_writer_t *wtr) { const char *target_name = u64_to_ptr(info->iter.target_name); @@ -91,6 +114,12 @@ static void show_iter_json(struct bpf_link_info *info, json_writer_t *wtr) if (is_iter_map_target(target_name)) jsonw_uint_field(wtr, "map_id", info->iter.map.map_id); + + if (is_iter_cgroup_target(target_name)) { + jsonw_lluint_field(wtr, "cgroup_id", info->iter.cgroup.cgroup_id); + jsonw_string_field(wtr, "order", + cgroup_order_string(info->iter.cgroup.order)); + } } static int get_prog_info(int prog_id, struct bpf_prog_info *info) @@ -208,6 +237,12 @@ static void show_iter_plain(struct bpf_link_info *info) if (is_iter_map_target(target_name)) printf("map_id %u ", info->iter.map.map_id); + + if (is_iter_cgroup_target(target_name)) { + printf("cgroup_id %llu ", info->iter.cgroup.cgroup_id); + printf("order %s ", + cgroup_order_string(info->iter.cgroup.order)); + } } static int show_link_close_plain(int fd, struct bpf_link_info *info) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1d6085e15fc8..793103b10eab 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -87,10 +87,29 @@ struct bpf_cgroup_storage_key { __u32 attach_type; /* program attach type (enum bpf_attach_type) */ }; +enum bpf_cgroup_iter_order { + BPF_CGROUP_ITER_ORDER_UNSPEC = 0, + BPF_CGROUP_ITER_SELF_ONLY, /* process only a single object. */ + BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ + BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ +}; + union bpf_iter_link_info { struct { __u32 map_fd; } map; + struct { + enum bpf_cgroup_iter_order order; + + /* At most one of cgroup_fd and cgroup_id can be non-zero. If + * both are zero, the walk starts from the default cgroup v2 + * root. For walking v1 hierarchy, one should always explicitly + * specify cgroup_fd. + */ + __u32 cgroup_fd; + __u64 cgroup_id; + } cgroup; }; /* BPF syscall commands, see bpf(2) man-page for more details. */ @@ -4437,7 +4456,7 @@ union bpf_attr { * * **-EEXIST** if the option already exists. * - * **-EFAULT** on failrue to parse the existing header options. + * **-EFAULT** on failure to parse the existing header options. * * **-EPERM** if the helper cannot be used under the current * *skops*\ **->op**. @@ -4646,7 +4665,7 @@ union bpf_attr { * a *map* with *task* as the **key**. From this * perspective, the usage is not much different from * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this - * helper enforces the key must be an task_struct and the map must also + * helper enforces the key must be a task_struct and the map must also * be a **BPF_MAP_TYPE_TASK_STORAGE**. * * Underneath, the value is stored locally at *task* instead of @@ -4704,7 +4723,7 @@ union bpf_attr { * * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size) * Description - * Returns the stored IMA hash of the *inode* (if it's avaialable). + * Returns the stored IMA hash of the *inode* (if it's available). * If the hash is larger than *size*, then only *size* * bytes will be copied to *dst* * Return @@ -4728,12 +4747,12 @@ union bpf_attr { * * The argument *len_diff* can be used for querying with a planned * size change. This allows to check MTU prior to changing packet - * ctx. Providing an *len_diff* adjustment that is larger than the + * ctx. 
Providing a *len_diff* adjustment that is larger than the * actual packet size (resulting in negative packet size) will in - * principle not exceed the MTU, why it is not considered a - * failure. Other BPF-helpers are needed for performing the - * planned size change, why the responsability for catch a negative - * packet size belong in those helpers. + * principle not exceed the MTU, which is why it is not considered + * a failure. Other BPF helpers are needed for performing the + * planned size change; therefore the responsibility for catching + * a negative packet size belongs in those helpers. * * Specifying *ifindex* zero means the MTU check is performed * against the current net device. This is practical if this isn't @@ -5085,17 +5104,29 @@ union bpf_attr { * * int bpf_get_retval(void) * Description - * Get the syscall's return value that will be returned to userspace. + * Get the BPF program's return value that will be returned to the upper layers. * - * This helper is currently supported by cgroup programs only. + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. * Return - * The syscall's return value. + * The BPF program's return value. * * int bpf_set_retval(int retval) * Description - * Set the syscall's return value that will be returned to userspace. + * Set the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. + * + * Note that there is the following corner case where the program exports an error + * via bpf_set_retval but signals success via 'return 1': + * + * bpf_set_retval(-EPERM); + * return 1; + * + * In this case, the BPF program's return value will use helper's -EPERM. This + * still holds true for cgroup/bind{4,6} which supports extra 'return 3' success case. * - * This helper is currently supported by cgroup programs only. * Return * 0 on success, or a negative error in case of failure. * @@ -5628,6 +5659,11 @@ enum { BPF_F_SEQ_NUMBER = (1ULL << 3), }; +/* BPF_FUNC_skb_get_tunnel_key flags. */ +enum { + BPF_F_TUNINFO_FLAGS = (1ULL << 4), +}; + /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and * BPF_FUNC_perf_event_read_value flags. */ @@ -5817,7 +5853,10 @@ struct bpf_tunnel_key { }; __u8 tunnel_tos; __u8 tunnel_ttl; - __u16 tunnel_ext; /* Padding, future use. */ + union { + __u16 tunnel_ext; /* compat */ + __be16 tunnel_flags; + }; __u32 tunnel_label; union { __u32 local_ipv4; @@ -5861,6 +5900,11 @@ enum bpf_ret_code { * represented by BPF_REDIRECT above). */ BPF_LWT_REROUTE = 128, + /* BPF_FLOW_DISSECTOR_CONTINUE: used by BPF_PROG_TYPE_FLOW_DISSECTOR + * to indicate that no custom dissection was performed, and + * fallback to standard dissector is requested. + */ + BPF_FLOW_DISSECTOR_CONTINUE = 129, }; struct bpf_sock { @@ -6159,11 +6203,22 @@ struct bpf_link_info { struct { __aligned_u64 target_name; /* in/out: target_name buffer ptr */ __u32 target_name_len; /* in/out: target_name buffer len */ + + /* If the iter specific field is 32 bits, it can be put + * in the first or second union. Otherwise it should be + * put in the second union. 
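[Editorial note, not part of the diff] The bpf_iter_link_info.cgroup extension and the bpf_cgroup_iter_order values added in the uapi hunk above are filled in from user space when attaching a cgroup iterator; bpftool's link code earlier in this diff merely prints them back. The sketch below assumes libbpf and an already loaded SEC("iter/cgroup") program; the helper function name is made up for illustration.

	#include <bpf/libbpf.h>

	static struct bpf_link *attach_cgroup_iter(struct bpf_program *prog,
						   int cgroup_fd)
	{
		DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
		union bpf_iter_link_info linfo = {};

		/* Walk the subtree below cgroup_fd in pre-order; leaving both
		 * cgroup_fd and cgroup_id at zero would start the walk from
		 * the default cgroup v2 root, as the uapi comment says. */
		linfo.cgroup.cgroup_fd = cgroup_fd;
		linfo.cgroup.order = BPF_CGROUP_ITER_DESCENDANTS_PRE;

		opts.link_info = &linfo;
		opts.link_info_len = sizeof(linfo);

		return bpf_program__attach_iter(prog, &opts);
	}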
+ */ union { struct { __u32 map_id; } map; }; + union { + struct { + __u64 cgroup_id; + __u32 order; + } cgroup; + }; } iter; struct { __u32 netns_ino; diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 7349b16b8e2f..867b734839dd 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -131,7 +131,7 @@ /* * Helper function to perform a tail call with a constant/immediate map slot. */ -#if __clang_major__ >= 8 && defined(__bpf__) +#if (!defined(__clang__) || __clang_major__ >= 8) && defined(__bpf__) static __always_inline void bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) { @@ -139,8 +139,8 @@ bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) __bpf_unreachable(); /* - * Provide a hard guarantee that LLVM won't optimize setting r2 (map - * pointer) and r3 (constant map index) from _different paths_ ending + * Provide a hard guarantee that the compiler won't optimize setting r2 + * (map pointer) and r3 (constant map index) from _different paths_ ending * up at the _same_ call insn as otherwise we won't be able to use the * jmpq/nopl retpoline-free patching by the x86-64 JIT in the kernel * given they mismatch. See also d2e4c1e6c294 ("bpf: Constant map key @@ -148,12 +148,19 @@ bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) * * Note on clobber list: we need to stay in-line with BPF calling * convention, so even if we don't end up using r0, r4, r5, we need - * to mark them as clobber so that LLVM doesn't end up using them - * before / after the call. + * to mark them as clobber so that the compiler doesn't end up using + * them before / after the call. */ - asm volatile("r1 = %[ctx]\n\t" + asm volatile( +#ifdef __clang__ + "r1 = %[ctx]\n\t" "r2 = %[map]\n\t" "r3 = %[slot]\n\t" +#else + "mov %%r1,%[ctx]\n\t" + "mov %%r2,%[map]\n\t" + "mov %%r3,%[slot]\n\t" +#endif "call 12" :: [ctx]"r"(ctx), [map]"r"(map), [slot]"i"(slot) : "r0", "r1", "r2", "r3", "r4", "r5"); diff --git a/tools/lib/bpf/skel_internal.h b/tools/lib/bpf/skel_internal.h index 00c5f94b43be..1e82ab06c3eb 100644 --- a/tools/lib/bpf/skel_internal.h +++ b/tools/lib/bpf/skel_internal.h @@ -251,6 +251,29 @@ static inline int skel_map_update_elem(int fd, const void *key, return skel_sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, attr_sz); } +static inline int skel_map_delete_elem(int fd, const void *key) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = (long)key; + + return skel_sys_bpf(BPF_MAP_DELETE_ELEM, &attr, attr_sz); +} + +static inline int skel_map_get_fd_by_id(__u32 id) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.map_id = id; + + return skel_sys_bpf(BPF_MAP_GET_FD_BY_ID, &attr, attr_sz); +} + static inline int skel_raw_tracepoint_open(const char *name, int prog_fd) { const size_t attr_sz = offsetofend(union bpf_attr, raw_tracepoint.prog_fd); diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index 5cadfbdadf36..18fbb6eab1e2 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -66,3 +66,7 @@ select_reuseport # intermittently fails on new s390x set xdp_synproxy # JIT does not support calling kernel function (kfunc) unpriv_bpf_disabled # fentry lru_bug # prog 'printk': failed to auto-attach: -524 +setget_sockopt # attach unexpected error: -524 (trampoline) 
+cb_refs # expected error message unexpected error: -524 (trampoline) +cgroup_hierarchical_stats # JIT does not support calling kernel function (kfunc) +htab_update # failed to attach: ERROR: strerror_r(-524)=22 (trampoline) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 8d59ec7f4c2d..c10adecb5a73 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -45,7 +45,7 @@ ifneq ($(BPF_GCC),) TEST_GEN_PROGS += test_progs-bpf_gcc endif -TEST_GEN_FILES = test_lwt_ip_encap.o test_tc_edt.o +TEST_GEN_FILES = test_lwt_ip_encap.bpf.o test_tc_edt.bpf.o TEST_FILES = xsk_prereqs.sh $(wildcard progs/btf_dump_test_case_*.c) # Order correspond to 'make run_tests' order @@ -323,6 +323,7 @@ $(OUTPUT)/test_l4lb_noinline.o: BPF_CFLAGS += -fno-inline $(OUTPUT)/test_xdp_noinline.o: BPF_CFLAGS += -fno-inline $(OUTPUT)/flow_dissector_load.o: flow_dissector_load.h +$(OUTPUT)/cgroup_getset_retval_hooks.o: cgroup_getset_retval_hooks.h # Build BPF object using Clang # $1 - input .c file @@ -357,17 +358,17 @@ LSKELS := kfunc_call_test.c fentry_test.c fexit_test.c fexit_sleep.c \ LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test_subprog.c SKEL_BLACKLIST += $$(LSKELS) -test_static_linked.skel.h-deps := test_static_linked1.o test_static_linked2.o -linked_funcs.skel.h-deps := linked_funcs1.o linked_funcs2.o -linked_vars.skel.h-deps := linked_vars1.o linked_vars2.o -linked_maps.skel.h-deps := linked_maps1.o linked_maps2.o +test_static_linked.skel.h-deps := test_static_linked1.bpf.o test_static_linked2.bpf.o +linked_funcs.skel.h-deps := linked_funcs1.bpf.o linked_funcs2.bpf.o +linked_vars.skel.h-deps := linked_vars1.bpf.o linked_vars2.bpf.o +linked_maps.skel.h-deps := linked_maps1.bpf.o linked_maps2.bpf.o # In the subskeleton case, we want the test_subskeleton_lib.subskel.h file # but that's created as a side-effect of the skel.h generation. -test_subskeleton.skel.h-deps := test_subskeleton_lib2.o test_subskeleton_lib.o test_subskeleton.o -test_subskeleton_lib.skel.h-deps := test_subskeleton_lib2.o test_subskeleton_lib.o -test_usdt.skel.h-deps := test_usdt.o test_usdt_multispec.o +test_subskeleton.skel.h-deps := test_subskeleton_lib2.bpf.o test_subskeleton_lib.bpf.o test_subskeleton.bpf.o +test_subskeleton_lib.skel.h-deps := test_subskeleton_lib2.bpf.o test_subskeleton_lib.bpf.o +test_usdt.skel.h-deps := test_usdt.bpf.o test_usdt_multispec.bpf.o -LINKED_BPF_SRCS := $(patsubst %.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps))) +LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps))) # Set up extra TRUNNER_XXX "temporary" variables in the environment (relies on # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. 
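[Editorial aside between hunks, not part of the diff] The net/core/sock.c, net/ipv4 and net/ipv6 hunks earlier in this series convert the option handlers to sockptr_t and to the sockopt_lock_sock()/sockopt_capable() wrappers so that kernel-side callers, in particular BPF programs that already hold the socket lock and have passed the verifier/attach checks, can reuse them with kernel memory. A minimal sketch of such a caller follows; the function name is illustrative and the snippet assumes the usual net/sock.h and linux/sockptr.h context.

	static int example_set_mark_in_kernel(struct sock *sk, u32 mark)
	{
		/* KERNEL_SOCKPTR() wraps kernel memory, so sk_setsockopt()
		 * needs neither a struct socket nor a __user pointer.
		 * Inside it, sockopt_lock_sock() skips lock_sock() and
		 * sockopt_capable() passes when current->bpf_ctx is set,
		 * because bpf has already acquired the socket lock and
		 * applied its own permission model. */
		return sk_setsockopt(sk, SOL_SOCKET, SO_MARK,
				     KERNEL_SOCKPTR(&mark), sizeof(mark));
	}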
@@ -385,7 +386,7 @@ TRUNNER_EXTRA_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, \ TRUNNER_EXTRA_HDRS := $$(filter %.h,$(TRUNNER_EXTRA_SOURCES)) TRUNNER_TESTS_HDR := $(TRUNNER_TESTS_DIR)/tests.h TRUNNER_BPF_SRCS := $$(notdir $$(wildcard $(TRUNNER_BPF_PROGS_DIR)/*.c)) -TRUNNER_BPF_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, $$(TRUNNER_BPF_SRCS)) +TRUNNER_BPF_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.bpf.o, $$(TRUNNER_BPF_SRCS)) TRUNNER_BPF_SKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.skel.h, \ $$(filter-out $(SKEL_BLACKLIST) $(LINKED_BPF_SRCS),\ $$(TRUNNER_BPF_SRCS))) @@ -415,7 +416,7 @@ endif # input/output directory combination ifeq ($($(TRUNNER_BPF_PROGS_DIR)$(if $2,-)$2-bpfobjs),) $(TRUNNER_BPF_PROGS_DIR)$(if $2,-)$2-bpfobjs := y -$(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \ +$(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.bpf.o: \ $(TRUNNER_BPF_PROGS_DIR)/%.c \ $(TRUNNER_BPF_PROGS_DIR)/*.h \ $$(INCLUDE_DIR)/vmlinux.h \ @@ -425,25 +426,25 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS)) -$(TRUNNER_BPF_SKELS): %.skel.h: %.o $(BPFTOOL) | $(TRUNNER_OUTPUT) +$(TRUNNER_BPF_SKELS): %.skel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked1.o) $$< $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked2.o) $$(<:.o=.linked1.o) $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked3.o) $$(<:.o=.linked2.o) $(Q)diff $$(<:.o=.linked2.o) $$(<:.o=.linked3.o) - $(Q)$$(BPFTOOL) gen skeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.o=)) > $$@ - $(Q)$$(BPFTOOL) gen subskeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.o=)) > $$(@:.skel.h=.subskel.h) + $(Q)$$(BPFTOOL) gen skeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.bpf.o=)) > $$@ + $(Q)$$(BPFTOOL) gen subskeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.bpf.o=)) > $$(@:.skel.h=.subskel.h) -$(TRUNNER_BPF_LSKELS): %.lskel.h: %.o $(BPFTOOL) | $(TRUNNER_OUTPUT) +$(TRUNNER_BPF_LSKELS): %.lskel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked1.o) $$< $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked2.o) $$(<:.o=.llinked1.o) $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked3.o) $$(<:.o=.llinked2.o) $(Q)diff $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) - $(Q)$$(BPFTOOL) gen skeleton -L $$(<:.o=.llinked3.o) name $$(notdir $$(<:.o=_lskel)) > $$@ + $(Q)$$(BPFTOOL) gen skeleton -L $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@ $(TRUNNER_BPF_SKELS_LINKED): $(TRUNNER_BPF_OBJS) $(BPFTOOL) | $(TRUNNER_OUTPUT) - $$(call msg,LINK-BPF,$(TRUNNER_BINARY),$$(@:.skel.h=.o)) + $$(call msg,LINK-BPF,$(TRUNNER_BINARY),$$(@:.skel.h=.bpf.o)) $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked1.o) $$(addprefix $(TRUNNER_OUTPUT)/,$$($$(@F)-deps)) $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked1.o) $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked3.o) $$(@:.skel.h=.linked2.o) @@ -499,7 +500,7 @@ $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ | $(TRUNNER_BINARY)-extras $$(call msg,BINARY,,$$@) $(Q)$$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) -o $$@ - $(Q)$(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.o $$@ + $(Q)$(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@ $(Q)ln -sf $(if $2,..,.)/tools/build/bpftool/bootstrap/bpftool $(if $2,$2/)bpftool endef diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index eb1b7541f39d..d3c6b3da0bb1 100644 --- a/tools/testing/selftests/bpf/README.rst +++ 
b/tools/testing/selftests/bpf/README.rst @@ -126,11 +126,11 @@ available in 10.0.1. The patch is available in llvm 11.0.0 trunk. __ https://reviews.llvm.org/D78466 -bpf_verif_scale/loop6.o test failure with Clang 12 -================================================== +bpf_verif_scale/loop6.bpf.o test failure with Clang 12 +====================================================== With Clang 12, the following bpf_verif_scale test failed: - * ``bpf_verif_scale/loop6.o`` + * ``bpf_verif_scale/loop6.bpf.o`` The verifier output looks like @@ -245,7 +245,7 @@ See `kernel llvm reloc`_ for more explanation and some examples. Using clang 13 to compile old libbpf which has static linker support, there will be a compilation failure:: - libbpf: ELF relo #0 in section #6 has unexpected type 2 in .../bpf_tcp_nogpl.o + libbpf: ELF relo #0 in section #6 has unexpected type 2 in .../bpf_tcp_nogpl.bpf.o Here, ``type 2`` refers to new relocation type ``R_BPF_64_ABS64``. To fix this issue, user newer libbpf. diff --git a/tools/testing/selftests/bpf/cgroup_getset_retval_hooks.h b/tools/testing/selftests/bpf/cgroup_getset_retval_hooks.h new file mode 100644 index 000000000000..a525d3544fd7 --- /dev/null +++ b/tools/testing/selftests/bpf/cgroup_getset_retval_hooks.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +BPF_RETVAL_HOOK(ingress, "cgroup_skb/ingress", __sk_buff, -EINVAL) +BPF_RETVAL_HOOK(egress, "cgroup_skb/egress", __sk_buff, -EINVAL) +BPF_RETVAL_HOOK(sock_create, "cgroup/sock_create", bpf_sock, 0) +BPF_RETVAL_HOOK(sock_ops, "sockops", bpf_sock_ops, -EINVAL) +BPF_RETVAL_HOOK(dev, "cgroup/dev", bpf_cgroup_dev_ctx, 0) +BPF_RETVAL_HOOK(bind4, "cgroup/bind4", bpf_sock_addr, 0) +BPF_RETVAL_HOOK(bind6, "cgroup/bind6", bpf_sock_addr, 0) +BPF_RETVAL_HOOK(connect4, "cgroup/connect4", bpf_sock_addr, 0) +BPF_RETVAL_HOOK(connect6, "cgroup/connect6", bpf_sock_addr, 0) +BPF_RETVAL_HOOK(post_bind4, "cgroup/post_bind4", bpf_sock_addr, 0) +BPF_RETVAL_HOOK(post_bind6, "cgroup/post_bind6", bpf_sock_addr, 0) +BPF_RETVAL_HOOK(sendmsg4, "cgroup/sendmsg4", bpf_sock_addr, 0) +BPF_RETVAL_HOOK(sendmsg6, "cgroup/sendmsg6", bpf_sock_addr, 0) +BPF_RETVAL_HOOK(sysctl, "cgroup/sysctl", bpf_sysctl, 0) +BPF_RETVAL_HOOK(recvmsg4, "cgroup/recvmsg4", bpf_sock_addr, -EINVAL) +BPF_RETVAL_HOOK(recvmsg6, "cgroup/recvmsg6", bpf_sock_addr, -EINVAL) +BPF_RETVAL_HOOK(getsockopt, "cgroup/getsockopt", bpf_sockopt, 0) +BPF_RETVAL_HOOK(setsockopt, "cgroup/setsockopt", bpf_sockopt, 0) +BPF_RETVAL_HOOK(getpeername4, "cgroup/getpeername4", bpf_sock_addr, -EINVAL) +BPF_RETVAL_HOOK(getpeername6, "cgroup/getpeername6", bpf_sock_addr, -EINVAL) +BPF_RETVAL_HOOK(getsockname4, "cgroup/getsockname4", bpf_sock_addr, -EINVAL) +BPF_RETVAL_HOOK(getsockname6, "cgroup/getsockname6", bpf_sock_addr, -EINVAL) +BPF_RETVAL_HOOK(sock_release, "cgroup/sock_release", bpf_sock, 0) diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c index 9d59c3990ca8..e914cc45b766 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.c +++ b/tools/testing/selftests/bpf/cgroup_helpers.c @@ -33,49 +33,52 @@ #define CGROUP_MOUNT_DFLT "/sys/fs/cgroup" #define NETCLS_MOUNT_PATH CGROUP_MOUNT_DFLT "/net_cls" #define CGROUP_WORK_DIR "/cgroup-test-work-dir" -#define format_cgroup_path(buf, path) \ + +#define format_cgroup_path_pid(buf, path, pid) \ snprintf(buf, sizeof(buf), "%s%s%d%s", CGROUP_MOUNT_PATH, \ - CGROUP_WORK_DIR, getpid(), path) + CGROUP_WORK_DIR, pid, path) + +#define format_cgroup_path(buf, path) \ + 
format_cgroup_path_pid(buf, path, getpid()) + +#define format_parent_cgroup_path(buf, path) \ + format_cgroup_path_pid(buf, path, getppid()) #define format_classid_path(buf) \ snprintf(buf, sizeof(buf), "%s%s", NETCLS_MOUNT_PATH, \ CGROUP_WORK_DIR) -/** - * enable_all_controllers() - Enable all available cgroup v2 controllers - * - * Enable all available cgroup v2 controllers in order to increase - * the code coverage. - * - * If successful, 0 is returned. - */ -static int enable_all_controllers(char *cgroup_path) +static int __enable_controllers(const char *cgroup_path, const char *controllers) { char path[PATH_MAX + 1]; - char buf[PATH_MAX]; + char enable[PATH_MAX + 1]; char *c, *c2; int fd, cfd; ssize_t len; - snprintf(path, sizeof(path), "%s/cgroup.controllers", cgroup_path); - fd = open(path, O_RDONLY); - if (fd < 0) { - log_err("Opening cgroup.controllers: %s", path); - return 1; - } - - len = read(fd, buf, sizeof(buf) - 1); - if (len < 0) { + /* If not controllers are passed, enable all available controllers */ + if (!controllers) { + snprintf(path, sizeof(path), "%s/cgroup.controllers", + cgroup_path); + fd = open(path, O_RDONLY); + if (fd < 0) { + log_err("Opening cgroup.controllers: %s", path); + return 1; + } + len = read(fd, enable, sizeof(enable) - 1); + if (len < 0) { + close(fd); + log_err("Reading cgroup.controllers: %s", path); + return 1; + } else if (len == 0) { /* No controllers to enable */ + close(fd); + return 0; + } + enable[len] = 0; close(fd); - log_err("Reading cgroup.controllers: %s", path); - return 1; + } else { + strncpy(enable, controllers, sizeof(enable)); } - buf[len] = 0; - close(fd); - - /* No controllers available? We're probably on cgroup v1. */ - if (len == 0) - return 0; snprintf(path, sizeof(path), "%s/cgroup.subtree_control", cgroup_path); cfd = open(path, O_RDWR); @@ -84,7 +87,7 @@ static int enable_all_controllers(char *cgroup_path) return 1; } - for (c = strtok_r(buf, " ", &c2); c; c = strtok_r(NULL, " ", &c2)) { + for (c = strtok_r(enable, " ", &c2); c; c = strtok_r(NULL, " ", &c2)) { if (dprintf(cfd, "+%s\n", c) <= 0) { log_err("Enabling controller %s: %s", c, path); close(cfd); @@ -96,6 +99,87 @@ static int enable_all_controllers(char *cgroup_path) } /** + * enable_controllers() - Enable cgroup v2 controllers + * @relative_path: The cgroup path, relative to the workdir + * @controllers: List of controllers to enable in cgroup.controllers format + * + * + * Enable given cgroup v2 controllers, if @controllers is NULL, enable all + * available controllers. + * + * If successful, 0 is returned. + */ +int enable_controllers(const char *relative_path, const char *controllers) +{ + char cgroup_path[PATH_MAX + 1]; + + format_cgroup_path(cgroup_path, relative_path); + return __enable_controllers(cgroup_path, controllers); +} + +static int __write_cgroup_file(const char *cgroup_path, const char *file, + const char *buf) +{ + char file_path[PATH_MAX + 1]; + int fd; + + snprintf(file_path, sizeof(file_path), "%s/%s", cgroup_path, file); + fd = open(file_path, O_RDWR); + if (fd < 0) { + log_err("Opening %s", file_path); + return 1; + } + + if (dprintf(fd, "%s", buf) <= 0) { + log_err("Writing to %s", file_path); + close(fd); + return 1; + } + close(fd); + return 0; +} + +/** + * write_cgroup_file() - Write to a cgroup file + * @relative_path: The cgroup path, relative to the workdir + * @file: The name of the file in cgroupfs to write to + * @buf: Buffer to write to the file + * + * Write to a file in the given cgroup's directory. 
+ * + * If successful, 0 is returned. + */ +int write_cgroup_file(const char *relative_path, const char *file, + const char *buf) +{ + char cgroup_path[PATH_MAX - 24]; + + format_cgroup_path(cgroup_path, relative_path); + return __write_cgroup_file(cgroup_path, file, buf); +} + +/** + * write_cgroup_file_parent() - Write to a cgroup file in the parent process + * workdir + * @relative_path: The cgroup path, relative to the parent process workdir + * @file: The name of the file in cgroupfs to write to + * @buf: Buffer to write to the file + * + * Write to a file in the given cgroup's directory under the parent process + * workdir. + * + * If successful, 0 is returned. + */ +int write_cgroup_file_parent(const char *relative_path, const char *file, + const char *buf) +{ + char cgroup_path[PATH_MAX - 24]; + + format_parent_cgroup_path(cgroup_path, relative_path); + return __write_cgroup_file(cgroup_path, file, buf); +} + +/** * setup_cgroup_environment() - Setup the cgroup environment * * After calling this function, cleanup_cgroup_environment should be called @@ -133,7 +217,9 @@ int setup_cgroup_environment(void) return 1; } - if (enable_all_controllers(cgroup_workdir)) + /* Enable all available controllers to increase test coverage */ + if (__enable_controllers(CGROUP_MOUNT_PATH, NULL) || + __enable_controllers(cgroup_workdir, NULL)) return 1; return 0; @@ -173,7 +259,7 @@ static int join_cgroup_from_top(const char *cgroup_path) /** * join_cgroup() - Join a cgroup - * @path: The cgroup path, relative to the workdir, to join + * @relative_path: The cgroup path, relative to the workdir, to join * * This function expects a cgroup to already be created, relative to the cgroup * work dir, and it joins it. For example, passing "/my-cgroup" as the path @@ -182,11 +268,27 @@ static int join_cgroup_from_top(const char *cgroup_path) * * On success, it returns 0, otherwise on failure it returns 1. */ -int join_cgroup(const char *path) +int join_cgroup(const char *relative_path) +{ + char cgroup_path[PATH_MAX + 1]; + + format_cgroup_path(cgroup_path, relative_path); + return join_cgroup_from_top(cgroup_path); +} + +/** + * join_parent_cgroup() - Join a cgroup in the parent process workdir + * @relative_path: The cgroup path, relative to parent process workdir, to join + * + * See join_cgroup(). + * + * On success, it returns 0, otherwise on failure it returns 1. + */ +int join_parent_cgroup(const char *relative_path) { char cgroup_path[PATH_MAX + 1]; - format_cgroup_path(cgroup_path, path); + format_parent_cgroup_path(cgroup_path, relative_path); return join_cgroup_from_top(cgroup_path); } @@ -213,8 +315,26 @@ void cleanup_cgroup_environment(void) } /** + * get_root_cgroup() - Get the FD of the root cgroup + * + * On success, it returns the file descriptor. On failure, it returns -1. + * If there is a failure, it prints the error to stderr. + */ +int get_root_cgroup(void) +{ + int fd; + + fd = open(CGROUP_MOUNT_PATH, O_RDONLY); + if (fd < 0) { + log_err("Opening root cgroup"); + return -1; + } + return fd; +} + +/** * create_and_get_cgroup() - Create a cgroup, relative to workdir, and get the FD - * @path: The cgroup path, relative to the workdir, to join + * @relative_path: The cgroup path, relative to the workdir, to join * * This function creates a cgroup under the top level workdir and returns the * file descriptor. It is idempotent. @@ -222,14 +342,14 @@ void cleanup_cgroup_environment(void) * On success, it returns the file descriptor. On failure it returns -1. 
* If there is a failure, it prints the error to stderr. */ -int create_and_get_cgroup(const char *path) +int create_and_get_cgroup(const char *relative_path) { char cgroup_path[PATH_MAX + 1]; int fd; - format_cgroup_path(cgroup_path, path); + format_cgroup_path(cgroup_path, relative_path); if (mkdir(cgroup_path, 0777) && errno != EEXIST) { - log_err("mkdiring cgroup %s .. %s", path, cgroup_path); + log_err("mkdiring cgroup %s .. %s", relative_path, cgroup_path); return -1; } @@ -244,13 +364,13 @@ int create_and_get_cgroup(const char *path) /** * get_cgroup_id() - Get cgroup id for a particular cgroup path - * @path: The cgroup path, relative to the workdir, to join + * @relative_path: The cgroup path, relative to the workdir, to join * * On success, it returns the cgroup id. On failure it returns 0, * which is an invalid cgroup id. * If there is a failure, it prints the error to stderr. */ -unsigned long long get_cgroup_id(const char *path) +unsigned long long get_cgroup_id(const char *relative_path) { int dirfd, err, flags, mount_id, fhsize; union { @@ -261,7 +381,7 @@ unsigned long long get_cgroup_id(const char *path) struct file_handle *fhp, *fhp2; unsigned long long ret = 0; - format_cgroup_path(cgroup_workdir, path); + format_cgroup_path(cgroup_workdir, relative_path); dirfd = AT_FDCWD; flags = 0; diff --git a/tools/testing/selftests/bpf/cgroup_helpers.h b/tools/testing/selftests/bpf/cgroup_helpers.h index fcc9cb91b211..3358734356ab 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.h +++ b/tools/testing/selftests/bpf/cgroup_helpers.h @@ -10,11 +10,18 @@ __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__) /* cgroupv2 related */ -int cgroup_setup_and_join(const char *path); -int create_and_get_cgroup(const char *path); -unsigned long long get_cgroup_id(const char *path); - -int join_cgroup(const char *path); +int enable_controllers(const char *relative_path, const char *controllers); +int write_cgroup_file(const char *relative_path, const char *file, + const char *buf); +int write_cgroup_file_parent(const char *relative_path, const char *file, + const char *buf); +int cgroup_setup_and_join(const char *relative_path); +int get_root_cgroup(void); +int create_and_get_cgroup(const char *relative_path); +unsigned long long get_cgroup_id(const char *relative_path); + +int join_cgroup(const char *relative_path); +int join_parent_cgroup(const char *relative_path); int setup_cgroup_environment(void); void cleanup_cgroup_environment(void); @@ -26,4 +33,4 @@ int join_classid(void); int setup_classid_environment(void); void cleanup_classid_environment(void); -#endif /* __CGROUP_HELPERS_H */
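Taken together, the cgroup_helpers changes above give tests a small path-relative API: enable_controllers(), write_cgroup_file() and write_cgroup_file_parent(), get_root_cgroup(), join_cgroup() and join_parent_cgroup(), all resolved against the per-pid work directory. The sketch below shows one plausible way to combine them; the "/example" cgroup name and the memory.high value are invented for illustration, and error handling is reduced to early returns:

    #include <unistd.h>
    #include "cgroup_helpers.h"

    static int example_cgroup_setup(void)
    {
            int cg_fd;

            if (setup_cgroup_environment())
                    return 1;
            /* Make the memory controller available to children of the workdir */
            if (enable_controllers("/", "memory"))
                    return 1;
            cg_fd = create_and_get_cgroup("/example");      /* hypothetical cgroup */
            if (cg_fd < 0)
                    return 1;
            /* Write a permissive memory.high limit into the new cgroup */
            if (write_cgroup_file("/example", "memory.high", "max"))
                    return 1;
            if (join_cgroup("/example"))
                    return 1;
            close(cg_fd);
            cleanup_cgroup_environment();
            return 0;
    }

cleanup_cgroup_environment() moves the calling process back to the cgroup root before removing the work directory, so the sketch does not need to leave "/example" explicitly.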
\ No newline at end of file +#endif /* __CGROUP_HELPERS_H */ diff --git a/tools/testing/selftests/bpf/get_cgroup_id_user.c b/tools/testing/selftests/bpf/get_cgroup_id_user.c index e021cc67dc02..156743cf5870 100644 --- a/tools/testing/selftests/bpf/get_cgroup_id_user.c +++ b/tools/testing/selftests/bpf/get_cgroup_id_user.c @@ -48,7 +48,7 @@ static int bpf_find_map(const char *test, struct bpf_object *obj, int main(int argc, char **argv) { const char *probe_name = "syscalls/sys_enter_nanosleep"; - const char *file = "get_cgroup_id_kern.o"; + const char *file = "get_cgroup_id_kern.bpf.o"; int err, bytes, efd, prog_fd, pmu_fd; int cgroup_fd, cgidmap_fd, pidmap_fd; struct perf_event_attr attr = {}; diff --git a/tools/testing/selftests/bpf/map_tests/task_storage_map.c b/tools/testing/selftests/bpf/map_tests/task_storage_map.c new file mode 100644 index 000000000000..1adc9c292eb2 --- /dev/null +++ b/tools/testing/selftests/bpf/map_tests/task_storage_map.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2022. Huawei Technologies Co., Ltd */ +#define _GNU_SOURCE +#include <sched.h> +#include <unistd.h> +#include <stdlib.h> +#include <stdbool.h> +#include <errno.h> +#include <string.h> +#include <pthread.h> + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#include "test_maps.h" +#include "task_local_storage_helpers.h" +#include "read_bpf_task_storage_busy.skel.h" + +struct lookup_ctx { + bool start; + bool stop; + int pid_fd; + int map_fd; + int loop; +}; + +static void *lookup_fn(void *arg) +{ + struct lookup_ctx *ctx = arg; + long value; + int i = 0; + + while (!ctx->start) + usleep(1); + + while (!ctx->stop && i++ < ctx->loop) + bpf_map_lookup_elem(ctx->map_fd, &ctx->pid_fd, &value); + return NULL; +} + +static void abort_lookup(struct lookup_ctx *ctx, pthread_t *tids, unsigned int nr) +{ + unsigned int i; + + ctx->stop = true; + ctx->start = true; + for (i = 0; i < nr; i++) + pthread_join(tids[i], NULL); +} + +void test_task_storage_map_stress_lookup(void) +{ +#define MAX_NR_THREAD 4096 + unsigned int i, nr = 256, loop = 8192, cpu = 0; + struct read_bpf_task_storage_busy *skel; + pthread_t tids[MAX_NR_THREAD]; + struct lookup_ctx ctx; + cpu_set_t old, new; + const char *cfg; + int err; + + cfg = getenv("TASK_STORAGE_MAP_NR_THREAD"); + if (cfg) { + nr = atoi(cfg); + if (nr > MAX_NR_THREAD) + nr = MAX_NR_THREAD; + } + cfg = getenv("TASK_STORAGE_MAP_NR_LOOP"); + if (cfg) + loop = atoi(cfg); + cfg = getenv("TASK_STORAGE_MAP_PIN_CPU"); + if (cfg) + cpu = atoi(cfg); + + skel = read_bpf_task_storage_busy__open_and_load(); + err = libbpf_get_error(skel); + CHECK(err, "open_and_load", "error %d\n", err); + + /* Only for a fully preemptible kernel */ + if (!skel->kconfig->CONFIG_PREEMPT) + return; + + /* Save the old affinity setting */ + sched_getaffinity(getpid(), sizeof(old), &old); + + /* Pinned on a specific CPU */ + CPU_ZERO(&new); + CPU_SET(cpu, &new); + sched_setaffinity(getpid(), sizeof(new), &new); + + ctx.start = false; + ctx.stop = false; + ctx.pid_fd = sys_pidfd_open(getpid(), 0); + ctx.map_fd = bpf_map__fd(skel->maps.task); + ctx.loop = loop; + for (i = 0; i < nr; i++) { + err = pthread_create(&tids[i], NULL, lookup_fn, &ctx); + if (err) { + abort_lookup(&ctx, tids, i); + CHECK(err, "pthread_create", "error %d\n", err); + goto out; + } + } + + ctx.start = true; + for (i = 0; i < nr; i++) + pthread_join(tids[i], NULL); + + skel->bss->pid = getpid(); + err = read_bpf_task_storage_busy__attach(skel); + CHECK(err, "attach", "error %d\n", err); + + /* Trigger program 
*/ + syscall(SYS_gettid); + skel->bss->pid = 0; + + CHECK(skel->bss->busy != 0, "bad bpf_task_storage_busy", "got %d\n", skel->bss->busy); +out: + read_bpf_task_storage_busy__destroy(skel); + /* Restore affinity setting */ + sched_setaffinity(getpid(), sizeof(old), &old); +} diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c index dbe56fa8582d..e1c1e521cca2 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c @@ -7,7 +7,7 @@ void serial_test_bpf_obj_id(void) { const __u64 array_magic_value = 0xfaceb00c; const __u32 array_key = 0; - const char *file = "./test_obj_id.o"; + const char *file = "./test_obj_id.bpf.o"; const char *expected_prog_name = "test_obj_id"; const char *expected_map_name = "test_map_id"; const __u64 nsec_per_sec = 1000000000; diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c index ff6cce9fef06..5ca252823294 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c @@ -75,45 +75,45 @@ static void scale_test(const char *file, void test_verif_scale1() { - scale_test("test_verif_scale1.o", BPF_PROG_TYPE_SCHED_CLS, false); + scale_test("test_verif_scale1.bpf.o", BPF_PROG_TYPE_SCHED_CLS, false); } void test_verif_scale2() { - scale_test("test_verif_scale2.o", BPF_PROG_TYPE_SCHED_CLS, false); + scale_test("test_verif_scale2.bpf.o", BPF_PROG_TYPE_SCHED_CLS, false); } void test_verif_scale3() { - scale_test("test_verif_scale3.o", BPF_PROG_TYPE_SCHED_CLS, false); + scale_test("test_verif_scale3.bpf.o", BPF_PROG_TYPE_SCHED_CLS, false); } void test_verif_scale_pyperf_global() { - scale_test("pyperf_global.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); + scale_test("pyperf_global.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); } void test_verif_scale_pyperf_subprogs() { - scale_test("pyperf_subprogs.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); + scale_test("pyperf_subprogs.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); } void test_verif_scale_pyperf50() { /* full unroll by llvm */ - scale_test("pyperf50.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); + scale_test("pyperf50.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); } void test_verif_scale_pyperf100() { /* full unroll by llvm */ - scale_test("pyperf100.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); + scale_test("pyperf100.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); } void test_verif_scale_pyperf180() { /* full unroll by llvm */ - scale_test("pyperf180.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); + scale_test("pyperf180.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); } void test_verif_scale_pyperf600() @@ -124,13 +124,13 @@ void test_verif_scale_pyperf600() * 16k insns in loop body. * Total of 5 such loops. Total program size ~82k insns. */ - scale_test("pyperf600.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); + scale_test("pyperf600.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); } void test_verif_scale_pyperf600_bpf_loop(void) { /* use the bpf_loop helper*/ - scale_test("pyperf600_bpf_loop.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); + scale_test("pyperf600_bpf_loop.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); } void test_verif_scale_pyperf600_nounroll() @@ -141,37 +141,37 @@ void test_verif_scale_pyperf600_nounroll() * ~110 insns in loop body. * Total of 5 such loops. Total program size ~1500 insns. 
*/ - scale_test("pyperf600_nounroll.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); + scale_test("pyperf600_nounroll.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); } void test_verif_scale_loop1() { - scale_test("loop1.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); + scale_test("loop1.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); } void test_verif_scale_loop2() { - scale_test("loop2.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); + scale_test("loop2.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); } void test_verif_scale_loop3_fail() { - scale_test("loop3.o", BPF_PROG_TYPE_RAW_TRACEPOINT, true /* fails */); + scale_test("loop3.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, true /* fails */); } void test_verif_scale_loop4() { - scale_test("loop4.o", BPF_PROG_TYPE_SCHED_CLS, false); + scale_test("loop4.bpf.o", BPF_PROG_TYPE_SCHED_CLS, false); } void test_verif_scale_loop5() { - scale_test("loop5.o", BPF_PROG_TYPE_SCHED_CLS, false); + scale_test("loop5.bpf.o", BPF_PROG_TYPE_SCHED_CLS, false); } void test_verif_scale_loop6() { - scale_test("loop6.o", BPF_PROG_TYPE_KPROBE, false); + scale_test("loop6.bpf.o", BPF_PROG_TYPE_KPROBE, false); } void test_verif_scale_strobemeta() @@ -180,54 +180,54 @@ void test_verif_scale_strobemeta() * Total program size 20.8k insn. * ~350k processed_insns */ - scale_test("strobemeta.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); + scale_test("strobemeta.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); } void test_verif_scale_strobemeta_bpf_loop(void) { /* use the bpf_loop helper*/ - scale_test("strobemeta_bpf_loop.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); + scale_test("strobemeta_bpf_loop.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); } void test_verif_scale_strobemeta_nounroll1() { /* no unroll, tiny loops */ - scale_test("strobemeta_nounroll1.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); + scale_test("strobemeta_nounroll1.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); } void test_verif_scale_strobemeta_nounroll2() { /* no unroll, tiny loops */ - scale_test("strobemeta_nounroll2.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); + scale_test("strobemeta_nounroll2.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); } void test_verif_scale_strobemeta_subprogs() { /* non-inlined subprogs */ - scale_test("strobemeta_subprogs.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); + scale_test("strobemeta_subprogs.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); } void test_verif_scale_sysctl_loop1() { - scale_test("test_sysctl_loop1.o", BPF_PROG_TYPE_CGROUP_SYSCTL, false); + scale_test("test_sysctl_loop1.bpf.o", BPF_PROG_TYPE_CGROUP_SYSCTL, false); } void test_verif_scale_sysctl_loop2() { - scale_test("test_sysctl_loop2.o", BPF_PROG_TYPE_CGROUP_SYSCTL, false); + scale_test("test_sysctl_loop2.bpf.o", BPF_PROG_TYPE_CGROUP_SYSCTL, false); } void test_verif_scale_xdp_loop() { - scale_test("test_xdp_loop.o", BPF_PROG_TYPE_XDP, false); + scale_test("test_xdp_loop.bpf.o", BPF_PROG_TYPE_XDP, false); } void test_verif_scale_seg6_loop() { - scale_test("test_seg6_loop.o", BPF_PROG_TYPE_LWT_SEG6LOCAL, false); + scale_test("test_seg6_loop.bpf.o", BPF_PROG_TYPE_LWT_SEG6LOCAL, false); } void test_verif_twfw() { - scale_test("twfw.o", BPF_PROG_TYPE_CGROUP_SKB, false); + scale_test("twfw.bpf.o", BPF_PROG_TYPE_CGROUP_SKB, false); } diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index ef6528b8084c..127b8caa3dc1 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -4651,8 +4651,8 @@ struct btf_file_test { }; static struct btf_file_test 
file_tests[] = { - { .file = "test_btf_newkv.o", }, - { .file = "test_btf_nokv.o", .btf_kv_notfound = true, }, + { .file = "test_btf_newkv.bpf.o", }, + { .file = "test_btf_nokv.bpf.o", .btf_kv_notfound = true, }, }; static void do_test_file(unsigned int test_num) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index 5fce7008d1ff..b1ca954ed1e5 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -52,7 +52,7 @@ static int test_btf_dump_case(int n, struct btf_dump_test_case *t) int err = 0, fd = -1; FILE *f = NULL; - snprintf(test_file, sizeof(test_file), "%s.o", t->file); + snprintf(test_file, sizeof(test_file), "%s.bpf.o", t->file); btf = btf__parse_elf(test_file, NULL); if (!ASSERT_OK_PTR(btf, "btf_parse_elf")) { @@ -764,8 +764,8 @@ static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d, /* union with nested struct */ TEST_BTF_DUMP_DATA(btf, d, "union", str, union bpf_iter_link_info, BTF_F_COMPACT, - "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},}", - { .map = { .map_fd = 1 }}); + "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},.cgroup = (struct){.order = (enum bpf_cgroup_iter_order)BPF_CGROUP_ITER_SELF_ONLY,.cgroup_fd = (__u32)1,},}", + { .cgroup = { .order = 1, .cgroup_fd = 1, }}); /* struct skb with nested structs/unions; because type output is so * complex, we don't do a string comparison, just verify we return @@ -841,8 +841,8 @@ static void test_btf_dump_datasec_data(char *str) char license[4] = "GPL"; struct btf_dump *d; - btf = btf__parse("xdping_kern.o", NULL); - if (!ASSERT_OK_PTR(btf, "xdping_kern.o BTF not found")) + btf = btf__parse("xdping_kern.bpf.o", NULL); + if (!ASSERT_OK_PTR(btf, "xdping_kern.bpf.o BTF not found")) return; d = btf_dump__new(btf, btf_dump_snprintf, str, NULL); diff --git a/tools/testing/selftests/bpf/prog_tests/btf_endian.c b/tools/testing/selftests/bpf/prog_tests/btf_endian.c index 8afbf3d0b89a..5b9f84dbeb43 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_endian.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_endian.c @@ -23,7 +23,7 @@ void test_btf_endian() { int var_id; /* Load BTF in native endianness */ - btf = btf__parse_elf("btf_dump_test_case_syntax.o", NULL); + btf = btf__parse_elf("btf_dump_test_case_syntax.bpf.o", NULL); if (!ASSERT_OK_PTR(btf, "parse_native_btf")) goto err_out; diff --git a/tools/testing/selftests/bpf/prog_tests/cb_refs.c b/tools/testing/selftests/bpf/prog_tests/cb_refs.c new file mode 100644 index 000000000000..3bff680de16c --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cb_refs.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bpf/libbpf.h" +#include <test_progs.h> +#include <network_helpers.h> + +#include "cb_refs.skel.h" + +static char log_buf[1024 * 1024]; + +struct { + const char *prog_name; + const char *err_msg; +} cb_refs_tests[] = { + { "underflow_prog", "reference has not been acquired before" }, + { "leak_prog", "Unreleased reference" }, + { "nested_cb", "Unreleased reference id=4 alloc_insn=2" }, /* alloc_insn=2{4,5} */ + { "non_cb_transfer_ref", "Unreleased reference id=4 alloc_insn=1" }, /* alloc_insn=1{1,2} */ +}; + +void test_cb_refs(void) +{ + LIBBPF_OPTS(bpf_object_open_opts, opts, .kernel_log_buf = log_buf, + .kernel_log_size = sizeof(log_buf), + .kernel_log_level = 1); + struct bpf_program *prog; + struct cb_refs *skel; + int i; + + for (i = 0; i < ARRAY_SIZE(cb_refs_tests); i++) 
{ + LIBBPF_OPTS(bpf_test_run_opts, run_opts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1, + ); + skel = cb_refs__open_opts(&opts); + if (!ASSERT_OK_PTR(skel, "cb_refs__open_and_load")) + return; + prog = bpf_object__find_program_by_name(skel->obj, cb_refs_tests[i].prog_name); + bpf_program__set_autoload(prog, true); + if (!ASSERT_ERR(cb_refs__load(skel), "cb_refs__load")) + bpf_prog_test_run_opts(bpf_program__fd(prog), &run_opts); + if (!ASSERT_OK_PTR(strstr(log_buf, cb_refs_tests[i].err_msg), "expected error message")) { + fprintf(stderr, "Expected: %s\n", cb_refs_tests[i].err_msg); + fprintf(stderr, "Verifier: %s\n", log_buf); + } + cb_refs__destroy(skel); + } +} diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_getset_retval.c b/tools/testing/selftests/bpf/prog_tests/cgroup_getset_retval.c index 0b47c3c000c7..4d2fa99273d8 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_getset_retval.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_getset_retval.c @@ -10,6 +10,7 @@ #include "cgroup_getset_retval_setsockopt.skel.h" #include "cgroup_getset_retval_getsockopt.skel.h" +#include "cgroup_getset_retval_hooks.skel.h" #define SOL_CUSTOM 0xdeadbeef @@ -433,6 +434,50 @@ close_bpf_object: cgroup_getset_retval_getsockopt__destroy(obj); } +struct exposed_hook { + const char *name; + int expected_err; +} exposed_hooks[] = { + +#define BPF_RETVAL_HOOK(NAME, SECTION, CTX, EXPECTED_ERR) \ + { \ + .name = #NAME, \ + .expected_err = EXPECTED_ERR, \ + }, + +#include "cgroup_getset_retval_hooks.h" + +#undef BPF_RETVAL_HOOK +}; + +static void test_exposed_hooks(int cgroup_fd, int sock_fd) +{ + struct cgroup_getset_retval_hooks *skel; + struct bpf_program *prog; + int err; + int i; + + for (i = 0; i < ARRAY_SIZE(exposed_hooks); i++) { + skel = cgroup_getset_retval_hooks__open(); + if (!ASSERT_OK_PTR(skel, "cgroup_getset_retval_hooks__open")) + continue; + + prog = bpf_object__find_program_by_name(skel->obj, exposed_hooks[i].name); + if (!ASSERT_NEQ(prog, NULL, "bpf_object__find_program_by_name")) + goto close_skel; + + err = bpf_program__set_autoload(prog, true); + if (!ASSERT_OK(err, "bpf_program__set_autoload")) + goto close_skel; + + err = cgroup_getset_retval_hooks__load(skel); + ASSERT_EQ(err, exposed_hooks[i].expected_err, "expected_err"); + +close_skel: + cgroup_getset_retval_hooks__destroy(skel); + } +} + void test_cgroup_getset_retval(void) { int cgroup_fd = -1; @@ -476,6 +521,9 @@ void test_cgroup_getset_retval(void) if (test__start_subtest("getsockopt-retval_sync")) test_getsockopt_retval_sync(cgroup_fd, sock_fd); + if (test__start_subtest("exposed_hooks")) + test_exposed_hooks(cgroup_fd, sock_fd); + close_fd: close(cgroup_fd); } diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c b/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c new file mode 100644 index 000000000000..bed1661596f7 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c @@ -0,0 +1,357 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Functions to manage eBPF programs attached to cgroup subsystems + * + * Copyright 2022 Google LLC. 
+ */ +#include <asm-generic/errno.h> +#include <errno.h> +#include <sys/types.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <unistd.h> + +#include <test_progs.h> +#include <bpf/libbpf.h> +#include <bpf/bpf.h> + +#include "cgroup_helpers.h" +#include "cgroup_hierarchical_stats.skel.h" + +#define PAGE_SIZE 4096 +#define MB(x) (x << 20) + +#define BPFFS_ROOT "/sys/fs/bpf/" +#define BPFFS_VMSCAN BPFFS_ROOT"vmscan/" + +#define CG_ROOT_NAME "root" +#define CG_ROOT_ID 1 + +#define CGROUP_PATH(p, n) {.path = p"/"n, .name = n} + +static struct { + const char *path, *name; + unsigned long long id; + int fd; +} cgroups[] = { + CGROUP_PATH("/", "test"), + CGROUP_PATH("/test", "child1"), + CGROUP_PATH("/test", "child2"), + CGROUP_PATH("/test/child1", "child1_1"), + CGROUP_PATH("/test/child1", "child1_2"), + CGROUP_PATH("/test/child2", "child2_1"), + CGROUP_PATH("/test/child2", "child2_2"), +}; + +#define N_CGROUPS ARRAY_SIZE(cgroups) +#define N_NON_LEAF_CGROUPS 3 + +static int root_cgroup_fd; +static bool mounted_bpffs; + +/* reads file at 'path' to 'buf', returns 0 on success. */ +static int read_from_file(const char *path, char *buf, size_t size) +{ + int fd, len; + + fd = open(path, O_RDONLY); + if (fd < 0) + return fd; + + len = read(fd, buf, size); + close(fd); + if (len < 0) + return len; + + buf[len] = 0; + return 0; +} + +/* mounts bpffs and mkdir for reading stats, returns 0 on success. */ +static int setup_bpffs(void) +{ + int err; + + /* Mount bpffs */ + err = mount("bpf", BPFFS_ROOT, "bpf", 0, NULL); + mounted_bpffs = !err; + if (ASSERT_FALSE(err && errno != EBUSY, "mount")) + return err; + + /* Create a directory to contain stat files in bpffs */ + err = mkdir(BPFFS_VMSCAN, 0755); + if (!ASSERT_OK(err, "mkdir")) + return err; + + return 0; +} + +static void cleanup_bpffs(void) +{ + /* Remove created directory in bpffs */ + ASSERT_OK(rmdir(BPFFS_VMSCAN), "rmdir "BPFFS_VMSCAN); + + /* Unmount bpffs, if it wasn't already mounted when we started */ + if (mounted_bpffs) + return; + + ASSERT_OK(umount(BPFFS_ROOT), "unmount bpffs"); +} + +/* sets up cgroups, returns 0 on success. */ +static int setup_cgroups(void) +{ + int i, fd, err; + + err = setup_cgroup_environment(); + if (!ASSERT_OK(err, "setup_cgroup_environment")) + return err; + + root_cgroup_fd = get_root_cgroup(); + if (!ASSERT_GE(root_cgroup_fd, 0, "get_root_cgroup")) + return root_cgroup_fd; + + for (i = 0; i < N_CGROUPS; i++) { + fd = create_and_get_cgroup(cgroups[i].path); + if (!ASSERT_GE(fd, 0, "create_and_get_cgroup")) + return fd; + + cgroups[i].fd = fd; + cgroups[i].id = get_cgroup_id(cgroups[i].path); + + /* + * Enable memcg controller for the entire hierarchy. + * Note that stats are collected for all cgroups in a hierarchy + * with memcg enabled anyway, but are only exposed for cgroups + * that have memcg enabled. + */ + if (i < N_NON_LEAF_CGROUPS) { + err = enable_controllers(cgroups[i].path, "memory"); + if (!ASSERT_OK(err, "enable_controllers")) + return err; + } + } + return 0; +} + +static void cleanup_cgroups(void) +{ + close(root_cgroup_fd); + for (int i = 0; i < N_CGROUPS; i++) + close(cgroups[i].fd); + cleanup_cgroup_environment(); +} + +/* Sets up cgroup hiearchary, returns 0 on success. 
*/ +static int setup_hierarchy(void) +{ + return setup_bpffs() || setup_cgroups(); +} + +static void destroy_hierarchy(void) +{ + cleanup_cgroups(); + cleanup_bpffs(); +} + +static int reclaimer(const char *cgroup_path, size_t size) +{ + static char size_buf[128]; + char *buf, *ptr; + int err; + + /* Join cgroup in the parent process workdir */ + if (join_parent_cgroup(cgroup_path)) + return EACCES; + + /* Allocate memory */ + buf = malloc(size); + if (!buf) + return ENOMEM; + + /* Write to memory to make sure it's actually allocated */ + for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) + *ptr = 1; + + /* Try to reclaim memory */ + snprintf(size_buf, 128, "%lu", size); + err = write_cgroup_file_parent(cgroup_path, "memory.reclaim", size_buf); + + free(buf); + /* memory.reclaim returns EAGAIN if the amount is not fully reclaimed */ + if (err && errno != EAGAIN) + return errno; + + return 0; +} + +static int induce_vmscan(void) +{ + int i, status; + + /* + * In every leaf cgroup, run a child process that allocates some memory + * and attempts to reclaim some of it. + */ + for (i = N_NON_LEAF_CGROUPS; i < N_CGROUPS; i++) { + pid_t pid; + + /* Create reclaimer child */ + pid = fork(); + if (pid == 0) { + status = reclaimer(cgroups[i].path, MB(5)); + exit(status); + } + + /* Cleanup reclaimer child */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status), "reclaimer exited"); + ASSERT_EQ(WEXITSTATUS(status), 0, "reclaim exit code"); + } + return 0; +} + +static unsigned long long +get_cgroup_vmscan_delay(unsigned long long cgroup_id, const char *file_name) +{ + unsigned long long vmscan = 0, id = 0; + static char buf[128], path[128]; + + /* For every cgroup, read the file generated by cgroup_iter */ + snprintf(path, 128, "%s%s", BPFFS_VMSCAN, file_name); + if (!ASSERT_OK(read_from_file(path, buf, 128), "read cgroup_iter")) + return 0; + + /* Check the output file formatting */ + ASSERT_EQ(sscanf(buf, "cg_id: %llu, total_vmscan_delay: %llu\n", + &id, &vmscan), 2, "output format"); + + /* Check that the cgroup_id is displayed correctly */ + ASSERT_EQ(id, cgroup_id, "cgroup_id"); + /* Check that the vmscan reading is non-zero */ + ASSERT_GT(vmscan, 0, "vmscan_reading"); + return vmscan; +} + +static void check_vmscan_stats(void) +{ + unsigned long long vmscan_readings[N_CGROUPS], vmscan_root; + int i; + + for (i = 0; i < N_CGROUPS; i++) { + vmscan_readings[i] = get_cgroup_vmscan_delay(cgroups[i].id, + cgroups[i].name); + } + + /* Read stats for root too */ + vmscan_root = get_cgroup_vmscan_delay(CG_ROOT_ID, CG_ROOT_NAME); + + /* Check that child1 == child1_1 + child1_2 */ + ASSERT_EQ(vmscan_readings[1], vmscan_readings[3] + vmscan_readings[4], + "child1_vmscan"); + /* Check that child2 == child2_1 + child2_2 */ + ASSERT_EQ(vmscan_readings[2], vmscan_readings[5] + vmscan_readings[6], + "child2_vmscan"); + /* Check that test == child1 + child2 */ + ASSERT_EQ(vmscan_readings[0], vmscan_readings[1] + vmscan_readings[2], + "test_vmscan"); + /* Check that root >= test */ + ASSERT_GE(vmscan_root, vmscan_readings[1], "root_vmscan"); +} + +/* Creates iter link and pins in bpffs, returns 0 on success, -errno on failure. + */ +static int setup_cgroup_iter(struct cgroup_hierarchical_stats *obj, + int cgroup_fd, const char *file_name) +{ + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + union bpf_iter_link_info linfo = {}; + struct bpf_link *link; + static char path[128]; + int err; + + /* + * Create an iter link, parameterized by cgroup_fd. 
We only want to + * traverse one cgroup, so set the traversal order to "self". + */ + linfo.cgroup.cgroup_fd = cgroup_fd; + linfo.cgroup.order = BPF_CGROUP_ITER_SELF_ONLY; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + link = bpf_program__attach_iter(obj->progs.dump_vmscan, &opts); + if (!ASSERT_OK_PTR(link, "attach_iter")) + return -EFAULT; + + /* Pin the link to a bpffs file */ + snprintf(path, 128, "%s%s", BPFFS_VMSCAN, file_name); + err = bpf_link__pin(link, path); + ASSERT_OK(err, "pin cgroup_iter"); + + /* Remove the link, leaving only the ref held by the pinned file */ + bpf_link__destroy(link); + return err; +} + +/* Sets up programs for collecting stats, returns 0 on success. */ +static int setup_progs(struct cgroup_hierarchical_stats **skel) +{ + int i, err; + + *skel = cgroup_hierarchical_stats__open_and_load(); + if (!ASSERT_OK_PTR(*skel, "open_and_load")) + return 1; + + /* Attach cgroup_iter program that will dump the stats to cgroups */ + for (i = 0; i < N_CGROUPS; i++) { + err = setup_cgroup_iter(*skel, cgroups[i].fd, cgroups[i].name); + if (!ASSERT_OK(err, "setup_cgroup_iter")) + return err; + } + + /* Also dump stats for root */ + err = setup_cgroup_iter(*skel, root_cgroup_fd, CG_ROOT_NAME); + if (!ASSERT_OK(err, "setup_cgroup_iter")) + return err; + + bpf_program__set_autoattach((*skel)->progs.dump_vmscan, false); + err = cgroup_hierarchical_stats__attach(*skel); + if (!ASSERT_OK(err, "attach")) + return err; + + return 0; +} + +static void destroy_progs(struct cgroup_hierarchical_stats *skel) +{ + static char path[128]; + int i; + + for (i = 0; i < N_CGROUPS; i++) { + /* Delete files in bpffs that cgroup_iters are pinned in */ + snprintf(path, 128, "%s%s", BPFFS_VMSCAN, + cgroups[i].name); + ASSERT_OK(remove(path), "remove cgroup_iter pin"); + } + + /* Delete root file in bpffs */ + snprintf(path, 128, "%s%s", BPFFS_VMSCAN, CG_ROOT_NAME); + ASSERT_OK(remove(path), "remove cgroup_iter root pin"); + cgroup_hierarchical_stats__destroy(skel); +} + +void test_cgroup_hierarchical_stats(void) +{ + struct cgroup_hierarchical_stats *skel = NULL; + + if (setup_hierarchy()) + goto hierarchy_cleanup; + if (setup_progs(&skel)) + goto cleanup; + if (induce_vmscan()) + goto cleanup; + check_vmscan_stats(); +cleanup: + destroy_progs(skel); +hierarchy_cleanup: + destroy_hierarchy(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c new file mode 100644 index 000000000000..c4a2adb38da1 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Google */ + +#include <test_progs.h> +#include <bpf/libbpf.h> +#include <bpf/btf.h> +#include "cgroup_iter.skel.h" +#include "cgroup_helpers.h" + +#define ROOT 0 +#define PARENT 1 +#define CHILD1 2 +#define CHILD2 3 +#define NUM_CGROUPS 4 + +#define PROLOGUE "prologue\n" +#define EPILOGUE "epilogue\n" + +static const char *cg_path[] = { + "/", "/parent", "/parent/child1", "/parent/child2" +}; + +static int cg_fd[] = {-1, -1, -1, -1}; +static unsigned long long cg_id[] = {0, 0, 0, 0}; +static char expected_output[64]; + +static int setup_cgroups(void) +{ + int fd, i = 0; + + for (i = 0; i < NUM_CGROUPS; i++) { + fd = create_and_get_cgroup(cg_path[i]); + if (fd < 0) + return fd; + + cg_fd[i] = fd; + cg_id[i] = get_cgroup_id(cg_path[i]); + } + return 0; +} + +static void cleanup_cgroups(void) +{ + int i; + + for (i = 0; i < NUM_CGROUPS; i++) + close(cg_fd[i]); +} + 
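For context on the cgroup_iter selftests (here and in the cgroup_hierarchical_stats test above): the BPF side of these tests is a program in the "iter/cgroup" section whose context hands it one cgroup per invocation, with a NULL cgroup marking the end of the walk. The sketch below only approximates the shape of the actual progs/cgroup_iter.c program; the use of cgrp->kn->id for the id and the omission of the terminal_cgroup/terminate_early knobs referenced by the tests are simplifications on my part, not a copy of the real source:

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    char _license[] SEC("license") = "GPL";

    SEC("iter/cgroup")
    int cgroup_id_printer(struct bpf_iter__cgroup *ctx)
    {
            struct seq_file *seq = ctx->meta->seq;
            struct cgroup *cgrp = ctx->cgroup;

            /* A NULL cgroup means the walk is done: emit the epilogue. */
            if (!cgrp) {
                    BPF_SEQ_PRINTF(seq, "epilogue\n");
                    return 0;
            }

            /* seq_num == 0 identifies the first element: emit the prologue. */
            if (ctx->meta->seq_num == 0)
                    BPF_SEQ_PRINTF(seq, "prologue\n");

            BPF_SEQ_PRINTF(seq, "%8llu\n", cgrp->kn->id);
            return 0;
    }

The "%8llu\n" format and the prologue/epilogue strings line up with the expected_output built by read_from_cgroup_iter() below.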
+static void read_from_cgroup_iter(struct bpf_program *prog, int cgroup_fd, + int order, const char *testname) +{ + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + union bpf_iter_link_info linfo; + struct bpf_link *link; + int len, iter_fd; + static char buf[128]; + size_t left; + char *p; + + memset(&linfo, 0, sizeof(linfo)); + linfo.cgroup.cgroup_fd = cgroup_fd; + linfo.cgroup.order = order; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + + link = bpf_program__attach_iter(prog, &opts); + if (!ASSERT_OK_PTR(link, "attach_iter")) + return; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (iter_fd < 0) + goto free_link; + + memset(buf, 0, sizeof(buf)); + left = ARRAY_SIZE(buf); + p = buf; + while ((len = read(iter_fd, p, left)) > 0) { + p += len; + left -= len; + } + + ASSERT_STREQ(buf, expected_output, testname); + + /* read() after iter finishes should be ok. */ + if (len == 0) + ASSERT_OK(read(iter_fd, buf, sizeof(buf)), "second_read"); + + close(iter_fd); +free_link: + bpf_link__destroy(link); +} + +/* Invalid cgroup. */ +static void test_invalid_cgroup(struct cgroup_iter *skel) +{ + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + union bpf_iter_link_info linfo; + struct bpf_link *link; + + memset(&linfo, 0, sizeof(linfo)); + linfo.cgroup.cgroup_fd = (__u32)-1; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + + link = bpf_program__attach_iter(skel->progs.cgroup_id_printer, &opts); + ASSERT_ERR_PTR(link, "attach_iter"); + bpf_link__destroy(link); +} + +/* Specifying both cgroup_fd and cgroup_id is invalid. */ +static void test_invalid_cgroup_spec(struct cgroup_iter *skel) +{ + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + union bpf_iter_link_info linfo; + struct bpf_link *link; + + memset(&linfo, 0, sizeof(linfo)); + linfo.cgroup.cgroup_fd = (__u32)cg_fd[PARENT]; + linfo.cgroup.cgroup_id = (__u64)cg_id[PARENT]; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + + link = bpf_program__attach_iter(skel->progs.cgroup_id_printer, &opts); + ASSERT_ERR_PTR(link, "attach_iter"); + bpf_link__destroy(link); +} + +/* Preorder walk prints parent and child in order. */ +static void test_walk_preorder(struct cgroup_iter *skel) +{ + snprintf(expected_output, sizeof(expected_output), + PROLOGUE "%8llu\n%8llu\n%8llu\n" EPILOGUE, + cg_id[PARENT], cg_id[CHILD1], cg_id[CHILD2]); + + read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], + BPF_CGROUP_ITER_DESCENDANTS_PRE, "preorder"); +} + +/* Postorder walk prints child and parent in order. */ +static void test_walk_postorder(struct cgroup_iter *skel) +{ + snprintf(expected_output, sizeof(expected_output), + PROLOGUE "%8llu\n%8llu\n%8llu\n" EPILOGUE, + cg_id[CHILD1], cg_id[CHILD2], cg_id[PARENT]); + + read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], + BPF_CGROUP_ITER_DESCENDANTS_POST, "postorder"); +} + +/* Walking parents prints parent and then root. */ +static void test_walk_ancestors_up(struct cgroup_iter *skel) +{ + /* terminate the walk when ROOT is met. */ + skel->bss->terminal_cgroup = cg_id[ROOT]; + + snprintf(expected_output, sizeof(expected_output), + PROLOGUE "%8llu\n%8llu\n" EPILOGUE, + cg_id[PARENT], cg_id[ROOT]); + + read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], + BPF_CGROUP_ITER_ANCESTORS_UP, "ancestors_up"); + + skel->bss->terminal_cgroup = 0; +} + +/* Early termination prints parent only. */ +static void test_early_termination(struct cgroup_iter *skel) +{ + /* terminate the walk after the first element is processed. 
*/ + skel->bss->terminate_early = 1; + + snprintf(expected_output, sizeof(expected_output), + PROLOGUE "%8llu\n" EPILOGUE, cg_id[PARENT]); + + read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], + BPF_CGROUP_ITER_DESCENDANTS_PRE, "early_termination"); + + skel->bss->terminate_early = 0; +} + +/* Waling self prints self only. */ +static void test_walk_self_only(struct cgroup_iter *skel) +{ + snprintf(expected_output, sizeof(expected_output), + PROLOGUE "%8llu\n" EPILOGUE, cg_id[PARENT]); + + read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], + BPF_CGROUP_ITER_SELF_ONLY, "self_only"); +} + +void test_cgroup_iter(void) +{ + struct cgroup_iter *skel = NULL; + + if (setup_cgroup_environment()) + return; + + if (setup_cgroups()) + goto out; + + skel = cgroup_iter__open_and_load(); + if (!ASSERT_OK_PTR(skel, "cgroup_iter__open_and_load")) + goto out; + + if (test__start_subtest("cgroup_iter__invalid_cgroup")) + test_invalid_cgroup(skel); + if (test__start_subtest("cgroup_iter__invalid_cgroup_spec")) + test_invalid_cgroup_spec(skel); + if (test__start_subtest("cgroup_iter__preorder")) + test_walk_preorder(skel); + if (test__start_subtest("cgroup_iter__postorder")) + test_walk_postorder(skel); + if (test__start_subtest("cgroup_iter__ancestors_up_walk")) + test_walk_ancestors_up(skel); + if (test__start_subtest("cgroup_iter__early_termination")) + test_early_termination(skel); + if (test__start_subtest("cgroup_iter__self_only")) + test_walk_self_only(skel); +out: + cgroup_iter__destroy(skel); + cleanup_cgroups(); + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c index 9c4325f4aef2..24d553109f8d 100644 --- a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c +++ b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c @@ -53,7 +53,7 @@ static int run_test(int cgroup_fd, int server_fd, int family, int type) __u16 expected_peer_port = 60000; struct bpf_program *prog; struct bpf_object *obj; - const char *obj_file = v4 ? "connect_force_port4.o" : "connect_force_port6.o"; + const char *obj_file = v4 ? 
"connect_force_port4.bpf.o" : "connect_force_port6.bpf.o"; int fd, err; __u32 duration = 0; diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index c8655ba9a88f..47f42e680105 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -13,7 +13,7 @@ static int duration = 0; #define MODULES_CASE(name, pg_name, tp_name) { \ .case_name = name, \ - .bpf_obj_file = "test_core_reloc_module.o", \ + .bpf_obj_file = "test_core_reloc_module.bpf.o", \ .btf_src_file = NULL, /* find in kernel module BTFs */ \ .input = "", \ .input_len = 0, \ @@ -43,8 +43,8 @@ static int duration = 0; #define FLAVORS_CASE_COMMON(name) \ .case_name = #name, \ - .bpf_obj_file = "test_core_reloc_flavors.o", \ - .btf_src_file = "btf__core_reloc_" #name ".o", \ + .bpf_obj_file = "test_core_reloc_flavors.bpf.o", \ + .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \ .raw_tp_name = "sys_enter", \ .prog_name = "test_core_flavors" \ @@ -68,8 +68,8 @@ static int duration = 0; #define NESTING_CASE_COMMON(name) \ .case_name = #name, \ - .bpf_obj_file = "test_core_reloc_nesting.o", \ - .btf_src_file = "btf__core_reloc_" #name ".o", \ + .bpf_obj_file = "test_core_reloc_nesting.bpf.o", \ + .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \ .raw_tp_name = "sys_enter", \ .prog_name = "test_core_nesting" \ @@ -96,8 +96,8 @@ static int duration = 0; #define ARRAYS_CASE_COMMON(name) \ .case_name = #name, \ - .bpf_obj_file = "test_core_reloc_arrays.o", \ - .btf_src_file = "btf__core_reloc_" #name ".o", \ + .bpf_obj_file = "test_core_reloc_arrays.bpf.o", \ + .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \ .raw_tp_name = "sys_enter", \ .prog_name = "test_core_arrays" \ @@ -130,8 +130,8 @@ static int duration = 0; #define PRIMITIVES_CASE_COMMON(name) \ .case_name = #name, \ - .bpf_obj_file = "test_core_reloc_primitives.o", \ - .btf_src_file = "btf__core_reloc_" #name ".o", \ + .bpf_obj_file = "test_core_reloc_primitives.bpf.o", \ + .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \ .raw_tp_name = "sys_enter", \ .prog_name = "test_core_primitives" \ @@ -150,8 +150,8 @@ static int duration = 0; #define MODS_CASE(name) { \ .case_name = #name, \ - .bpf_obj_file = "test_core_reloc_mods.o", \ - .btf_src_file = "btf__core_reloc_" #name ".o", \ + .bpf_obj_file = "test_core_reloc_mods.bpf.o", \ + .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \ .input = STRUCT_TO_CHAR_PTR(core_reloc_##name) { \ .a = 1, \ .b = 2, \ @@ -174,8 +174,8 @@ static int duration = 0; #define PTR_AS_ARR_CASE(name) { \ .case_name = #name, \ - .bpf_obj_file = "test_core_reloc_ptr_as_arr.o", \ - .btf_src_file = "btf__core_reloc_" #name ".o", \ + .bpf_obj_file = "test_core_reloc_ptr_as_arr.bpf.o", \ + .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \ .input = (const char *)&(struct core_reloc_##name []){ \ { .a = 1 }, \ { .a = 2 }, \ @@ -203,8 +203,8 @@ static int duration = 0; #define INTS_CASE_COMMON(name) \ .case_name = #name, \ - .bpf_obj_file = "test_core_reloc_ints.o", \ - .btf_src_file = "btf__core_reloc_" #name ".o", \ + .bpf_obj_file = "test_core_reloc_ints.bpf.o", \ + .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \ .raw_tp_name = "sys_enter", \ .prog_name = "test_core_ints" @@ -223,18 +223,18 @@ static int duration = 0; #define FIELD_EXISTS_CASE_COMMON(name) \ .case_name = #name, \ - .bpf_obj_file = "test_core_reloc_existence.o", \ - .btf_src_file = "btf__core_reloc_" #name ".o", \ + .bpf_obj_file = 
"test_core_reloc_existence.bpf.o", \ + .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \ .raw_tp_name = "sys_enter", \ .prog_name = "test_core_existence" #define BITFIELDS_CASE_COMMON(objfile, test_name_prefix, name) \ .case_name = test_name_prefix#name, \ .bpf_obj_file = objfile, \ - .btf_src_file = "btf__core_reloc_" #name ".o" + .btf_src_file = "btf__core_reloc_" #name ".bpf.o" #define BITFIELDS_CASE(name, ...) { \ - BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_probed.o", \ + BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_probed.bpf.o", \ "probed:", name), \ .input = STRUCT_TO_CHAR_PTR(core_reloc_##name) __VA_ARGS__, \ .input_len = sizeof(struct core_reloc_##name), \ @@ -244,7 +244,7 @@ static int duration = 0; .raw_tp_name = "sys_enter", \ .prog_name = "test_core_bitfields", \ }, { \ - BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_direct.o", \ + BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_direct.bpf.o", \ "direct:", name), \ .input = STRUCT_TO_CHAR_PTR(core_reloc_##name) __VA_ARGS__, \ .input_len = sizeof(struct core_reloc_##name), \ @@ -256,14 +256,14 @@ static int duration = 0; #define BITFIELDS_ERR_CASE(name) { \ - BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_probed.o", \ + BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_probed.bpf.o", \ "probed:", name), \ .fails = true, \ - .run_btfgen_fails = true, \ + .run_btfgen_fails = true, \ .raw_tp_name = "sys_enter", \ .prog_name = "test_core_bitfields", \ }, { \ - BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_direct.o", \ + BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_direct.bpf.o", \ "direct:", name), \ .fails = true, \ .run_btfgen_fails = true, \ @@ -272,8 +272,8 @@ static int duration = 0; #define SIZE_CASE_COMMON(name) \ .case_name = #name, \ - .bpf_obj_file = "test_core_reloc_size.o", \ - .btf_src_file = "btf__core_reloc_" #name ".o", \ + .bpf_obj_file = "test_core_reloc_size.bpf.o", \ + .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \ .raw_tp_name = "sys_enter", \ .prog_name = "test_core_size" @@ -307,13 +307,13 @@ static int duration = 0; #define SIZE_ERR_CASE(name) { \ SIZE_CASE_COMMON(name), \ .fails = true, \ - .run_btfgen_fails = true, \ + .run_btfgen_fails = true, \ } #define TYPE_BASED_CASE_COMMON(name) \ .case_name = #name, \ - .bpf_obj_file = "test_core_reloc_type_based.o", \ - .btf_src_file = "btf__core_reloc_" #name ".o", \ + .bpf_obj_file = "test_core_reloc_type_based.bpf.o", \ + .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \ .raw_tp_name = "sys_enter", \ .prog_name = "test_core_type_based" @@ -331,8 +331,8 @@ static int duration = 0; #define TYPE_ID_CASE_COMMON(name) \ .case_name = #name, \ - .bpf_obj_file = "test_core_reloc_type_id.o", \ - .btf_src_file = "btf__core_reloc_" #name ".o", \ + .bpf_obj_file = "test_core_reloc_type_id.bpf.o", \ + .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \ .raw_tp_name = "sys_enter", \ .prog_name = "test_core_type_id" @@ -350,8 +350,8 @@ static int duration = 0; #define ENUMVAL_CASE_COMMON(name) \ .case_name = #name, \ - .bpf_obj_file = "test_core_reloc_enumval.o", \ - .btf_src_file = "btf__core_reloc_" #name ".o", \ + .bpf_obj_file = "test_core_reloc_enumval.bpf.o", \ + .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \ .raw_tp_name = "sys_enter", \ .prog_name = "test_core_enumval" @@ -369,8 +369,8 @@ static int duration = 0; #define ENUM64VAL_CASE_COMMON(name) \ .case_name = #name, \ - .bpf_obj_file = "test_core_reloc_enum64val.o", \ - .btf_src_file = "btf__core_reloc_" #name ".o", \ + .bpf_obj_file = "test_core_reloc_enum64val.bpf.o", \ + 
.btf_src_file = "btf__core_reloc_" #name ".bpf.o", \ .raw_tp_name = "sys_enter", \ .prog_name = "test_core_enum64val" @@ -547,7 +547,7 @@ static const struct core_reloc_test_case test_cases[] = { /* validate we can find kernel image and use its BTF for relocs */ { .case_name = "kernel", - .bpf_obj_file = "test_core_reloc_kernel.o", + .bpf_obj_file = "test_core_reloc_kernel.bpf.o", .btf_src_file = NULL, /* load from /lib/modules/$(uname -r) */ .input = "", .input_len = 0, @@ -629,8 +629,8 @@ static const struct core_reloc_test_case test_cases[] = { /* validate edge cases of capturing relocations */ { .case_name = "misc", - .bpf_obj_file = "test_core_reloc_misc.o", - .btf_src_file = "btf__core_reloc_misc.o", + .bpf_obj_file = "test_core_reloc_misc.bpf.o", + .btf_src_file = "btf__core_reloc_misc.bpf.o", .input = (const char *)&(struct core_reloc_misc_extensible[]){ { .a = 1 }, { .a = 2 }, /* not read */ diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c index da860b07abb5..d1e32e792536 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c @@ -174,8 +174,8 @@ static void test_target_no_callees(void) const char *prog_name[] = { "fexit/test_pkt_md_access", }; - test_fexit_bpf2bpf_common("./fexit_bpf2bpf_simple.o", - "./test_pkt_md_access.o", + test_fexit_bpf2bpf_common("./fexit_bpf2bpf_simple.bpf.o", + "./test_pkt_md_access.bpf.o", ARRAY_SIZE(prog_name), prog_name, true, NULL); } @@ -188,8 +188,8 @@ static void test_target_yes_callees(void) "fexit/test_pkt_access_subprog2", "fexit/test_pkt_access_subprog3", }; - test_fexit_bpf2bpf_common("./fexit_bpf2bpf.o", - "./test_pkt_access.o", + test_fexit_bpf2bpf_common("./fexit_bpf2bpf.bpf.o", + "./test_pkt_access.bpf.o", ARRAY_SIZE(prog_name), prog_name, true, NULL); } @@ -206,8 +206,8 @@ static void test_func_replace(void) "freplace/get_constant", "freplace/test_pkt_write_access_subprog", }; - test_fexit_bpf2bpf_common("./fexit_bpf2bpf.o", - "./test_pkt_access.o", + test_fexit_bpf2bpf_common("./fexit_bpf2bpf.bpf.o", + "./test_pkt_access.bpf.o", ARRAY_SIZE(prog_name), prog_name, true, NULL); } @@ -217,8 +217,8 @@ static void test_func_replace_verify(void) const char *prog_name[] = { "freplace/do_bind", }; - test_fexit_bpf2bpf_common("./freplace_connect4.o", - "./connect4_prog.o", + test_fexit_bpf2bpf_common("./freplace_connect4.bpf.o", + "./connect4_prog.bpf.o", ARRAY_SIZE(prog_name), prog_name, false, NULL); } @@ -227,7 +227,7 @@ static int test_second_attach(struct bpf_object *obj) { const char *prog_name = "security_new_get_constant"; const char *tgt_name = "get_constant"; - const char *tgt_obj_file = "./test_pkt_access.o"; + const char *tgt_obj_file = "./test_pkt_access.bpf.o"; struct bpf_program *prog = NULL; struct bpf_object *tgt_obj; struct bpf_link *link; @@ -272,8 +272,8 @@ static void test_func_replace_multi(void) const char *prog_name[] = { "freplace/get_constant", }; - test_fexit_bpf2bpf_common("./freplace_get_constant.o", - "./test_pkt_access.o", + test_fexit_bpf2bpf_common("./freplace_get_constant.bpf.o", + "./test_pkt_access.bpf.o", ARRAY_SIZE(prog_name), prog_name, true, test_second_attach); } @@ -281,10 +281,10 @@ static void test_func_replace_multi(void) static void test_fmod_ret_freplace(void) { struct bpf_object *freplace_obj = NULL, *pkt_obj, *fmod_obj = NULL; - const char *freplace_name = "./freplace_get_constant.o"; - const char *fmod_ret_name = "./fmod_ret_freplace.o"; + const char 
*freplace_name = "./freplace_get_constant.bpf.o"; + const char *fmod_ret_name = "./fmod_ret_freplace.bpf.o"; DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts); - const char *tgt_name = "./test_pkt_access.o"; + const char *tgt_name = "./test_pkt_access.bpf.o"; struct bpf_link *freplace_link = NULL; struct bpf_program *prog; __u32 duration = 0; @@ -339,8 +339,8 @@ static void test_func_sockmap_update(void) const char *prog_name[] = { "freplace/cls_redirect", }; - test_fexit_bpf2bpf_common("./freplace_cls_redirect.o", - "./test_cls_redirect.o", + test_fexit_bpf2bpf_common("./freplace_cls_redirect.bpf.o", + "./test_cls_redirect.bpf.o", ARRAY_SIZE(prog_name), prog_name, false, NULL); } @@ -385,15 +385,15 @@ close_prog: static void test_func_replace_return_code(void) { /* test invalid return code in the replaced program */ - test_obj_load_failure_common("./freplace_connect_v4_prog.o", - "./connect4_prog.o"); + test_obj_load_failure_common("./freplace_connect_v4_prog.bpf.o", + "./connect4_prog.bpf.o"); } static void test_func_map_prog_compatibility(void) { /* test with spin lock map value in the replaced program */ - test_obj_load_failure_common("./freplace_attach_probe.o", - "./test_attach_probe.o"); + test_obj_load_failure_common("./freplace_attach_probe.bpf.o", + "./test_attach_probe.bpf.o"); } static void test_func_replace_global_func(void) @@ -402,8 +402,8 @@ static void test_func_replace_global_func(void) "freplace/test_pkt_access", }; - test_fexit_bpf2bpf_common("./freplace_global_func.o", - "./test_pkt_access.o", + test_fexit_bpf2bpf_common("./freplace_global_func.bpf.o", + "./test_pkt_access.bpf.o", ARRAY_SIZE(prog_name), prog_name, false, NULL); } diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 0c1661ea996e..7acca37a3d2b 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -8,6 +8,8 @@ #include "bpf_flow.skel.h" +#define FLOW_CONTINUE_SADDR 0x7f00007f /* 127.0.0.127 */ + #ifndef IP_MF #define IP_MF 0x2000 #endif @@ -100,6 +102,7 @@ struct test { } pkt; struct bpf_flow_keys keys; __u32 flags; + __u32 retval; }; #define VLAN_HLEN 4 @@ -126,6 +129,7 @@ struct test tests[] = { .sport = 80, .dport = 8080, }, + .retval = BPF_OK, }, { .name = "ipv6", @@ -146,6 +150,7 @@ struct test tests[] = { .sport = 80, .dport = 8080, }, + .retval = BPF_OK, }, { .name = "802.1q-ipv4", @@ -168,6 +173,7 @@ struct test tests[] = { .sport = 80, .dport = 8080, }, + .retval = BPF_OK, }, { .name = "802.1ad-ipv6", @@ -191,6 +197,7 @@ struct test tests[] = { .sport = 80, .dport = 8080, }, + .retval = BPF_OK, }, { .name = "ipv4-frag", @@ -217,6 +224,7 @@ struct test tests[] = { .dport = 8080, }, .flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG, + .retval = BPF_OK, }, { .name = "ipv4-no-frag", @@ -239,6 +247,7 @@ struct test tests[] = { .is_frag = true, .is_first_frag = true, }, + .retval = BPF_OK, }, { .name = "ipv6-frag", @@ -265,6 +274,7 @@ struct test tests[] = { .dport = 8080, }, .flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG, + .retval = BPF_OK, }, { .name = "ipv6-no-frag", @@ -287,6 +297,7 @@ struct test tests[] = { .is_frag = true, .is_first_frag = true, }, + .retval = BPF_OK, }, { .name = "ipv6-flow-label", @@ -309,6 +320,7 @@ struct test tests[] = { .dport = 8080, .flow_label = __bpf_constant_htonl(0xbeeef), }, + .retval = BPF_OK, }, { .name = "ipv6-no-flow-label", @@ -331,6 +343,7 @@ struct test tests[] = { .flow_label = __bpf_constant_htonl(0xbeeef), 
}, .flags = BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL, + .retval = BPF_OK, }, { .name = "ipip-encap", @@ -359,6 +372,7 @@ struct test tests[] = { .sport = 80, .dport = 8080, }, + .retval = BPF_OK, }, { .name = "ipip-no-encap", @@ -386,6 +400,26 @@ struct test tests[] = { .is_encap = true, }, .flags = BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP, + .retval = BPF_OK, + }, + { + .name = "ipip-encap-dissector-continue", + .pkt.ipip = { + .eth.h_proto = __bpf_constant_htons(ETH_P_IP), + .iph.ihl = 5, + .iph.protocol = IPPROTO_IPIP, + .iph.tot_len = __bpf_constant_htons(MAGIC_BYTES), + .iph.saddr = __bpf_constant_htonl(FLOW_CONTINUE_SADDR), + .iph_inner.ihl = 5, + .iph_inner.protocol = IPPROTO_TCP, + .iph_inner.tot_len = + __bpf_constant_htons(MAGIC_BYTES) - + sizeof(struct iphdr), + .tcp.doff = 5, + .tcp.source = 99, + .tcp.dest = 9090, + }, + .retval = BPF_FLOW_DISSECTOR_CONTINUE, }, }; @@ -503,6 +537,10 @@ static void run_tests_skb_less(int tap_fd, struct bpf_map *keys) err = tx_tap(tap_fd, &tests[i].pkt, sizeof(tests[i].pkt)); CHECK(err < 0, "tx_tap", "err %d errno %d\n", err, errno); + /* check the stored flow_keys only if BPF_OK expected */ + if (tests[i].retval != BPF_OK) + continue; + err = bpf_map_lookup_elem(keys_fd, &key, &flow_keys); ASSERT_OK(err, "bpf_map_lookup_elem"); @@ -588,7 +626,11 @@ void test_flow_dissector(void) err = bpf_prog_test_run_opts(prog_fd, &topts); ASSERT_OK(err, "test_run"); - ASSERT_EQ(topts.retval, 1, "test_run retval"); + ASSERT_EQ(topts.retval, tests[i].retval, "test_run retval"); + + /* check the resulting flow_keys only if BPF_OK returned */ + if (topts.retval != BPF_OK) + continue; ASSERT_EQ(topts.data_size_out, sizeof(flow_keys), "test_run data_size_out"); CHECK_FLOW_KEYS(tests[i].name, flow_keys, tests[i].keys); diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c index 36afb409c25f..c7a47b57ac91 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c @@ -44,7 +44,7 @@ void serial_test_flow_dissector_load_bytes(void) ASSERT_OK(err, "test_run"); ASSERT_EQ(topts.data_size_out, sizeof(flow_keys), "test_run data_size_out"); - ASSERT_EQ(topts.retval, 1, "test_run retval"); + ASSERT_EQ(topts.retval, BPF_OK, "test_run retval"); if (fd >= -1) close(fd); diff --git a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c index 16048978a1ef..858e0575f502 100644 --- a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c +++ b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c @@ -84,8 +84,8 @@ static void get_stack_print_output(void *ctx, int cpu, void *data, __u32 size) void test_get_stack_raw_tp(void) { - const char *file = "./test_get_stack_rawtp.o"; - const char *file_err = "./test_get_stack_rawtp_err.o"; + const char *file = "./test_get_stack_rawtp.bpf.o"; + const char *file_err = "./test_get_stack_rawtp_err.bpf.o"; const char *prog_name = "bpf_prog1"; int i, err, prog_fd, exp_cnt = MAX_CNT_RAWTP; struct perf_buffer *pb = NULL; diff --git a/tools/testing/selftests/bpf/prog_tests/global_data.c b/tools/testing/selftests/bpf/prog_tests/global_data.c index 027685858925..fadfb64e2a71 100644 --- a/tools/testing/selftests/bpf/prog_tests/global_data.c +++ b/tools/testing/selftests/bpf/prog_tests/global_data.c @@ -131,7 +131,7 @@ static void test_global_data_rdonly(struct bpf_object *obj, __u32 duration) void 
test_global_data(void) { - const char *file = "./test_global_data.o"; + const char *file = "./test_global_data.bpf.o"; struct bpf_object *obj; int err, prog_fd; LIBBPF_OPTS(bpf_test_run_opts, topts, diff --git a/tools/testing/selftests/bpf/prog_tests/global_data_init.c b/tools/testing/selftests/bpf/prog_tests/global_data_init.c index 57331c606964..8466332d7406 100644 --- a/tools/testing/selftests/bpf/prog_tests/global_data_init.c +++ b/tools/testing/selftests/bpf/prog_tests/global_data_init.c @@ -3,7 +3,7 @@ void test_global_data_init(void) { - const char *file = "./test_global_data.o"; + const char *file = "./test_global_data.bpf.o"; int err = -ENOMEM, map_fd, zero = 0; __u8 *buff = NULL, *newval = NULL; struct bpf_object *obj; diff --git a/tools/testing/selftests/bpf/prog_tests/global_func_args.c b/tools/testing/selftests/bpf/prog_tests/global_func_args.c index 29039a36cce5..d997099f62d0 100644 --- a/tools/testing/selftests/bpf/prog_tests/global_func_args.c +++ b/tools/testing/selftests/bpf/prog_tests/global_func_args.c @@ -39,7 +39,7 @@ static void test_global_func_args0(struct bpf_object *obj) void test_global_func_args(void) { - const char *file = "./test_global_func_args.o"; + const char *file = "./test_global_func_args.bpf.o"; struct bpf_object *obj; int err, prog_fd; LIBBPF_OPTS(bpf_test_run_opts, topts, diff --git a/tools/testing/selftests/bpf/prog_tests/htab_update.c b/tools/testing/selftests/bpf/prog_tests/htab_update.c new file mode 100644 index 000000000000..2bc85f4814f4 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/htab_update.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2022. Huawei Technologies Co., Ltd */ +#define _GNU_SOURCE +#include <sched.h> +#include <stdbool.h> +#include <test_progs.h> +#include "htab_update.skel.h" + +struct htab_update_ctx { + int fd; + int loop; + bool stop; +}; + +static void test_reenter_update(void) +{ + struct htab_update *skel; + unsigned int key, value; + int err; + + skel = htab_update__open(); + if (!ASSERT_OK_PTR(skel, "htab_update__open")) + return; + + /* lookup_elem_raw() may be inlined and find_kernel_btf_id() will return -ESRCH */ + bpf_program__set_autoload(skel->progs.lookup_elem_raw, true); + err = htab_update__load(skel); + if (!ASSERT_TRUE(!err || err == -ESRCH, "htab_update__load") || err) + goto out; + + skel->bss->pid = getpid(); + err = htab_update__attach(skel); + if (!ASSERT_OK(err, "htab_update__attach")) + goto out; + + /* Will trigger the reentrancy of bpf_map_update_elem() */ + key = 0; + value = 0; + err = bpf_map_update_elem(bpf_map__fd(skel->maps.htab), &key, &value, 0); + if (!ASSERT_OK(err, "add element")) + goto out; + + ASSERT_EQ(skel->bss->update_err, -EBUSY, "no reentrancy"); +out: + htab_update__destroy(skel); +} + +static void *htab_update_thread(void *arg) +{ + struct htab_update_ctx *ctx = arg; + cpu_set_t cpus; + int i; + + /* Pinned on CPU 0 */ + CPU_ZERO(&cpus); + CPU_SET(0, &cpus); + pthread_setaffinity_np(pthread_self(), sizeof(cpus), &cpus); + + i = 0; + while (i++ < ctx->loop && !ctx->stop) { + unsigned int key = 0, value = 0; + int err; + + err = bpf_map_update_elem(ctx->fd, &key, &value, 0); + if (err) { + ctx->stop = true; + return (void *)(long)err; + } + } + + return NULL; +} + +static void test_concurrent_update(void) +{ + struct htab_update_ctx ctx; + struct htab_update *skel; + unsigned int i, nr; + pthread_t *tids; + int err; + + skel = htab_update__open_and_load(); + if (!ASSERT_OK_PTR(skel, "htab_update__open_and_load")) + return; + + ctx.fd = 
bpf_map__fd(skel->maps.htab); + ctx.loop = 1000; + ctx.stop = false; + + nr = 4; + tids = calloc(nr, sizeof(*tids)); + if (!ASSERT_NEQ(tids, NULL, "no mem")) + goto out; + + for (i = 0; i < nr; i++) { + err = pthread_create(&tids[i], NULL, htab_update_thread, &ctx); + if (!ASSERT_OK(err, "pthread_create")) { + unsigned int j; + + ctx.stop = true; + for (j = 0; j < i; j++) + pthread_join(tids[j], NULL); + goto out; + } + } + + for (i = 0; i < nr; i++) { + void *thread_err = NULL; + + pthread_join(tids[i], &thread_err); + ASSERT_EQ(thread_err, NULL, "update error"); + } + +out: + if (tids) + free(tids); + htab_update__destroy(skel); +} + +void test_htab_update(void) +{ + if (test__start_subtest("reenter_update")) + test_reenter_update(); + if (test__start_subtest("concurrent_update")) + test_concurrent_update(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c index 1cee6957285e..73579370bfbd 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c +++ b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c @@ -69,7 +69,7 @@ void serial_test_kfree_skb(void) const int zero = 0; bool test_ok[2]; - err = bpf_prog_test_load("./test_pkt_access.o", BPF_PROG_TYPE_SCHED_CLS, + err = bpf_prog_test_load("./test_pkt_access.bpf.o", BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd); if (CHECK(err, "prog_load sched cls", "err %d errno %d\n", err, errno)) return; diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c index 351fafa006fb..eede7c304f86 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c @@ -109,7 +109,7 @@ static void test_destructive(void) { __u64 save_caps = 0; - ASSERT_OK(test_destructive_open_and_load(), "succesful_load"); + ASSERT_OK(test_destructive_open_and_load(), "successful_load"); if (!ASSERT_OK(cap_disable_effective(1ULL << CAP_SYS_BOOT, &save_caps), "drop_caps")) return; diff --git a/tools/testing/selftests/bpf/prog_tests/l4lb_all.c b/tools/testing/selftests/bpf/prog_tests/l4lb_all.c index 55f733ff4109..9c1a18573ffd 100644 --- a/tools/testing/selftests/bpf/prog_tests/l4lb_all.c +++ b/tools/testing/selftests/bpf/prog_tests/l4lb_all.c @@ -90,7 +90,7 @@ out: void test_l4lb_all(void) { if (test__start_subtest("l4lb_inline")) - test_l4lb("test_l4lb.o"); + test_l4lb("test_l4lb.bpf.o"); if (test__start_subtest("l4lb_noinline")) - test_l4lb("test_l4lb_noinline.o"); + test_l4lb("test_l4lb_noinline.bpf.o"); } diff --git a/tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c b/tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c index 4e0b2ec057aa..581c0eb0a0a1 100644 --- a/tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c +++ b/tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c @@ -27,8 +27,8 @@ void test_load_bytes_relative(void) if (CHECK_FAIL(server_fd < 0)) goto close_cgroup_fd; - err = bpf_prog_test_load("./load_bytes_relative.o", BPF_PROG_TYPE_CGROUP_SKB, - &obj, &prog_fd); + err = bpf_prog_test_load("./load_bytes_relative.bpf.o", BPF_PROG_TYPE_CGROUP_SKB, + &obj, &prog_fd); if (CHECK_FAIL(err)) goto close_server_fd; diff --git a/tools/testing/selftests/bpf/prog_tests/map_lock.c b/tools/testing/selftests/bpf/prog_tests/map_lock.c index e4e99b37df64..1d6726f01dd2 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/map_lock.c @@ -49,7 +49,7 @@ out: void test_map_lock(void) { - const char *file = 
"./test_map_lock.o"; + const char *file = "./test_map_lock.bpf.o"; int prog_fd, map_fd[2], vars[17] = {}; pthread_t thread_id[6]; struct bpf_object *obj = NULL; diff --git a/tools/testing/selftests/bpf/prog_tests/pinning.c b/tools/testing/selftests/bpf/prog_tests/pinning.c index 31c09ba577eb..d95cee5867b7 100644 --- a/tools/testing/selftests/bpf/prog_tests/pinning.c +++ b/tools/testing/selftests/bpf/prog_tests/pinning.c @@ -26,13 +26,13 @@ __u32 get_map_id(struct bpf_object *obj, const char *name) void test_pinning(void) { - const char *file_invalid = "./test_pinning_invalid.o"; + const char *file_invalid = "./test_pinning_invalid.bpf.o"; const char *custpinpath = "/sys/fs/bpf/custom/pinmap"; const char *nopinpath = "/sys/fs/bpf/nopinmap"; const char *nopinpath2 = "/sys/fs/bpf/nopinmap2"; const char *custpath = "/sys/fs/bpf/custom"; const char *pinpath = "/sys/fs/bpf/pinmap"; - const char *file = "./test_pinning.o"; + const char *file = "./test_pinning.bpf.o"; __u32 map_id, map_id2, duration = 0; struct stat statbuf = {}; struct bpf_object *obj; diff --git a/tools/testing/selftests/bpf/prog_tests/pkt_access.c b/tools/testing/selftests/bpf/prog_tests/pkt_access.c index 0bcccdc34fbc..682e4ff45b01 100644 --- a/tools/testing/selftests/bpf/prog_tests/pkt_access.c +++ b/tools/testing/selftests/bpf/prog_tests/pkt_access.c @@ -4,7 +4,7 @@ void test_pkt_access(void) { - const char *file = "./test_pkt_access.o"; + const char *file = "./test_pkt_access.bpf.o"; struct bpf_object *obj; int err, prog_fd; LIBBPF_OPTS(bpf_test_run_opts, topts, diff --git a/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c b/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c index 00ee1dd792aa..0d85e0642811 100644 --- a/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c +++ b/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c @@ -4,7 +4,7 @@ void test_pkt_md_access(void) { - const char *file = "./test_pkt_md_access.o"; + const char *file = "./test_pkt_md_access.bpf.o"; struct bpf_object *obj; int err, prog_fd; LIBBPF_OPTS(bpf_test_run_opts, topts, diff --git a/tools/testing/selftests/bpf/prog_tests/probe_user.c b/tools/testing/selftests/bpf/prog_tests/probe_user.c index 34dbd2adc157..8721671321de 100644 --- a/tools/testing/selftests/bpf/prog_tests/probe_user.c +++ b/tools/testing/selftests/bpf/prog_tests/probe_user.c @@ -11,7 +11,7 @@ void serial_test_probe_user(void) #endif }; enum { prog_count = ARRAY_SIZE(prog_names) }; - const char *obj_file = "./test_probe_user.o"; + const char *obj_file = "./test_probe_user.bpf.o"; DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, ); int err, results_map_fd, sock_fd, duration = 0; struct sockaddr curr, orig, tmp; diff --git a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c index d2743fc10032..722c5f2a7776 100644 --- a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c +++ b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c @@ -28,9 +28,9 @@ static void test_queue_stack_map_by_type(int type) vals[i] = rand(); if (type == QUEUE) - strncpy(file, "./test_queue_map.o", sizeof(file)); + strncpy(file, "./test_queue_map.bpf.o", sizeof(file)); else if (type == STACK) - strncpy(file, "./test_stack_map.o", sizeof(file)); + strncpy(file, "./test_stack_map.bpf.o", sizeof(file)); else return; diff --git a/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c b/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c index fd5d2ddfb062..19e2f2526dbd 100644 --- 
a/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c +++ b/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c @@ -16,7 +16,7 @@ struct rdonly_map_subtest { void test_rdonly_maps(void) { - const char *file = "test_rdonly_maps.o"; + const char *file = "test_rdonly_maps.bpf.o"; struct rdonly_map_subtest subtests[] = { { "skip loop", "skip_loop", 0, 0 }, { "part loop", "part_loop", 3, 2 + 3 + 4 }, diff --git a/tools/testing/selftests/bpf/prog_tests/reference_tracking.c b/tools/testing/selftests/bpf/prog_tests/reference_tracking.c index 739d2ea6ca55..d863205bbe95 100644 --- a/tools/testing/selftests/bpf/prog_tests/reference_tracking.c +++ b/tools/testing/selftests/bpf/prog_tests/reference_tracking.c @@ -3,7 +3,7 @@ void test_reference_tracking(void) { - const char *file = "test_sk_lookup_kern.o"; + const char *file = "test_sk_lookup_kern.bpf.o"; const char *obj_name = "ref_track"; DECLARE_LIBBPF_OPTS(bpf_object_open_opts, open_opts, .object_name = obj_name, diff --git a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c index c197261d02e2..f81d08d429a2 100644 --- a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c +++ b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c @@ -101,7 +101,7 @@ static int resolve_symbols(void) int type_id; __u32 nr; - btf = btf__parse_elf("btf_data.o", NULL); + btf = btf__parse_elf("btf_data.bpf.o", NULL); if (CHECK(libbpf_get_error(btf), "resolve", "Failed to load BTF from btf_data.o\n")) return -1; diff --git a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c index 1cbd8cd64044..64c5f5eb2994 100644 --- a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c +++ b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c @@ -91,9 +91,9 @@ static int prepare_bpf_obj(void) struct bpf_map *map; int err; - obj = bpf_object__open("test_select_reuseport_kern.o"); + obj = bpf_object__open("test_select_reuseport_kern.bpf.o"); err = libbpf_get_error(obj); - RET_ERR(err, "open test_select_reuseport_kern.o", + RET_ERR(err, "open test_select_reuseport_kern.bpf.o", "obj:%p PTR_ERR(obj):%d\n", obj, err); map = bpf_object__find_map_by_name(obj, "outer_map"); diff --git a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c new file mode 100644 index 000000000000..018611e6b248 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) Meta Platforms, Inc. and affiliates. 
*/ + +#define _GNU_SOURCE +#include <sched.h> +#include <linux/socket.h> +#include <net/if.h> + +#include "test_progs.h" +#include "cgroup_helpers.h" +#include "network_helpers.h" + +#include "setget_sockopt.skel.h" + +#define CG_NAME "/setget-sockopt-test" + +static const char addr4_str[] = "127.0.0.1"; +static const char addr6_str[] = "::1"; +static struct setget_sockopt *skel; +static int cg_fd; + +static int create_netns(void) +{ + if (!ASSERT_OK(unshare(CLONE_NEWNET), "create netns")) + return -1; + + if (!ASSERT_OK(system("ip link set dev lo up"), "set lo up")) + return -1; + + if (!ASSERT_OK(system("ip link add dev binddevtest1 type veth peer name binddevtest2"), + "add veth")) + return -1; + + if (!ASSERT_OK(system("ip link set dev binddevtest1 up"), + "bring veth up")) + return -1; + + return 0; +} + +static void test_tcp(int family) +{ + struct setget_sockopt__bss *bss = skel->bss; + int sfd, cfd; + + memset(bss, 0, sizeof(*bss)); + + sfd = start_server(family, SOCK_STREAM, + family == AF_INET6 ? addr6_str : addr4_str, 0, 0); + if (!ASSERT_GE(sfd, 0, "start_server")) + return; + + cfd = connect_to_fd(sfd, 0); + if (!ASSERT_GE(cfd, 0, "connect_to_fd_server")) { + close(sfd); + return; + } + close(sfd); + close(cfd); + + ASSERT_EQ(bss->nr_listen, 1, "nr_listen"); + ASSERT_EQ(bss->nr_connect, 1, "nr_connect"); + ASSERT_EQ(bss->nr_active, 1, "nr_active"); + ASSERT_EQ(bss->nr_passive, 1, "nr_passive"); + ASSERT_EQ(bss->nr_socket_post_create, 2, "nr_socket_post_create"); + ASSERT_EQ(bss->nr_binddev, 2, "nr_bind"); +} + +static void test_udp(int family) +{ + struct setget_sockopt__bss *bss = skel->bss; + int sfd; + + memset(bss, 0, sizeof(*bss)); + + sfd = start_server(family, SOCK_DGRAM, + family == AF_INET6 ? addr6_str : addr4_str, 0, 0); + if (!ASSERT_GE(sfd, 0, "start_server")) + return; + close(sfd); + + ASSERT_GE(bss->nr_socket_post_create, 1, "nr_socket_post_create"); + ASSERT_EQ(bss->nr_binddev, 1, "nr_bind"); +} + +void test_setget_sockopt(void) +{ + cg_fd = test__join_cgroup(CG_NAME); + if (cg_fd < 0) + return; + + if (create_netns()) + goto done; + + skel = setget_sockopt__open(); + if (!ASSERT_OK_PTR(skel, "open skel")) + goto done; + + strcpy(skel->rodata->veth, "binddevtest1"); + skel->rodata->veth_ifindex = if_nametoindex("binddevtest1"); + if (!ASSERT_GT(skel->rodata->veth_ifindex, 0, "if_nametoindex")) + goto done; + + if (!ASSERT_OK(setget_sockopt__load(skel), "load skel")) + goto done; + + skel->links.skops_sockopt = + bpf_program__attach_cgroup(skel->progs.skops_sockopt, cg_fd); + if (!ASSERT_OK_PTR(skel->links.skops_sockopt, "attach cgroup")) + goto done; + + skel->links.socket_post_create = + bpf_program__attach_cgroup(skel->progs.socket_post_create, cg_fd); + if (!ASSERT_OK_PTR(skel->links.socket_post_create, "attach_cgroup")) + goto done; + + test_tcp(AF_INET6); + test_tcp(AF_INET); + test_udp(AF_INET6); + test_udp(AF_INET); + +done: + setget_sockopt__destroy(skel); + close(cg_fd); +} diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c index 1d272e05188e..3e190ed63976 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c @@ -47,7 +47,7 @@ configure_stack(void) if (CHECK_FAIL(system("tc qdisc add dev lo clsact"))) return false; sprintf(tc_cmd, "%s %s %s %s", "tc filter add dev lo ingress bpf", - "direct-action object-file ./test_sk_assign.o", + "direct-action object-file ./test_sk_assign.bpf.o", "section tc", (env.verbosity < 
VERBOSE_VERY) ? " 2>/dev/null" : "verbose"); if (CHECK(system(tc_cmd), "BPF load failed;", diff --git a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c index ce0e555b5e38..33f950e2dae3 100644 --- a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c +++ b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c @@ -31,7 +31,7 @@ void test_skb_ctx(void) struct bpf_object *obj; int err, prog_fd, i; - err = bpf_prog_test_load("./test_skb_ctx.o", BPF_PROG_TYPE_SCHED_CLS, + err = bpf_prog_test_load("./test_skb_ctx.bpf.o", BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd); if (!ASSERT_OK(err, "load")) return; diff --git a/tools/testing/selftests/bpf/prog_tests/skb_helpers.c b/tools/testing/selftests/bpf/prog_tests/skb_helpers.c index 97dc8b14be48..f7ee25f290f7 100644 --- a/tools/testing/selftests/bpf/prog_tests/skb_helpers.c +++ b/tools/testing/selftests/bpf/prog_tests/skb_helpers.c @@ -20,7 +20,7 @@ void test_skb_helpers(void) struct bpf_object *obj; int err, prog_fd; - err = bpf_prog_test_load("./test_skb_helpers.o", + err = bpf_prog_test_load("./test_skb_helpers.bpf.o", BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd); if (!ASSERT_OK(err, "load")) return; diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c index 8ed78a9383ba..c5cb6e8374b6 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c @@ -174,7 +174,7 @@ static void run_test(int cgroup_fd) pthread_t tid; int err; - obj = bpf_object__open_file("sockopt_inherit.o", NULL); + obj = bpf_object__open_file("sockopt_inherit.bpf.o", NULL); if (!ASSERT_OK_PTR(obj, "obj_open")) return; diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c b/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c index abce12ddcc37..28d592dc54a7 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c @@ -310,7 +310,7 @@ void test_sockopt_multi(void) if (CHECK_FAIL(cg_child < 0)) goto out; - obj = bpf_object__open_file("sockopt_multi.o", NULL); + obj = bpf_object__open_file("sockopt_multi.bpf.o", NULL); if (!ASSERT_OK_PTR(obj, "obj_load")) goto out; diff --git a/tools/testing/selftests/bpf/prog_tests/spinlock.c b/tools/testing/selftests/bpf/prog_tests/spinlock.c index 8e329eaee6d7..15eb1372d771 100644 --- a/tools/testing/selftests/bpf/prog_tests/spinlock.c +++ b/tools/testing/selftests/bpf/prog_tests/spinlock.c @@ -19,7 +19,7 @@ static void *spin_lock_thread(void *arg) void test_spinlock(void) { - const char *file = "./test_spin_lock.o"; + const char *file = "./test_spin_lock.bpf.o"; pthread_t thread_id[4]; struct bpf_object *obj = NULL; int prog_fd; diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c index 313f0a66232e..df59e4ae2951 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c @@ -6,7 +6,7 @@ void test_stacktrace_map(void) int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd; const char *prog_name = "oncpu"; int err, prog_fd, stack_trace_len; - const char *file = "./test_stacktrace_map.o"; + const char *file = "./test_stacktrace_map.bpf.o"; __u32 key, val, duration = 0; struct bpf_program *prog; struct bpf_object *obj; diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c 
b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c index 1cb8dd36bd8f..c6ef06f55cdb 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c @@ -5,7 +5,7 @@ void test_stacktrace_map_raw_tp(void) { const char *prog_name = "oncpu"; int control_map_fd, stackid_hmap_fd, stackmap_fd; - const char *file = "./test_stacktrace_map.o"; + const char *file = "./test_stacktrace_map.bpf.o"; __u32 key, val, duration = 0; int err, prog_fd; struct bpf_program *prog; diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c index 19c70880cfb3..58fe2c586ed7 100644 --- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c +++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c @@ -20,8 +20,8 @@ static void test_tailcall_1(void) .repeat = 1, ); - err = bpf_prog_test_load("tailcall1.o", BPF_PROG_TYPE_SCHED_CLS, &obj, - &prog_fd); + err = bpf_prog_test_load("tailcall1.bpf.o", BPF_PROG_TYPE_SCHED_CLS, &obj, + &prog_fd); if (CHECK_FAIL(err)) return; @@ -156,8 +156,8 @@ static void test_tailcall_2(void) .repeat = 1, ); - err = bpf_prog_test_load("tailcall2.o", BPF_PROG_TYPE_SCHED_CLS, &obj, - &prog_fd); + err = bpf_prog_test_load("tailcall2.bpf.o", BPF_PROG_TYPE_SCHED_CLS, &obj, + &prog_fd); if (CHECK_FAIL(err)) return; @@ -299,7 +299,7 @@ out: */ static void test_tailcall_3(void) { - test_tailcall_count("tailcall3.o"); + test_tailcall_count("tailcall3.bpf.o"); } /* test_tailcall_6 checks that the count value of the tail call limit @@ -307,7 +307,7 @@ static void test_tailcall_3(void) */ static void test_tailcall_6(void) { - test_tailcall_count("tailcall6.o"); + test_tailcall_count("tailcall6.bpf.o"); } /* test_tailcall_4 checks that the kernel properly selects indirect jump @@ -329,8 +329,8 @@ static void test_tailcall_4(void) .repeat = 1, ); - err = bpf_prog_test_load("tailcall4.o", BPF_PROG_TYPE_SCHED_CLS, &obj, - &prog_fd); + err = bpf_prog_test_load("tailcall4.bpf.o", BPF_PROG_TYPE_SCHED_CLS, &obj, + &prog_fd); if (CHECK_FAIL(err)) return; @@ -419,8 +419,8 @@ static void test_tailcall_5(void) .repeat = 1, ); - err = bpf_prog_test_load("tailcall5.o", BPF_PROG_TYPE_SCHED_CLS, &obj, - &prog_fd); + err = bpf_prog_test_load("tailcall5.bpf.o", BPF_PROG_TYPE_SCHED_CLS, &obj, + &prog_fd); if (CHECK_FAIL(err)) return; @@ -507,8 +507,8 @@ static void test_tailcall_bpf2bpf_1(void) .repeat = 1, ); - err = bpf_prog_test_load("tailcall_bpf2bpf1.o", BPF_PROG_TYPE_SCHED_CLS, - &obj, &prog_fd); + err = bpf_prog_test_load("tailcall_bpf2bpf1.bpf.o", BPF_PROG_TYPE_SCHED_CLS, + &obj, &prog_fd); if (CHECK_FAIL(err)) return; @@ -591,8 +591,8 @@ static void test_tailcall_bpf2bpf_2(void) .repeat = 1, ); - err = bpf_prog_test_load("tailcall_bpf2bpf2.o", BPF_PROG_TYPE_SCHED_CLS, - &obj, &prog_fd); + err = bpf_prog_test_load("tailcall_bpf2bpf2.bpf.o", BPF_PROG_TYPE_SCHED_CLS, + &obj, &prog_fd); if (CHECK_FAIL(err)) return; @@ -671,8 +671,8 @@ static void test_tailcall_bpf2bpf_3(void) .repeat = 1, ); - err = bpf_prog_test_load("tailcall_bpf2bpf3.o", BPF_PROG_TYPE_SCHED_CLS, - &obj, &prog_fd); + err = bpf_prog_test_load("tailcall_bpf2bpf3.bpf.o", BPF_PROG_TYPE_SCHED_CLS, + &obj, &prog_fd); if (CHECK_FAIL(err)) return; @@ -766,8 +766,8 @@ static void test_tailcall_bpf2bpf_4(bool noise) .repeat = 1, ); - err = bpf_prog_test_load("tailcall_bpf2bpf4.o", BPF_PROG_TYPE_SCHED_CLS, - &obj, &prog_fd); + err = bpf_prog_test_load("tailcall_bpf2bpf4.bpf.o", BPF_PROG_TYPE_SCHED_CLS, + &obj, 
&prog_fd); if (CHECK_FAIL(err)) return; diff --git a/tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c b/tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c index 17947c9e1d66..3d34bab01e48 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c +++ b/tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c @@ -3,7 +3,7 @@ void test_task_fd_query_rawtp(void) { - const char *file = "./test_get_stack_rawtp.o"; + const char *file = "./test_get_stack_rawtp.bpf.o"; __u64 probe_offset, probe_addr; __u32 len, prog_id, fd_type; struct bpf_object *obj; diff --git a/tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c b/tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c index c2a98a7a8dfc..c717741bf8b6 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c +++ b/tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c @@ -4,7 +4,7 @@ static void test_task_fd_query_tp_core(const char *probe_name, const char *tp_name) { - const char *file = "./test_tracepoint.o"; + const char *file = "./test_tracepoint.bpf.o"; int err, bytes, efd, prog_fd, pmu_fd; struct perf_event_attr attr = {}; __u64 probe_offset, probe_addr; diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_estats.c b/tools/testing/selftests/bpf/prog_tests/tcp_estats.c index 11bf755be4c9..032dbfb26256 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_estats.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_estats.c @@ -3,7 +3,7 @@ void test_tcp_estats(void) { - const char *file = "./test_tcp_estats.o"; + const char *file = "./test_tcp_estats.bpf.o"; int err, prog_fd; struct bpf_object *obj; __u32 duration = 0; diff --git a/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c b/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c index 2559bb775762..a0054019e677 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c +++ b/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c @@ -9,18 +9,10 @@ #include "bprm_opts.skel.h" #include "network_helpers.h" - -#ifndef __NR_pidfd_open -#define __NR_pidfd_open 434 -#endif +#include "task_local_storage_helpers.h" static const char * const bash_envp[] = { "TMPDIR=shouldnotbeset", NULL }; -static inline int sys_pidfd_open(pid_t pid, unsigned int flags) -{ - return syscall(__NR_pidfd_open, pid, flags); -} - static int update_storage(int map_fd, int secureexec) { int task_fd, ret = 0; diff --git a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c index b90ee47d3111..7295cc60f724 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c +++ b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c @@ -65,23 +65,23 @@ struct test_def { void test_test_global_funcs(void) { struct test_def tests[] = { - { "test_global_func1.o", "combined stack size of 4 calls is 544" }, - { "test_global_func2.o" }, - { "test_global_func3.o" , "the call stack of 8 frames" }, - { "test_global_func4.o" }, - { "test_global_func5.o" , "expected pointer to ctx, but got PTR" }, - { "test_global_func6.o" , "modified ctx ptr R2" }, - { "test_global_func7.o" , "foo() doesn't return scalar" }, - { "test_global_func8.o" }, - { "test_global_func9.o" }, - { "test_global_func10.o", "invalid indirect read from stack" }, - { "test_global_func11.o", "Caller passes invalid args into func#1" }, - { "test_global_func12.o", "invalid mem access 'mem_or_null'" }, - { "test_global_func13.o", "Caller passes invalid args into func#1" }, - { "test_global_func14.o", 
"reference type('FWD S') size cannot be determined" }, - { "test_global_func15.o", "At program exit the register R0 has value" }, - { "test_global_func16.o", "invalid indirect read from stack" }, - { "test_global_func17.o", "Caller passes invalid args into func#1" }, + { "test_global_func1.bpf.o", "combined stack size of 4 calls is 544" }, + { "test_global_func2.bpf.o" }, + { "test_global_func3.bpf.o", "the call stack of 8 frames" }, + { "test_global_func4.bpf.o" }, + { "test_global_func5.bpf.o", "expected pointer to ctx, but got PTR" }, + { "test_global_func6.bpf.o", "modified ctx ptr R2" }, + { "test_global_func7.bpf.o", "foo() doesn't return scalar" }, + { "test_global_func8.bpf.o" }, + { "test_global_func9.bpf.o" }, + { "test_global_func10.bpf.o", "invalid indirect read from stack" }, + { "test_global_func11.bpf.o", "Caller passes invalid args into func#1" }, + { "test_global_func12.bpf.o", "invalid mem access 'mem_or_null'" }, + { "test_global_func13.bpf.o", "Caller passes invalid args into func#1" }, + { "test_global_func14.bpf.o", "reference type('FWD S') size cannot be determined" }, + { "test_global_func15.bpf.o", "At program exit the register R0 has value" }, + { "test_global_func16.bpf.o", "invalid indirect read from stack" }, + { "test_global_func17.bpf.o", "Caller passes invalid args into func#1" }, }; libbpf_print_fn_t old_print_fn = NULL; int err, i, duration = 0; diff --git a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c index 26ac26a88026..9c77cd6b1eaf 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c +++ b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c @@ -11,15 +11,7 @@ #include "local_storage.skel.h" #include "network_helpers.h" - -#ifndef __NR_pidfd_open -#define __NR_pidfd_open 434 -#endif - -static inline int sys_pidfd_open(pid_t pid, unsigned int flags) -{ - return syscall(__NR_pidfd_open, pid, flags); -} +#include "task_local_storage_helpers.h" static unsigned int duration; diff --git a/tools/testing/selftests/bpf/prog_tests/test_overhead.c b/tools/testing/selftests/bpf/prog_tests/test_overhead.c index 05acb376f74d..f27013e38d03 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_overhead.c +++ b/tools/testing/selftests/bpf/prog_tests/test_overhead.c @@ -72,7 +72,7 @@ void test_test_overhead(void) if (CHECK_FAIL(prctl(PR_GET_NAME, comm, 0L, 0L, 0L))) return; - obj = bpf_object__open_file("./test_overhead.o", NULL); + obj = bpf_object__open_file("./test_overhead.bpf.o", NULL); if (!ASSERT_OK_PTR(obj, "obj_open_file")) return; diff --git a/tools/testing/selftests/bpf/prog_tests/tp_attach_query.c b/tools/testing/selftests/bpf/prog_tests/tp_attach_query.c index 39e79291c82b..a479080533db 100644 --- a/tools/testing/selftests/bpf/prog_tests/tp_attach_query.c +++ b/tools/testing/selftests/bpf/prog_tests/tp_attach_query.c @@ -6,7 +6,7 @@ void serial_test_tp_attach_query(void) const int num_progs = 3; int i, j, bytes, efd, err, prog_fd[num_progs], pmu_fd[num_progs]; __u32 duration = 0, info_len, saved_prog_ids[num_progs]; - const char *file = "./test_tracepoint.o"; + const char *file = "./test_tracepoint.bpf.o"; struct perf_event_query_bpf *query; struct perf_event_attr attr = {}; struct bpf_object *obj[num_progs]; diff --git a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c index b0acbda6dbf5..564b75bc087f 100644 --- a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c +++ 
b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c @@ -35,7 +35,7 @@ static struct bpf_program *load_prog(char *file, char *name, struct inst *inst) /* TODO: use different target function to run in concurrent mode */ void serial_test_trampoline_count(void) { - char *file = "test_trampoline_count.o"; + char *file = "test_trampoline_count.bpf.o"; char *const progs[] = { "fentry_test", "fmod_ret_test", "fexit_test" }; struct inst inst[MAX_TRAMP_PROGS + 1] = {}; struct bpf_program *prog; diff --git a/tools/testing/selftests/bpf/prog_tests/xdp.c b/tools/testing/selftests/bpf/prog_tests/xdp.c index ec21c53cb1da..947863a1d536 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp.c @@ -8,7 +8,7 @@ void test_xdp(void) struct vip key6 = {.protocol = 6, .family = AF_INET6}; struct iptnl_info value4 = {.family = AF_INET}; struct iptnl_info value6 = {.family = AF_INET6}; - const char *file = "./test_xdp.o"; + const char *file = "./test_xdp.bpf.o"; struct bpf_object *obj; char buf[128]; struct ipv6hdr iph6; diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_frags.c b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_frags.c index 2f033da4cd45..fce203640f8c 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_frags.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_frags.c @@ -4,7 +4,7 @@ static void test_xdp_update_frags(void) { - const char *file = "./test_xdp_update_frags.o"; + const char *file = "./test_xdp_update_frags.bpf.o"; int err, prog_fd, max_skb_frags, buf_size, num; struct bpf_program *prog; struct bpf_object *obj; diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c index 21ceac24e174..9b9cf8458adf 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c @@ -4,7 +4,7 @@ static void test_xdp_adjust_tail_shrink(void) { - const char *file = "./test_xdp_adjust_tail_shrink.o"; + const char *file = "./test_xdp_adjust_tail_shrink.bpf.o"; __u32 expect_sz; struct bpf_object *obj; int err, prog_fd; @@ -39,7 +39,7 @@ static void test_xdp_adjust_tail_shrink(void) static void test_xdp_adjust_tail_grow(void) { - const char *file = "./test_xdp_adjust_tail_grow.o"; + const char *file = "./test_xdp_adjust_tail_grow.bpf.o"; struct bpf_object *obj; char buf[4096]; /* avoid segfault: large buf to hold grow results */ __u32 expect_sz; @@ -73,7 +73,7 @@ static void test_xdp_adjust_tail_grow(void) static void test_xdp_adjust_tail_grow2(void) { - const char *file = "./test_xdp_adjust_tail_grow.o"; + const char *file = "./test_xdp_adjust_tail_grow.bpf.o"; char buf[4096]; /* avoid segfault: large buf to hold grow results */ int tailroom = 320; /* SKB_DATA_ALIGN(sizeof(struct skb_shared_info))*/; struct bpf_object *obj; @@ -135,7 +135,7 @@ static void test_xdp_adjust_tail_grow2(void) static void test_xdp_adjust_frags_tail_shrink(void) { - const char *file = "./test_xdp_adjust_tail_shrink.o"; + const char *file = "./test_xdp_adjust_tail_shrink.bpf.o"; __u32 exp_size; struct bpf_program *prog; struct bpf_object *obj; @@ -202,7 +202,7 @@ out: static void test_xdp_adjust_frags_tail_grow(void) { - const char *file = "./test_xdp_adjust_tail_grow.o"; + const char *file = "./test_xdp_adjust_tail_grow.bpf.o"; __u32 exp_size; struct bpf_program *prog; struct bpf_object *obj; diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_attach.c 
b/tools/testing/selftests/bpf/prog_tests/xdp_attach.c index 62aa3edda5e6..062fbc8c8e5e 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_attach.c @@ -8,7 +8,7 @@ void serial_test_xdp_attach(void) { __u32 duration = 0, id1, id2, id0 = 0, len; struct bpf_object *obj1, *obj2, *obj3; - const char *file = "./test_xdp.o"; + const char *file = "./test_xdp.bpf.o"; struct bpf_prog_info info = {}; int err, fd1, fd2, fd3; LIBBPF_OPTS(bpf_xdp_attach_opts, opts); diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_info.c b/tools/testing/selftests/bpf/prog_tests/xdp_info.c index 0d01ff6cb91a..cd3aa340e65e 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_info.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_info.c @@ -7,7 +7,7 @@ void serial_test_xdp_info(void) { __u32 len = sizeof(struct bpf_prog_info), duration = 0, prog_id; - const char *file = "./xdp_dummy.o"; + const char *file = "./xdp_dummy.bpf.o"; struct bpf_prog_info info = {}; struct bpf_object *obj; int err, prog_fd; diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_perf.c b/tools/testing/selftests/bpf/prog_tests/xdp_perf.c index f543d1bd21b8..ec5369f247cb 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_perf.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_perf.c @@ -3,7 +3,7 @@ void test_xdp_perf(void) { - const char *file = "./xdp_dummy.o"; + const char *file = "./xdp_dummy.bpf.o"; struct bpf_object *obj; char in[128], out[128]; int err, prog_fd; diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c index 874a846e298c..75550a40e029 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c @@ -82,7 +82,7 @@ static void test_synproxy(bool xdp) SYS("ethtool -K tmp0 tx off"); if (xdp) /* Workaround required for veth. */ - SYS("ip link set tmp0 xdp object xdp_dummy.o section xdp 2> /dev/null"); + SYS("ip link set tmp0 xdp object xdp_dummy.bpf.o section xdp 2> /dev/null"); ns = open_netns("synproxy"); if (!ASSERT_OK_PTR(ns, "setns")) diff --git a/tools/testing/selftests/bpf/progs/bind4_prog.c b/tools/testing/selftests/bpf/progs/bind4_prog.c index 474c6a62078a..a487f60b73ac 100644 --- a/tools/testing/selftests/bpf/progs/bind4_prog.c +++ b/tools/testing/selftests/bpf/progs/bind4_prog.c @@ -6,8 +6,6 @@ #include <linux/bpf.h> #include <linux/in.h> #include <linux/in6.h> -#include <sys/socket.h> -#include <netinet/tcp.h> #include <linux/if.h> #include <errno.h> diff --git a/tools/testing/selftests/bpf/progs/bind6_prog.c b/tools/testing/selftests/bpf/progs/bind6_prog.c index c19cfa869f30..d62cd9e9cf0e 100644 --- a/tools/testing/selftests/bpf/progs/bind6_prog.c +++ b/tools/testing/selftests/bpf/progs/bind6_prog.c @@ -6,8 +6,6 @@ #include <linux/bpf.h> #include <linux/in.h> #include <linux/in6.h> -#include <sys/socket.h> -#include <netinet/tcp.h> #include <linux/if.h> #include <errno.h> diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c index f266c757b3df..a20c5ed5e454 100644 --- a/tools/testing/selftests/bpf/progs/bpf_flow.c +++ b/tools/testing/selftests/bpf/progs/bpf_flow.c @@ -22,6 +22,8 @@ #define PROG(F) PROG_(F, _##F) #define PROG_(NUM, NAME) SEC("flow_dissector") int flow_dissector_##NUM +#define FLOW_CONTINUE_SADDR 0x7f00007f /* 127.0.0.127 */ + /* These are the identifiers of the BPF programs that will be used in tail * calls. 
Name is limited to 16 characters, with the terminating character and * bpf_func_ above, we have only 6 to work with, anything after will be cropped. @@ -143,6 +145,19 @@ int _dissect(struct __sk_buff *skb) { struct bpf_flow_keys *keys = skb->flow_keys; + if (keys->n_proto == bpf_htons(ETH_P_IP)) { + /* IP traffic from FLOW_CONTINUE_SADDR falls-back to + * standard dissector + */ + struct iphdr *iph, _iph; + + iph = bpf_flow_dissect_get_header(skb, sizeof(*iph), &_iph); + if (iph && iph->ihl == 5 && + iph->saddr == bpf_htonl(FLOW_CONTINUE_SADDR)) { + return BPF_FLOW_DISSECTOR_CONTINUE; + } + } + return parse_eth_proto(skb, keys->n_proto); } diff --git a/tools/testing/selftests/bpf/progs/bpf_iter.h b/tools/testing/selftests/bpf/progs/bpf_iter.h index e9846606690d..c41ee80533ca 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter.h +++ b/tools/testing/selftests/bpf/progs/bpf_iter.h @@ -17,6 +17,7 @@ #define bpf_iter__bpf_sk_storage_map bpf_iter__bpf_sk_storage_map___not_used #define bpf_iter__sockmap bpf_iter__sockmap___not_used #define bpf_iter__bpf_link bpf_iter__bpf_link___not_used +#define bpf_iter__cgroup bpf_iter__cgroup___not_used #define btf_ptr btf_ptr___not_used #define BTF_F_COMPACT BTF_F_COMPACT___not_used #define BTF_F_NONAME BTF_F_NONAME___not_used @@ -40,6 +41,7 @@ #undef bpf_iter__bpf_sk_storage_map #undef bpf_iter__sockmap #undef bpf_iter__bpf_link +#undef bpf_iter__cgroup #undef btf_ptr #undef BTF_F_COMPACT #undef BTF_F_NONAME @@ -141,6 +143,11 @@ struct bpf_iter__bpf_link { struct bpf_link *link; }; +struct bpf_iter__cgroup { + struct bpf_iter_meta *meta; + struct cgroup *cgroup; +} __attribute__((preserve_access_index)); + struct btf_ptr { void *ptr; __u32 type_id; diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index 98dd2c4815f0..adb087aecc9e 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -6,13 +6,41 @@ #define AF_INET6 10 #define SOL_SOCKET 1 +#define SO_REUSEADDR 2 #define SO_SNDBUF 7 -#define __SO_ACCEPTCON (1 << 16) +#define SO_RCVBUF 8 +#define SO_KEEPALIVE 9 #define SO_PRIORITY 12 +#define SO_REUSEPORT 15 +#define SO_RCVLOWAT 18 +#define SO_BINDTODEVICE 25 +#define SO_MARK 36 +#define SO_MAX_PACING_RATE 47 +#define SO_BINDTOIFINDEX 62 +#define SO_TXREHASH 74 +#define __SO_ACCEPTCON (1 << 16) + +#define IP_TOS 1 + +#define IPV6_TCLASS 67 +#define IPV6_AUTOFLOWLABEL 70 #define SOL_TCP 6 +#define TCP_NODELAY 1 +#define TCP_MAXSEG 2 +#define TCP_KEEPIDLE 4 +#define TCP_KEEPINTVL 5 +#define TCP_KEEPCNT 6 +#define TCP_SYNCNT 7 +#define TCP_WINDOW_CLAMP 10 #define TCP_CONGESTION 13 +#define TCP_THIN_LINEAR_TIMEOUTS 16 +#define TCP_USER_TIMEOUT 18 +#define TCP_NOTSENT_LOWAT 25 +#define TCP_SAVE_SYN 27 +#define TCP_SAVED_SYN 28 #define TCP_CA_NAME_MAX 16 +#define TCP_NAGLE_OFF 1 #define ICSK_TIME_RETRANS 1 #define ICSK_TIME_PROBE0 3 @@ -49,6 +77,8 @@ #define sk_state __sk_common.skc_state #define sk_v6_daddr __sk_common.skc_v6_daddr #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr +#define sk_flags __sk_common.skc_flags +#define sk_reuse __sk_common.skc_reuse #define s6_addr32 in6_u.u6_addr32 diff --git a/tools/testing/selftests/bpf/progs/cb_refs.c b/tools/testing/selftests/bpf/progs/cb_refs.c new file mode 100644 index 000000000000..7653df1bc787 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cb_refs.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <vmlinux.h> +#include <bpf/bpf_tracing.h> 
+#include <bpf/bpf_helpers.h> + +struct map_value { + struct prog_test_ref_kfunc __kptr_ref *ptr; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, struct map_value); + __uint(max_entries, 16); +} array_map SEC(".maps"); + +extern struct prog_test_ref_kfunc *bpf_kfunc_call_test_acquire(unsigned long *sp) __ksym; +extern void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) __ksym; + +static __noinline int cb1(void *map, void *key, void *value, void *ctx) +{ + void *p = *(void **)ctx; + bpf_kfunc_call_test_release(p); + /* Without the fix this would cause underflow */ + return 0; +} + +SEC("?tc") +int underflow_prog(void *ctx) +{ + struct prog_test_ref_kfunc *p; + unsigned long sl = 0; + + p = bpf_kfunc_call_test_acquire(&sl); + if (!p) + return 0; + bpf_for_each_map_elem(&array_map, cb1, &p, 0); + return 0; +} + +static __always_inline int cb2(void *map, void *key, void *value, void *ctx) +{ + unsigned long sl = 0; + + *(void **)ctx = bpf_kfunc_call_test_acquire(&sl); + /* Without the fix this would leak memory */ + return 0; +} + +SEC("?tc") +int leak_prog(void *ctx) +{ + struct prog_test_ref_kfunc *p; + struct map_value *v; + unsigned long sl; + + v = bpf_map_lookup_elem(&array_map, &(int){0}); + if (!v) + return 0; + + p = NULL; + bpf_for_each_map_elem(&array_map, cb2, &p, 0); + p = bpf_kptr_xchg(&v->ptr, p); + if (p) + bpf_kfunc_call_test_release(p); + return 0; +} + +static __always_inline int cb(void *map, void *key, void *value, void *ctx) +{ + return 0; +} + +static __always_inline int cb3(void *map, void *key, void *value, void *ctx) +{ + unsigned long sl = 0; + void *p; + + bpf_kfunc_call_test_acquire(&sl); + bpf_for_each_map_elem(&array_map, cb, &p, 0); + /* It should only complain here, not in cb. This is why we need + * callback_ref to be set to frameno. + */ + return 0; +} + +SEC("?tc") +int nested_cb(void *ctx) +{ + struct prog_test_ref_kfunc *p; + unsigned long sl = 0; + int sp = 0; + + p = bpf_kfunc_call_test_acquire(&sl); + if (!p) + return 0; + bpf_for_each_map_elem(&array_map, cb3, &sp, 0); + bpf_kfunc_call_test_release(p); + return 0; +} + +SEC("?tc") +int non_cb_transfer_ref(void *ctx) +{ + struct prog_test_ref_kfunc *p; + unsigned long sl = 0; + + p = bpf_kfunc_call_test_acquire(&sl); + if (!p) + return 0; + cb1(NULL, NULL, NULL, &p); + bpf_kfunc_call_test_acquire(&sl); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/cgroup_getset_retval_hooks.c b/tools/testing/selftests/bpf/progs/cgroup_getset_retval_hooks.c new file mode 100644 index 000000000000..13dfb4bbfd28 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgroup_getset_retval_hooks.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +#define BPF_RETVAL_HOOK(name, section, ctx, expected_err) \ + __attribute__((__section__("?" section))) \ + int name(struct ctx *_ctx) \ + { \ + bpf_set_retval(bpf_get_retval()); \ + return 1; \ + } + +#include "cgroup_getset_retval_hooks.h" + +#undef BPF_RETVAL_HOOK diff --git a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c new file mode 100644 index 000000000000..8ab4253a1592 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Functions to manage eBPF programs attached to cgroup subsystems + * + * Copyright 2022 Google LLC. 
+ */ +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> + +char _license[] SEC("license") = "GPL"; + +/* + * Start times are stored per-task, not per-cgroup, as multiple tasks in one + * cgroup can perform reclaim concurrently. + */ +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, __u64); +} vmscan_start_time SEC(".maps"); + +struct vmscan_percpu { + /* Previous percpu state, to figure out if we have new updates */ + __u64 prev; + /* Current percpu state */ + __u64 state; +}; + +struct vmscan { + /* State propagated through children, pending aggregation */ + __u64 pending; + /* Total state, including all cpus and all children */ + __u64 state; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(max_entries, 100); + __type(key, __u64); + __type(value, struct vmscan_percpu); +} pcpu_cgroup_vmscan_elapsed SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 100); + __type(key, __u64); + __type(value, struct vmscan); +} cgroup_vmscan_elapsed SEC(".maps"); + +extern void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) __ksym; +extern void cgroup_rstat_flush(struct cgroup *cgrp) __ksym; + +static struct cgroup *task_memcg(struct task_struct *task) +{ + int cgrp_id; + +#if __has_builtin(__builtin_preserve_enum_value) + cgrp_id = bpf_core_enum_value(enum cgroup_subsys_id, memory_cgrp_id); +#else + cgrp_id = memory_cgrp_id; +#endif + return task->cgroups->subsys[cgrp_id]->cgroup; +} + +static uint64_t cgroup_id(struct cgroup *cgrp) +{ + return cgrp->kn->id; +} + +static int create_vmscan_percpu_elem(__u64 cg_id, __u64 state) +{ + struct vmscan_percpu pcpu_init = {.state = state, .prev = 0}; + + return bpf_map_update_elem(&pcpu_cgroup_vmscan_elapsed, &cg_id, + &pcpu_init, BPF_NOEXIST); +} + +static int create_vmscan_elem(__u64 cg_id, __u64 state, __u64 pending) +{ + struct vmscan init = {.state = state, .pending = pending}; + + return bpf_map_update_elem(&cgroup_vmscan_elapsed, &cg_id, + &init, BPF_NOEXIST); +} + +SEC("tp_btf/mm_vmscan_memcg_reclaim_begin") +int BPF_PROG(vmscan_start, int order, gfp_t gfp_flags) +{ + struct task_struct *task = bpf_get_current_task_btf(); + __u64 *start_time_ptr; + + start_time_ptr = bpf_task_storage_get(&vmscan_start_time, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (start_time_ptr) + *start_time_ptr = bpf_ktime_get_ns(); + return 0; +} + +SEC("tp_btf/mm_vmscan_memcg_reclaim_end") +int BPF_PROG(vmscan_end, unsigned long nr_reclaimed) +{ + struct vmscan_percpu *pcpu_stat; + struct task_struct *current = bpf_get_current_task_btf(); + struct cgroup *cgrp; + __u64 *start_time_ptr; + __u64 current_elapsed, cg_id; + __u64 end_time = bpf_ktime_get_ns(); + + /* + * cgrp is the first parent cgroup of current that has memcg enabled in + * its subtree_control, or NULL if memcg is disabled in the entire tree. + * In a cgroup hierarchy like this: + * a + * / \ + * b c + * If "a" has memcg enabled, while "b" doesn't, then processes in "b" + * will accumulate their stats directly to "a". This makes sure that no + * stats are lost from processes in leaf cgroups that don't have memcg + * enabled, but only exposes stats for cgroups that have memcg enabled. 
+ */ + cgrp = task_memcg(current); + if (!cgrp) + return 0; + + cg_id = cgroup_id(cgrp); + start_time_ptr = bpf_task_storage_get(&vmscan_start_time, current, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!start_time_ptr) + return 0; + + current_elapsed = end_time - *start_time_ptr; + pcpu_stat = bpf_map_lookup_elem(&pcpu_cgroup_vmscan_elapsed, + &cg_id); + if (pcpu_stat) + pcpu_stat->state += current_elapsed; + else if (create_vmscan_percpu_elem(cg_id, current_elapsed)) + return 0; + + cgroup_rstat_updated(cgrp, bpf_get_smp_processor_id()); + return 0; +} + +SEC("fentry/bpf_rstat_flush") +int BPF_PROG(vmscan_flush, struct cgroup *cgrp, struct cgroup *parent, int cpu) +{ + struct vmscan_percpu *pcpu_stat; + struct vmscan *total_stat, *parent_stat; + __u64 cg_id = cgroup_id(cgrp); + __u64 parent_cg_id = parent ? cgroup_id(parent) : 0; + __u64 *pcpu_vmscan; + __u64 state; + __u64 delta = 0; + + /* Add CPU changes on this level since the last flush */ + pcpu_stat = bpf_map_lookup_percpu_elem(&pcpu_cgroup_vmscan_elapsed, + &cg_id, cpu); + if (pcpu_stat) { + state = pcpu_stat->state; + delta += state - pcpu_stat->prev; + pcpu_stat->prev = state; + } + + total_stat = bpf_map_lookup_elem(&cgroup_vmscan_elapsed, &cg_id); + if (!total_stat) { + if (create_vmscan_elem(cg_id, delta, 0)) + return 0; + + goto update_parent; + } + + /* Collect pending stats from subtree */ + if (total_stat->pending) { + delta += total_stat->pending; + total_stat->pending = 0; + } + + /* Propagate changes to this cgroup's total */ + total_stat->state += delta; + +update_parent: + /* Skip if there are no changes to propagate, or no parent */ + if (!delta || !parent_cg_id) + return 0; + + /* Propagate changes to cgroup's parent */ + parent_stat = bpf_map_lookup_elem(&cgroup_vmscan_elapsed, + &parent_cg_id); + if (parent_stat) + parent_stat->pending += delta; + else + create_vmscan_elem(parent_cg_id, 0, delta); + return 0; +} + +SEC("iter.s/cgroup") +int BPF_PROG(dump_vmscan, struct bpf_iter_meta *meta, struct cgroup *cgrp) +{ + struct seq_file *seq = meta->seq; + struct vmscan *total_stat; + __u64 cg_id = cgrp ? cgroup_id(cgrp) : 0; + + /* Do nothing for the terminal call */ + if (!cg_id) + return 1; + + /* Flush the stats to make sure we get the most updated numbers */ + cgroup_rstat_flush(cgrp); + + total_stat = bpf_map_lookup_elem(&cgroup_vmscan_elapsed, &cg_id); + if (!total_stat) { + BPF_SEQ_PRINTF(seq, "cg_id: %llu, total_vmscan_delay: 0\n", + cg_id); + } else { + BPF_SEQ_PRINTF(seq, "cg_id: %llu, total_vmscan_delay: %llu\n", + cg_id, total_stat->state); + } + + /* + * We only dump stats for one cgroup here, so return 1 to stop + * iteration after the first cgroup. 
+ */ + return 1; +} diff --git a/tools/testing/selftests/bpf/progs/cgroup_iter.c b/tools/testing/selftests/bpf/progs/cgroup_iter.c new file mode 100644 index 000000000000..de03997322a7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgroup_iter.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Google */ + +#include "bpf_iter.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; +int terminate_early = 0; +u64 terminal_cgroup = 0; + +static inline u64 cgroup_id(struct cgroup *cgrp) +{ + return cgrp->kn->id; +} + +SEC("iter/cgroup") +int cgroup_id_printer(struct bpf_iter__cgroup *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct cgroup *cgrp = ctx->cgroup; + + /* epilogue */ + if (cgrp == NULL) { + BPF_SEQ_PRINTF(seq, "epilogue\n"); + return 0; + } + + /* prologue */ + if (ctx->meta->seq_num == 0) + BPF_SEQ_PRINTF(seq, "prologue\n"); + + BPF_SEQ_PRINTF(seq, "%8llu\n", cgroup_id(cgrp)); + + if (terminal_cgroup == cgroup_id(cgrp)) + return 1; + + return terminate_early ? 1 : 0; +} diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index b241932911db..ec25371de789 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -7,14 +7,15 @@ #include <linux/bpf.h> #include <linux/in.h> #include <linux/in6.h> -#include <sys/socket.h> -#include <netinet/tcp.h> +#include <linux/tcp.h> #include <linux/if.h> #include <errno.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> +#include "bpf_tcp_helpers.h" + #define SRC_REWRITE_IP4 0x7f000004U #define DST_REWRITE_IP4 0x7f000001U #define DST_REWRITE_PORT4 4444 diff --git a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c index 48cd14b43741..4547b059d487 100644 --- a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c +++ b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c @@ -73,10 +73,10 @@ int test_subprog2(struct args_subprog2 *ctx) __builtin_preserve_access_index(&skb->len)); ret = ctx->ret; - /* bpf_prog_test_load() loads "test_pkt_access.o" with BPF_F_TEST_RND_HI32 - * which randomizes upper 32 bits after BPF_ALU32 insns. - * Hence after 'w0 <<= 1' upper bits of $rax are random. - * That is expected and correct. Trim them. + /* bpf_prog_test_load() loads "test_pkt_access.bpf.o" with + * BPF_F_TEST_RND_HI32 which randomizes upper 32 bits after BPF_ALU32 + * insns. Hence after 'w0 <<= 1' upper bits of $rax are random. That is + * expected and correct. Trim them. */ ret = (__u32) ret; if (len != 74 || ret != 148) diff --git a/tools/testing/selftests/bpf/progs/htab_update.c b/tools/testing/selftests/bpf/progs/htab_update.c new file mode 100644 index 000000000000..7481bb30b29b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/htab_update.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2022. 
Huawei Technologies Co., Ltd */ +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} htab SEC(".maps"); + +int pid = 0; +int update_err = 0; + +SEC("?fentry/lookup_elem_raw") +int lookup_elem_raw(void *ctx) +{ + __u32 key = 0, value = 1; + + if ((bpf_get_current_pid_tgid() >> 32) != pid) + return 0; + + update_err = bpf_map_update_elem(&htab, &key, &value, 0); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/read_bpf_task_storage_busy.c b/tools/testing/selftests/bpf/progs/read_bpf_task_storage_busy.c new file mode 100644 index 000000000000..a47bb0120719 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/read_bpf_task_storage_busy.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2022. Huawei Technologies Co., Ltd */ +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +extern bool CONFIG_PREEMPT __kconfig __weak; +extern const int bpf_task_storage_busy __ksym; + +char _license[] SEC("license") = "GPL"; + +int pid = 0; +int busy = 0; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, long); +} task SEC(".maps"); + +SEC("raw_tp/sys_enter") +int BPF_PROG(read_bpf_task_storage_busy) +{ + int *value; + int key; + + if (!CONFIG_PREEMPT) + return 0; + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + value = bpf_this_cpu_ptr(&bpf_task_storage_busy); + if (value) + busy = *value; + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/setget_sockopt.c b/tools/testing/selftests/bpf/progs/setget_sockopt.c new file mode 100644 index 000000000000..9523333b8905 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/setget_sockopt.c @@ -0,0 +1,395 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) Meta Platforms, Inc. and affiliates. 
*/ + +#include "vmlinux.h" +#include "bpf_tracing_net.h" +#include <bpf/bpf_core_read.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#endif + +extern unsigned long CONFIG_HZ __kconfig; + +const volatile char veth[IFNAMSIZ]; +const volatile int veth_ifindex; + +int nr_listen; +int nr_passive; +int nr_active; +int nr_connect; +int nr_binddev; +int nr_socket_post_create; + +struct sockopt_test { + int opt; + int new; + int restore; + int expected; + int tcp_expected; + unsigned int flip:1; +}; + +static const char not_exist_cc[] = "not_exist"; +static const char cubic_cc[] = "cubic"; +static const char reno_cc[] = "reno"; + +static const struct sockopt_test sol_socket_tests[] = { + { .opt = SO_REUSEADDR, .flip = 1, }, + { .opt = SO_SNDBUF, .new = 8123, .expected = 8123 * 2, }, + { .opt = SO_RCVBUF, .new = 8123, .expected = 8123 * 2, }, + { .opt = SO_KEEPALIVE, .flip = 1, }, + { .opt = SO_PRIORITY, .new = 0xeb9f, .expected = 0xeb9f, }, + { .opt = SO_REUSEPORT, .flip = 1, }, + { .opt = SO_RCVLOWAT, .new = 8123, .expected = 8123, }, + { .opt = SO_MARK, .new = 0xeb9f, .expected = 0xeb9f, }, + { .opt = SO_MAX_PACING_RATE, .new = 0xeb9f, .expected = 0xeb9f, }, + { .opt = SO_TXREHASH, .flip = 1, }, + { .opt = 0, }, +}; + +static const struct sockopt_test sol_tcp_tests[] = { + { .opt = TCP_NODELAY, .flip = 1, }, + { .opt = TCP_KEEPIDLE, .new = 123, .expected = 123, .restore = 321, }, + { .opt = TCP_KEEPINTVL, .new = 123, .expected = 123, .restore = 321, }, + { .opt = TCP_KEEPCNT, .new = 123, .expected = 123, .restore = 124, }, + { .opt = TCP_SYNCNT, .new = 123, .expected = 123, .restore = 124, }, + { .opt = TCP_WINDOW_CLAMP, .new = 8123, .expected = 8123, .restore = 8124, }, + { .opt = TCP_CONGESTION, }, + { .opt = TCP_THIN_LINEAR_TIMEOUTS, .flip = 1, }, + { .opt = TCP_USER_TIMEOUT, .new = 123400, .expected = 123400, }, + { .opt = TCP_NOTSENT_LOWAT, .new = 1314, .expected = 1314, }, + { .opt = 0, }, +}; + +static const struct sockopt_test sol_ip_tests[] = { + { .opt = IP_TOS, .new = 0xe1, .expected = 0xe1, .tcp_expected = 0xe0, }, + { .opt = 0, }, +}; + +static const struct sockopt_test sol_ipv6_tests[] = { + { .opt = IPV6_TCLASS, .new = 0xe1, .expected = 0xe1, .tcp_expected = 0xe0, }, + { .opt = IPV6_AUTOFLOWLABEL, .flip = 1, }, + { .opt = 0, }, +}; + +struct loop_ctx { + void *ctx; + struct sock *sk; +}; + +static int bpf_test_sockopt_flip(void *ctx, struct sock *sk, + const struct sockopt_test *t, + int level) +{ + int old, tmp, new, opt = t->opt; + + opt = t->opt; + + if (bpf_getsockopt(ctx, level, opt, &old, sizeof(old))) + return 1; + /* kernel initialized txrehash to 255 */ + if (level == SOL_SOCKET && opt == SO_TXREHASH && old != 0 && old != 1) + old = 1; + + new = !old; + if (bpf_setsockopt(ctx, level, opt, &new, sizeof(new))) + return 1; + if (bpf_getsockopt(ctx, level, opt, &tmp, sizeof(tmp)) || + tmp != new) + return 1; + + if (bpf_setsockopt(ctx, level, opt, &old, sizeof(old))) + return 1; + + return 0; +} + +static int bpf_test_sockopt_int(void *ctx, struct sock *sk, + const struct sockopt_test *t, + int level) +{ + int old, tmp, new, expected, opt; + + opt = t->opt; + new = t->new; + if (sk->sk_type == SOCK_STREAM && t->tcp_expected) + expected = t->tcp_expected; + else + expected = t->expected; + + if (bpf_getsockopt(ctx, level, opt, &old, sizeof(old)) || + old == new) + return 1; + + if (bpf_setsockopt(ctx, level, opt, &new, sizeof(new))) + return 1; + if (bpf_getsockopt(ctx, level, opt, &tmp, 
sizeof(tmp)) || + tmp != expected) + return 1; + + if (t->restore) + old = t->restore; + if (bpf_setsockopt(ctx, level, opt, &old, sizeof(old))) + return 1; + + return 0; +} + +static int bpf_test_socket_sockopt(__u32 i, struct loop_ctx *lc) +{ + const struct sockopt_test *t; + + if (i >= ARRAY_SIZE(sol_socket_tests)) + return 1; + + t = &sol_socket_tests[i]; + if (!t->opt) + return 1; + + if (t->flip) + return bpf_test_sockopt_flip(lc->ctx, lc->sk, t, SOL_SOCKET); + + return bpf_test_sockopt_int(lc->ctx, lc->sk, t, SOL_SOCKET); +} + +static int bpf_test_ip_sockopt(__u32 i, struct loop_ctx *lc) +{ + const struct sockopt_test *t; + + if (i >= ARRAY_SIZE(sol_ip_tests)) + return 1; + + t = &sol_ip_tests[i]; + if (!t->opt) + return 1; + + if (t->flip) + return bpf_test_sockopt_flip(lc->ctx, lc->sk, t, IPPROTO_IP); + + return bpf_test_sockopt_int(lc->ctx, lc->sk, t, IPPROTO_IP); +} + +static int bpf_test_ipv6_sockopt(__u32 i, struct loop_ctx *lc) +{ + const struct sockopt_test *t; + + if (i >= ARRAY_SIZE(sol_ipv6_tests)) + return 1; + + t = &sol_ipv6_tests[i]; + if (!t->opt) + return 1; + + if (t->flip) + return bpf_test_sockopt_flip(lc->ctx, lc->sk, t, IPPROTO_IPV6); + + return bpf_test_sockopt_int(lc->ctx, lc->sk, t, IPPROTO_IPV6); +} + +static int bpf_test_tcp_sockopt(__u32 i, struct loop_ctx *lc) +{ + const struct sockopt_test *t; + struct sock *sk; + void *ctx; + + if (i >= ARRAY_SIZE(sol_tcp_tests)) + return 1; + + t = &sol_tcp_tests[i]; + if (!t->opt) + return 1; + + ctx = lc->ctx; + sk = lc->sk; + + if (t->opt == TCP_CONGESTION) { + char old_cc[16], tmp_cc[16]; + const char *new_cc; + int new_cc_len; + + if (!bpf_setsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, + (void *)not_exist_cc, sizeof(not_exist_cc))) + return 1; + if (bpf_getsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, old_cc, sizeof(old_cc))) + return 1; + if (!bpf_strncmp(old_cc, sizeof(old_cc), cubic_cc)) { + new_cc = reno_cc; + new_cc_len = sizeof(reno_cc); + } else { + new_cc = cubic_cc; + new_cc_len = sizeof(cubic_cc); + } + if (bpf_setsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, (void *)new_cc, + new_cc_len)) + return 1; + if (bpf_getsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, tmp_cc, sizeof(tmp_cc))) + return 1; + if (bpf_strncmp(tmp_cc, sizeof(tmp_cc), new_cc)) + return 1; + if (bpf_setsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, old_cc, sizeof(old_cc))) + return 1; + return 0; + } + + if (t->flip) + return bpf_test_sockopt_flip(ctx, sk, t, IPPROTO_TCP); + + return bpf_test_sockopt_int(ctx, sk, t, IPPROTO_TCP); +} + +static int bpf_test_sockopt(void *ctx, struct sock *sk) +{ + struct loop_ctx lc = { .ctx = ctx, .sk = sk, }; + __u16 family, proto; + int n; + + family = sk->sk_family; + proto = sk->sk_protocol; + + n = bpf_loop(ARRAY_SIZE(sol_socket_tests), bpf_test_socket_sockopt, &lc, 0); + if (n != ARRAY_SIZE(sol_socket_tests)) + return -1; + + if (proto == IPPROTO_TCP) { + n = bpf_loop(ARRAY_SIZE(sol_tcp_tests), bpf_test_tcp_sockopt, &lc, 0); + if (n != ARRAY_SIZE(sol_tcp_tests)) + return -1; + } + + if (family == AF_INET) { + n = bpf_loop(ARRAY_SIZE(sol_ip_tests), bpf_test_ip_sockopt, &lc, 0); + if (n != ARRAY_SIZE(sol_ip_tests)) + return -1; + } else { + n = bpf_loop(ARRAY_SIZE(sol_ipv6_tests), bpf_test_ipv6_sockopt, &lc, 0); + if (n != ARRAY_SIZE(sol_ipv6_tests)) + return -1; + } + + return 0; +} + +static int binddev_test(void *ctx) +{ + const char empty_ifname[] = ""; + int ifindex, zero = 0; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + (void *)veth, sizeof(veth))) + return -1; + if (bpf_getsockopt(ctx, SOL_SOCKET, 
SO_BINDTOIFINDEX, + &ifindex, sizeof(int)) || + ifindex != veth_ifindex) + return -1; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + (void *)empty_ifname, sizeof(empty_ifname))) + return -1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX, + &ifindex, sizeof(int)) || + ifindex != 0) + return -1; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX, + (void *)&veth_ifindex, sizeof(int))) + return -1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX, + &ifindex, sizeof(int)) || + ifindex != veth_ifindex) + return -1; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX, + &zero, sizeof(int))) + return -1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX, + &ifindex, sizeof(int)) || + ifindex != 0) + return -1; + + return 0; +} + +static int test_tcp_maxseg(void *ctx, struct sock *sk) +{ + int val = 1314, tmp; + + if (sk->sk_state != TCP_ESTABLISHED) + return bpf_setsockopt(ctx, IPPROTO_TCP, TCP_MAXSEG, + &val, sizeof(val)); + + if (bpf_getsockopt(ctx, IPPROTO_TCP, TCP_MAXSEG, &tmp, sizeof(tmp)) || + tmp > val) + return -1; + + return 0; +} + +static int test_tcp_saved_syn(void *ctx, struct sock *sk) +{ + __u8 saved_syn[20]; + int one = 1; + + if (sk->sk_state == TCP_LISTEN) + return bpf_setsockopt(ctx, IPPROTO_TCP, TCP_SAVE_SYN, + &one, sizeof(one)); + + return bpf_getsockopt(ctx, IPPROTO_TCP, TCP_SAVED_SYN, + saved_syn, sizeof(saved_syn)); +} + +SEC("lsm_cgroup/socket_post_create") +int BPF_PROG(socket_post_create, struct socket *sock, int family, + int type, int protocol, int kern) +{ + struct sock *sk = sock->sk; + + if (!sk) + return 1; + + nr_socket_post_create += !bpf_test_sockopt(sk, sk); + nr_binddev += !binddev_test(sk); + + return 1; +} + +SEC("sockops") +int skops_sockopt(struct bpf_sock_ops *skops) +{ + struct bpf_sock *bpf_sk = skops->sk; + struct sock *sk; + + if (!bpf_sk) + return 1; + + sk = (struct sock *)bpf_skc_to_tcp_sock(bpf_sk); + if (!sk) + return 1; + + switch (skops->op) { + case BPF_SOCK_OPS_TCP_LISTEN_CB: + nr_listen += !(bpf_test_sockopt(skops, sk) || + test_tcp_maxseg(skops, sk) || + test_tcp_saved_syn(skops, sk)); + break; + case BPF_SOCK_OPS_TCP_CONNECT_CB: + nr_connect += !(bpf_test_sockopt(skops, sk) || + test_tcp_maxseg(skops, sk)); + break; + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + nr_active += !(bpf_test_sockopt(skops, sk) || + test_tcp_maxseg(skops, sk)); + break; + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + nr_passive += !(bpf_test_sockopt(skops, sk) || + test_tcp_maxseg(skops, sk) || + test_tcp_saved_syn(skops, sk)); + break; + } + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c index b67e8022d500..a017d6b2f1dd 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c @@ -19,7 +19,7 @@ struct { int count = 0; int noise = 0; -__always_inline int subprog_noise(void) +static __always_inline int subprog_noise(void) { __u32 key = 0; diff --git a/tools/testing/selftests/bpf/progs/test_tc_dtime.c b/tools/testing/selftests/bpf/progs/test_tc_dtime.c index b596479a9ebe..125beec31834 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_dtime.c +++ b/tools/testing/selftests/bpf/progs/test_tc_dtime.c @@ -15,7 +15,6 @@ #include <linux/udp.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> -#include <sys/socket.h> /* veth_src --- veth_src_fwd --- veth_det_fwd --- veth_dst * | | diff --git 
a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index df0673c4ecbe..98af55f0bcd3 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -12,6 +12,7 @@ #include <linux/bpf.h> #include <linux/if_ether.h> #include <linux/if_packet.h> +#include <linux/if_tunnel.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/icmp.h> @@ -386,7 +387,8 @@ int vxlan_get_tunnel_src(struct __sk_buff *skb) __u32 orig_daddr; __u32 index = 0; - ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); + ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), + BPF_F_TUNINFO_FLAGS); if (ret < 0) { log_err(ret); return TC_ACT_SHOT; @@ -398,10 +400,13 @@ int vxlan_get_tunnel_src(struct __sk_buff *skb) return TC_ACT_SHOT; } - if (key.local_ipv4 != ASSIGNED_ADDR_VETH1 || md.gbp != 0x800FF) { - bpf_printk("vxlan key %d local ip 0x%x remote ip 0x%x gbp 0x%x\n", + if (key.local_ipv4 != ASSIGNED_ADDR_VETH1 || md.gbp != 0x800FF || + !(key.tunnel_flags & TUNNEL_KEY) || + (key.tunnel_flags & TUNNEL_CSUM)) { + bpf_printk("vxlan key %d local ip 0x%x remote ip 0x%x gbp 0x%x flags 0x%x\n", key.tunnel_id, key.local_ipv4, - key.remote_ipv4, md.gbp); + key.remote_ipv4, md.gbp, + bpf_ntohs(key.tunnel_flags)); log_err(ret); return TC_ACT_SHOT; } @@ -541,16 +546,19 @@ int ip6vxlan_get_tunnel_src(struct __sk_buff *skb) } ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), - BPF_F_TUNINFO_IPV6); + BPF_F_TUNINFO_IPV6 | BPF_F_TUNINFO_FLAGS); if (ret < 0) { log_err(ret); return TC_ACT_SHOT; } - if (bpf_ntohl(key.local_ipv6[3]) != *local_ip) { - bpf_printk("ip6vxlan key %d local ip6 ::%x remote ip6 ::%x label 0x%x\n", + if (bpf_ntohl(key.local_ipv6[3]) != *local_ip || + !(key.tunnel_flags & TUNNEL_KEY) || + !(key.tunnel_flags & TUNNEL_CSUM)) { + bpf_printk("ip6vxlan key %d local ip6 ::%x remote ip6 ::%x label 0x%x flags 0x%x\n", key.tunnel_id, bpf_ntohl(key.local_ipv6[3]), - bpf_ntohl(key.remote_ipv6[3]), key.tunnel_label); + bpf_ntohl(key.remote_ipv6[3]), key.tunnel_label, + bpf_ntohs(key.tunnel_flags)); bpf_printk("local_ip 0x%x\n", *local_ip); log_err(ret); return TC_ACT_SHOT; diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c index 5f5309791649..0053c5402173 100644 --- a/tools/testing/selftests/bpf/progs/timer.c +++ b/tools/testing/selftests/bpf/progs/timer.c @@ -208,17 +208,6 @@ static int timer_cb2(void *map, int *key, struct hmap_elem *val) */ bpf_map_delete_elem(map, key); - /* in non-preallocated hashmap both 'key' and 'val' are RCU - * protected and still valid though this element was deleted - * from the map. Arm this timer for ~35 seconds. When callback - * finishes the call_rcu will invoke: - * htab_elem_free_rcu - * check_and_free_timer - * bpf_timer_cancel_and_free - * to cancel this 35 second sleep and delete the timer for real. 
- */ - if (bpf_timer_start(&val->timer, 1ull << 35, 0) != 0) - err |= 256; ok |= 4; } return 0; diff --git a/tools/testing/selftests/bpf/task_local_storage_helpers.h b/tools/testing/selftests/bpf/task_local_storage_helpers.h new file mode 100644 index 000000000000..711d5abb7d51 --- /dev/null +++ b/tools/testing/selftests/bpf/task_local_storage_helpers.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __TASK_LOCAL_STORAGE_HELPER_H +#define __TASK_LOCAL_STORAGE_HELPER_H + +#include <unistd.h> +#include <sys/syscall.h> +#include <sys/types.h> + +#ifndef __NR_pidfd_open +#define __NR_pidfd_open 434 +#endif + +static inline int sys_pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +#endif diff --git a/tools/testing/selftests/bpf/test_dev_cgroup.c b/tools/testing/selftests/bpf/test_dev_cgroup.c index 7886265846a0..adeaf63cb6fa 100644 --- a/tools/testing/selftests/bpf/test_dev_cgroup.c +++ b/tools/testing/selftests/bpf/test_dev_cgroup.c @@ -16,7 +16,7 @@ #include "cgroup_helpers.h" #include "testing_helpers.h" -#define DEV_CGROUP_PROG "./dev_cgroup.o" +#define DEV_CGROUP_PROG "./dev_cgroup.bpf.o" #define TEST_CGROUP "/test-bpf-based-device-cgroup/" diff --git a/tools/testing/selftests/bpf/test_flow_dissector.sh b/tools/testing/selftests/bpf/test_flow_dissector.sh index dbd91221727d..5303ce0c977b 100755 --- a/tools/testing/selftests/bpf/test_flow_dissector.sh +++ b/tools/testing/selftests/bpf/test_flow_dissector.sh @@ -115,6 +115,14 @@ tc filter add dev lo parent ffff: protocol ip pref 1337 flower ip_proto \ # Send 10 IPv4/UDP packets from port 10. Filter should not drop any. ./test_flow_dissector -i 4 -f 10 +echo "Testing IPv4 from 127.0.0.127 (fallback to generic dissector)..." +# Send 10 IPv4/UDP packets from port 8. Filter should not drop any. +./test_flow_dissector -i 4 -S 127.0.0.127 -f 8 +# Send 10 IPv4/UDP packets from port 9. Filter should drop all. +./test_flow_dissector -i 4 -S 127.0.0.127 -f 9 -F +# Send 10 IPv4/UDP packets from port 10. Filter should not drop any. +./test_flow_dissector -i 4 -S 127.0.0.127 -f 10 + echo "Testing IPIP..." # Send 10 IPv4/IPv4/UDP packets from port 8. Filter should not drop any. 
./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e bare -i 4 \ diff --git a/tools/testing/selftests/bpf/test_lirc_mode2_user.c b/tools/testing/selftests/bpf/test_lirc_mode2_user.c index 2893e9f2f1e0..4694422aa76c 100644 --- a/tools/testing/selftests/bpf/test_lirc_mode2_user.c +++ b/tools/testing/selftests/bpf/test_lirc_mode2_user.c @@ -59,7 +59,7 @@ int main(int argc, char **argv) return 2; } - ret = bpf_prog_test_load("test_lirc_mode2_kern.o", + ret = bpf_prog_test_load("test_lirc_mode2_kern.bpf.o", BPF_PROG_TYPE_LIRC_MODE2, &obj, &progfd); if (ret) { printf("Failed to load bpf program\n"); diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index cbebfaa7c1e8..00b9cc305e58 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -264,10 +264,11 @@ static void test_hashmap_percpu(unsigned int task, void *data) close(fd); } +#define VALUE_SIZE 3 static int helper_fill_hashmap(int max_entries) { int i, fd, ret; - long long key, value; + long long key, value[VALUE_SIZE] = {}; fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(key), sizeof(value), max_entries, &map_opts); @@ -276,8 +277,8 @@ static int helper_fill_hashmap(int max_entries) "err: %s, flags: 0x%x\n", strerror(errno), map_opts.map_flags); for (i = 0; i < max_entries; i++) { - key = i; value = key; - ret = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + key = i; value[0] = key; + ret = bpf_map_update_elem(fd, &key, value, BPF_NOEXIST); CHECK(ret != 0, "can't update hashmap", "err: %s\n", strerror(ret)); @@ -288,8 +289,8 @@ static int helper_fill_hashmap(int max_entries) static void test_hashmap_walk(unsigned int task, void *data) { - int fd, i, max_entries = 1000; - long long key, value, next_key; + int fd, i, max_entries = 10000; + long long key, value[VALUE_SIZE], next_key; bool next_key_valid = true; fd = helper_fill_hashmap(max_entries); @@ -297,7 +298,7 @@ static void test_hashmap_walk(unsigned int task, void *data) for (i = 0; bpf_map_get_next_key(fd, !i ? NULL : &key, &next_key) == 0; i++) { key = next_key; - assert(bpf_map_lookup_elem(fd, &key, &value) == 0); + assert(bpf_map_lookup_elem(fd, &key, value) == 0); } assert(i == max_entries); @@ -305,9 +306,9 @@ static void test_hashmap_walk(unsigned int task, void *data) assert(bpf_map_get_next_key(fd, NULL, &key) == 0); for (i = 0; next_key_valid; i++) { next_key_valid = bpf_map_get_next_key(fd, &key, &next_key) == 0; - assert(bpf_map_lookup_elem(fd, &key, &value) == 0); - value++; - assert(bpf_map_update_elem(fd, &key, &value, BPF_EXIST) == 0); + assert(bpf_map_lookup_elem(fd, &key, value) == 0); + value[0]++; + assert(bpf_map_update_elem(fd, &key, value, BPF_EXIST) == 0); key = next_key; } @@ -316,8 +317,8 @@ static void test_hashmap_walk(unsigned int task, void *data) for (i = 0; bpf_map_get_next_key(fd, !i ? 
NULL : &key, &next_key) == 0; i++) { key = next_key; - assert(bpf_map_lookup_elem(fd, &key, &value) == 0); - assert(value - 1 == key); + assert(bpf_map_lookup_elem(fd, &key, value) == 0); + assert(value[0] - 1 == key); } assert(i == max_entries); @@ -651,9 +652,9 @@ static void test_stackmap(unsigned int task, void *data) #include <arpa/inet.h> #include <sys/select.h> #include <linux/err.h> -#define SOCKMAP_PARSE_PROG "./sockmap_parse_prog.o" -#define SOCKMAP_VERDICT_PROG "./sockmap_verdict_prog.o" -#define SOCKMAP_TCP_MSG_PROG "./sockmap_tcp_msg_prog.o" +#define SOCKMAP_PARSE_PROG "./sockmap_parse_prog.bpf.o" +#define SOCKMAP_VERDICT_PROG "./sockmap_verdict_prog.bpf.o" +#define SOCKMAP_TCP_MSG_PROG "./sockmap_tcp_msg_prog.bpf.o" static void test_sockmap(unsigned int tasks, void *data) { struct bpf_map *bpf_map_rx, *bpf_map_tx, *bpf_map_msg, *bpf_map_break; @@ -1143,8 +1144,8 @@ out_sockmap: exit(1); } -#define MAPINMAP_PROG "./test_map_in_map.o" -#define MAPINMAP_INVALID_PROG "./test_map_in_map_invalid.o" +#define MAPINMAP_PROG "./test_map_in_map.bpf.o" +#define MAPINMAP_INVALID_PROG "./test_map_in_map_invalid.bpf.o" static void test_map_in_map(void) { struct bpf_object *obj; @@ -1371,16 +1372,16 @@ static void __run_parallel(unsigned int tasks, static void test_map_stress(void) { + run_parallel(100, test_hashmap_walk, NULL); run_parallel(100, test_hashmap, NULL); run_parallel(100, test_hashmap_percpu, NULL); run_parallel(100, test_hashmap_sizes, NULL); - run_parallel(100, test_hashmap_walk, NULL); run_parallel(100, test_arraymap, NULL); run_parallel(100, test_arraymap_percpu, NULL); } -#define TASKS 1024 +#define TASKS 100 #define DO_UPDATE 1 #define DO_DELETE 0 @@ -1432,6 +1433,8 @@ static void test_update_delete(unsigned int fn, void *data) int fd = ((int *)data)[0]; int i, key, value, err; + if (fn & 1) + test_hashmap_walk(fn, NULL); for (i = fn; i < MAP_SIZE; i += TASKS) { key = value = i; @@ -1455,7 +1458,7 @@ static void test_update_delete(unsigned int fn, void *data) static void test_map_parallel(void) { - int i, fd, key = 0, value = 0; + int i, fd, key = 0, value = 0, j = 0; int data[2]; fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(key), sizeof(value), @@ -1466,6 +1469,7 @@ static void test_map_parallel(void) exit(1); } +again: /* Use the same fd in children to add elements to this map: * child_0 adds key=0, key=1024, key=2048, ... * child_1 adds key=1, key=1025, key=2049, ... 
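Not part of the patch: the walk loops exercised above (test_hashmap_walk() and the parallel update/delete stress) all rely on the same userspace iteration idiom, sketched minimally below. The walk_count_hashmap() name and the single-long-long key/value layout are assumptions for illustration only.

/* Illustrative sketch: iterate every key of a BPF hash map from userspace
 * with bpf_map_get_next_key()/bpf_map_lookup_elem(). Assumes the map was
 * created with long long keys and values.
 */
#include <bpf/bpf.h>

static int walk_count_hashmap(int map_fd)
{
	long long key, next_key, value;
	int i, count = 0;

	/* Passing NULL as the "previous" key asks the kernel for the first key. */
	for (i = 0; bpf_map_get_next_key(map_fd, i ? &key : NULL, &next_key) == 0; i++) {
		key = next_key;
		if (bpf_map_lookup_elem(map_fd, &key, &value) == 0)
			count++;	/* key/value are valid here */
	}

	/* bpf_map_get_next_key() fails (ENOENT) once the last key was visited. */
	return count;
}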
@@ -1502,6 +1506,12 @@ static void test_map_parallel(void) key = -1; assert(bpf_map_get_next_key(fd, NULL, &key) < 0 && errno == ENOENT); assert(bpf_map_get_next_key(fd, &key, &key) < 0 && errno == ENOENT); + + key = 0; + bpf_map_delete_elem(fd, &key); + if (j++ < 5) + goto again; + close(fd); } static void test_map_rdonly(void) diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py index 6cd6ef9fc20b..7fc15e0d24a9 100755 --- a/tools/testing/selftests/bpf/test_offload.py +++ b/tools/testing/selftests/bpf/test_offload.py @@ -782,7 +782,7 @@ if out.find("/sys/kernel/debug type debugfs") == -1: cmd("mount -t debugfs none /sys/kernel/debug") # Check samples are compiled -samples = ["sample_ret0.o", "sample_map_ret0.o"] +samples = ["sample_ret0.bpf.o", "sample_map_ret0.bpf.o"] for s in samples: ret, out = cmd("ls %s/%s" % (bpf_test_dir, s), fail=False) skip(ret != 0, "sample %s/%s not found, please compile it" % @@ -803,7 +803,7 @@ cmd("ip netns delete %s" % (ns)) netns = [] try: - obj = bpf_obj("sample_ret0.o") + obj = bpf_obj("sample_ret0.bpf.o") bytecode = bpf_bytecode("1,6 0 0 4294967295,") start_test("Test destruction of generic XDP...") @@ -1023,7 +1023,7 @@ try: sim.wait_for_flush() start_test("Test non-offload XDP attaching to HW...") - bpftool_prog_load("sample_ret0.o", "/sys/fs/bpf/nooffload") + bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/nooffload") nooffload = bpf_pinned("/sys/fs/bpf/nooffload") ret, _, err = sim.set_xdp(nooffload, "offload", fail=False, include_stderr=True) @@ -1032,7 +1032,7 @@ try: rm("/sys/fs/bpf/nooffload") start_test("Test offload XDP attaching to drv...") - bpftool_prog_load("sample_ret0.o", "/sys/fs/bpf/offload", + bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/offload", dev=sim['ifname']) offload = bpf_pinned("/sys/fs/bpf/offload") ret, _, err = sim.set_xdp(offload, "drv", fail=False, include_stderr=True) @@ -1043,7 +1043,7 @@ try: start_test("Test XDP load failure...") sim.dfs["dev/bpf_bind_verifier_accept"] = 0 - ret, _, err = bpftool_prog_load("sample_ret0.o", "/sys/fs/bpf/offload", + ret, _, err = bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/offload", dev=sim['ifname'], fail=False, include_stderr=True) fail(ret == 0, "verifier should fail on load") check_verifier_log(err, "[netdevsim] Hello from netdevsim!") @@ -1169,7 +1169,7 @@ try: simdev = NetdevSimDev() sim, = simdev.nsims - map_obj = bpf_obj("sample_map_ret0.o") + map_obj = bpf_obj("sample_map_ret0.bpf.o") start_test("Test loading program with maps...") sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON @@ -1307,10 +1307,10 @@ try: sims = (simA, simB1, simB2, simB3) simB = (simB1, simB2, simB3) - bpftool_prog_load("sample_map_ret0.o", "/sys/fs/bpf/nsimA", + bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimA", dev=simA['ifname']) progA = bpf_pinned("/sys/fs/bpf/nsimA") - bpftool_prog_load("sample_map_ret0.o", "/sys/fs/bpf/nsimB", + bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimB", dev=simB1['ifname']) progB = bpf_pinned("/sys/fs/bpf/nsimB") @@ -1344,14 +1344,14 @@ try: mapA = bpftool("prog show %s" % (progA))[1]["map_ids"][0] mapB = bpftool("prog show %s" % (progB))[1]["map_ids"][0] - ret, _ = bpftool_prog_load("sample_map_ret0.o", "/sys/fs/bpf/nsimB_", + ret, _ = bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimB_", dev=simB3['ifname'], maps=["idx 0 id %d" % (mapB)], fail=False) fail(ret != 0, "couldn't reuse a map on the same ASIC") rm("/sys/fs/bpf/nsimB_") - ret, _, err = 
bpftool_prog_load("sample_map_ret0.o", "/sys/fs/bpf/nsimA_", + ret, _, err = bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimA_", dev=simA['ifname'], maps=["idx 0 id %d" % (mapB)], fail=False, include_stderr=True) @@ -1359,7 +1359,7 @@ try: fail(err.count("offload device mismatch between prog and map") == 0, "error message missing for cross-ASIC map") - ret, _, err = bpftool_prog_load("sample_map_ret0.o", "/sys/fs/bpf/nsimB_", + ret, _, err = bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimB_", dev=simB1['ifname'], maps=["idx 0 id %d" % (mapA)], fail=False, include_stderr=True) diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id.sh b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh index a9bc6f82abc1..515c2eafc97f 100755 --- a/tools/testing/selftests/bpf/test_skb_cgroup_id.sh +++ b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh @@ -54,7 +54,7 @@ DIR=$(dirname $0) TEST_IF="test_cgid_1" TEST_IF_PEER="test_cgid_2" MAX_PING_TRIES=5 -BPF_PROG_OBJ="${DIR}/test_skb_cgroup_id_kern.o" +BPF_PROG_OBJ="${DIR}/test_skb_cgroup_id_kern.bpf.o" BPF_PROG_SECTION="cgroup_id_logger" BPF_PROG_ID=0 PROG="${DIR}/test_skb_cgroup_id_user" diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 458564fcfc82..2c89674fc62c 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -26,14 +26,14 @@ #endif #define CG_PATH "/foo" -#define CONNECT4_PROG_PATH "./connect4_prog.o" -#define CONNECT6_PROG_PATH "./connect6_prog.o" -#define SENDMSG4_PROG_PATH "./sendmsg4_prog.o" -#define SENDMSG6_PROG_PATH "./sendmsg6_prog.o" -#define RECVMSG4_PROG_PATH "./recvmsg4_prog.o" -#define RECVMSG6_PROG_PATH "./recvmsg6_prog.o" -#define BIND4_PROG_PATH "./bind4_prog.o" -#define BIND6_PROG_PATH "./bind6_prog.o" +#define CONNECT4_PROG_PATH "./connect4_prog.bpf.o" +#define CONNECT6_PROG_PATH "./connect6_prog.bpf.o" +#define SENDMSG4_PROG_PATH "./sendmsg4_prog.bpf.o" +#define SENDMSG6_PROG_PATH "./sendmsg6_prog.bpf.o" +#define RECVMSG4_PROG_PATH "./recvmsg4_prog.bpf.o" +#define RECVMSG6_PROG_PATH "./recvmsg6_prog.bpf.o" +#define BIND4_PROG_PATH "./bind4_prog.bpf.o" +#define BIND6_PROG_PATH "./bind6_prog.bpf.o" #define SERV4_IP "192.168.1.254" #define SERV4_REWRITE_IP "127.0.0.1" diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 0fbaccdc8861..dcb038e342d8 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -52,8 +52,8 @@ static void running_handler(int a); #define S1_PORT 10000 #define S2_PORT 10001 -#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o" -#define BPF_SOCKHASH_FILENAME "test_sockhash_kern.o" +#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.bpf.o" +#define BPF_SOCKHASH_FILENAME "test_sockhash_kern.bpf.o" #define CG_PATH "/sockmap" /* global sockets */ diff --git a/tools/testing/selftests/bpf/test_sysctl.c b/tools/testing/selftests/bpf/test_sysctl.c index 57620e7c9048..bcdbd27f22f0 100644 --- a/tools/testing/selftests/bpf/test_sysctl.c +++ b/tools/testing/selftests/bpf/test_sysctl.c @@ -1372,7 +1372,7 @@ static struct sysctl_test tests[] = { }, { "C prog: deny all writes", - .prog_file = "./test_sysctl_prog.o", + .prog_file = "./test_sysctl_prog.bpf.o", .attach_type = BPF_CGROUP_SYSCTL, .sysctl = "net/ipv4/tcp_mem", .open_flags = O_WRONLY, @@ -1381,7 +1381,7 @@ static struct sysctl_test tests[] = { }, { "C prog: deny access by name", - .prog_file = "./test_sysctl_prog.o", + 
.prog_file = "./test_sysctl_prog.bpf.o", .attach_type = BPF_CGROUP_SYSCTL, .sysctl = "net/ipv4/route/mtu_expires", .open_flags = O_RDONLY, @@ -1389,7 +1389,7 @@ static struct sysctl_test tests[] = { }, { "C prog: read tcp_mem", - .prog_file = "./test_sysctl_prog.o", + .prog_file = "./test_sysctl_prog.bpf.o", .attach_type = BPF_CGROUP_SYSCTL, .sysctl = "net/ipv4/tcp_mem", .open_flags = O_RDONLY, diff --git a/tools/testing/selftests/bpf/test_tcp_check_syncookie.sh b/tools/testing/selftests/bpf/test_tcp_check_syncookie.sh index 102e6588e2fe..b42c24282c25 100755 --- a/tools/testing/selftests/bpf/test_tcp_check_syncookie.sh +++ b/tools/testing/selftests/bpf/test_tcp_check_syncookie.sh @@ -76,7 +76,7 @@ main() DIR=$(dirname $0) TEST_IF=lo MAX_PING_TRIES=5 -BPF_PROG_OBJ="${DIR}/test_tcp_check_syncookie_kern.o" +BPF_PROG_OBJ="${DIR}/test_tcp_check_syncookie_kern.bpf.o" CLSACT_SECTION="tc" XDP_SECTION="xdp" BPF_PROG_ID=0 diff --git a/tools/testing/selftests/bpf/test_tcpnotify_user.c b/tools/testing/selftests/bpf/test_tcpnotify_user.c index 8284db8b0f13..595194453ff8 100644 --- a/tools/testing/selftests/bpf/test_tcpnotify_user.c +++ b/tools/testing/selftests/bpf/test_tcpnotify_user.c @@ -69,7 +69,7 @@ int verify_result(const struct tcpnotify_globals *result) int main(int argc, char **argv) { - const char *file = "test_tcpnotify_kern.o"; + const char *file = "test_tcpnotify_kern.bpf.o"; struct bpf_map *perf_map, *global_map; struct tcpnotify_globals g = {0}; struct perf_buffer *pb = NULL; diff --git a/tools/testing/selftests/bpf/test_xdp_redirect.sh b/tools/testing/selftests/bpf/test_xdp_redirect.sh index 1d79f31480ad..0746a4fde9d3 100755 --- a/tools/testing/selftests/bpf/test_xdp_redirect.sh +++ b/tools/testing/selftests/bpf/test_xdp_redirect.sh @@ -54,10 +54,10 @@ test_xdp_redirect() return 0 fi - ip -n ${NS1} link set veth11 $xdpmode obj xdp_dummy.o sec xdp &> /dev/null - ip -n ${NS2} link set veth22 $xdpmode obj xdp_dummy.o sec xdp &> /dev/null - ip link set dev veth1 $xdpmode obj test_xdp_redirect.o sec redirect_to_222 &> /dev/null - ip link set dev veth2 $xdpmode obj test_xdp_redirect.o sec redirect_to_111 &> /dev/null + ip -n ${NS1} link set veth11 $xdpmode obj xdp_dummy.bpf.o sec xdp &> /dev/null + ip -n ${NS2} link set veth22 $xdpmode obj xdp_dummy.bpf.o sec xdp &> /dev/null + ip link set dev veth1 $xdpmode obj test_xdp_redirect.bpf.o sec redirect_to_222 &> /dev/null + ip link set dev veth2 $xdpmode obj test_xdp_redirect.bpf.o sec redirect_to_111 &> /dev/null if ip netns exec ${NS1} ping -c 1 10.1.1.22 &> /dev/null && ip netns exec ${NS2} ping -c 1 10.1.1.11 &> /dev/null; then diff --git a/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh b/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh index cc57cb87e65f..4c3c3fdd2d73 100755 --- a/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh +++ b/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh @@ -94,7 +94,7 @@ setup_ns() # Add a neigh entry for IPv4 ping test ip -n ${NS[$i]} neigh add 192.0.2.253 lladdr 00:00:00:00:00:01 dev veth0 ip -n ${NS[$i]} link set veth0 $mode obj \ - xdp_dummy.o sec xdp &> /dev/null || \ + xdp_dummy.bpf.o sec xdp &> /dev/null || \ { test_fail "Unable to load dummy xdp" && exit 1; } IFACES="$IFACES veth$i" veth_mac[$i]=$(ip -n ${NS[0]} link show veth$i | awk '/link\/ether/ {print $2}') diff --git a/tools/testing/selftests/bpf/test_xdp_veth.sh b/tools/testing/selftests/bpf/test_xdp_veth.sh index 49936c4c8567..5211ca9a0239 100755 --- a/tools/testing/selftests/bpf/test_xdp_veth.sh +++ 
b/tools/testing/selftests/bpf/test_xdp_veth.sh @@ -101,7 +101,7 @@ ip -n ${NS3} link set dev veth33 up mkdir $BPF_DIR bpftool prog loadall \ - xdp_redirect_map.o $BPF_DIR/progs type xdp \ + xdp_redirect_map.bpf.o $BPF_DIR/progs type xdp \ pinmaps $BPF_DIR/maps bpftool map update pinned $BPF_DIR/maps/tx_port key 0 0 0 0 value 122 0 0 0 bpftool map update pinned $BPF_DIR/maps/tx_port key 1 0 0 0 value 133 0 0 0 @@ -110,9 +110,9 @@ ip link set dev veth1 xdp pinned $BPF_DIR/progs/xdp_redirect_map_0 ip link set dev veth2 xdp pinned $BPF_DIR/progs/xdp_redirect_map_1 ip link set dev veth3 xdp pinned $BPF_DIR/progs/xdp_redirect_map_2 -ip -n ${NS1} link set dev veth11 xdp obj xdp_dummy.o sec xdp -ip -n ${NS2} link set dev veth22 xdp obj xdp_tx.o sec xdp -ip -n ${NS3} link set dev veth33 xdp obj xdp_dummy.o sec xdp +ip -n ${NS1} link set dev veth11 xdp obj xdp_dummy.bpf.o sec xdp +ip -n ${NS2} link set dev veth22 xdp obj xdp_tx.bpf.o sec xdp +ip -n ${NS3} link set dev veth33 xdp obj xdp_dummy.bpf.o sec xdp trap cleanup EXIT diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh index 096a957594cd..d821fd098504 100755 --- a/tools/testing/selftests/bpf/test_xsk.sh +++ b/tools/testing/selftests/bpf/test_xsk.sh @@ -73,14 +73,20 @@ # # Run and dump packet contents: # sudo ./test_xsk.sh -D +# +# Run test suite for physical device in loopback mode +# sudo ./test_xsk.sh -i IFACE . xsk_prereqs.sh -while getopts "vD" flag +ETH="" + +while getopts "vDi:" flag do case "${flag}" in v) verbose=1;; D) dump_pkts=1;; + i) ETH=${OPTARG};; esac done @@ -132,18 +138,25 @@ setup_vethPairs() { ip link set ${VETH0} up } -validate_root_exec -validate_veth_support ${VETH0} -validate_ip_utility -setup_vethPairs - -retval=$? -if [ $retval -ne 0 ]; then - test_status $retval "${TEST_NAME}" - cleanup_exit ${VETH0} ${VETH1} ${NS1} - exit $retval +if [ ! -z $ETH ]; then + VETH0=${ETH} + VETH1=${ETH} + NS1="" +else + validate_root_exec + validate_veth_support ${VETH0} + validate_ip_utility + setup_vethPairs + + retval=$? + if [ $retval -ne 0 ]; then + test_status $retval "${TEST_NAME}" + cleanup_exit ${VETH0} ${VETH1} ${NS1} + exit $retval + fi fi + if [[ $verbose -eq 1 ]]; then ARGS+="-v " fi @@ -152,26 +165,33 @@ if [[ $dump_pkts -eq 1 ]]; then ARGS="-D " fi +retval=$? 
test_status $retval "${TEST_NAME}" ## START TESTS statusList=() -TEST_NAME="XSK_SELFTESTS_SOFTIRQ" +TEST_NAME="XSK_SELFTESTS_${VETH0}_SOFTIRQ" exec_xskxceiver -cleanup_exit ${VETH0} ${VETH1} ${NS1} -TEST_NAME="XSK_SELFTESTS_BUSY_POLL" +if [ -z $ETH ]; then + cleanup_exit ${VETH0} ${VETH1} ${NS1} +fi +TEST_NAME="XSK_SELFTESTS_${VETH0}_BUSY_POLL" busy_poll=1 -setup_vethPairs +if [ -z $ETH ]; then + setup_vethPairs +fi exec_xskxceiver ## END TESTS -cleanup_exit ${VETH0} ${VETH1} ${NS1} +if [ -z $ETH ]; then + cleanup_exit ${VETH0} ${VETH1} ${NS1} +fi failures=0 echo -e "\nSummary:" diff --git a/tools/testing/selftests/bpf/xdp_redirect_multi.c b/tools/testing/selftests/bpf/xdp_redirect_multi.c index c03b3a75991f..c1fc44c87c30 100644 --- a/tools/testing/selftests/bpf/xdp_redirect_multi.c +++ b/tools/testing/selftests/bpf/xdp_redirect_multi.c @@ -142,7 +142,7 @@ int main(int argc, char **argv) } printf("\n"); - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + snprintf(filename, sizeof(filename), "%s_kern.bpf.o", argv[0]); obj = bpf_object__open_file(filename, NULL); err = libbpf_get_error(obj); if (err) diff --git a/tools/testing/selftests/bpf/xdp_synproxy.c b/tools/testing/selftests/bpf/xdp_synproxy.c index d874ddfb39c4..ff35320d2be9 100644 --- a/tools/testing/selftests/bpf/xdp_synproxy.c +++ b/tools/testing/selftests/bpf/xdp_synproxy.c @@ -193,7 +193,7 @@ static int syncookie_attach(const char *argv0, unsigned int ifindex, bool tc) int prog_fd; int err; - snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv0); + snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.bpf.o", argv0); obj = bpf_object__open_file(xdp_filename, NULL); err = libbpf_get_error(obj); if (err < 0) { diff --git a/tools/testing/selftests/bpf/xdping.c b/tools/testing/selftests/bpf/xdping.c index 5b6f977870f8..1503a1d2faa0 100644 --- a/tools/testing/selftests/bpf/xdping.c +++ b/tools/testing/selftests/bpf/xdping.c @@ -168,7 +168,7 @@ int main(int argc, char **argv) /* Use libbpf 1.0 API mode */ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + snprintf(filename, sizeof(filename), "%s_kern.bpf.o", argv[0]); if (bpf_prog_test_load(filename, BPF_PROG_TYPE_XDP, &obj, &prog_fd)) { fprintf(stderr, "load of %s failed\n", filename); diff --git a/tools/testing/selftests/bpf/xsk.c b/tools/testing/selftests/bpf/xsk.c index f2721a4ae7c5..0b3ff49c740d 100644 --- a/tools/testing/selftests/bpf/xsk.c +++ b/tools/testing/selftests/bpf/xsk.c @@ -1237,15 +1237,15 @@ void xsk_socket__delete(struct xsk_socket *xsk) ctx = xsk->ctx; umem = ctx->umem; - xsk_put_ctx(ctx, true); - - if (!ctx->refcount) { + if (ctx->refcount == 1) { xsk_delete_bpf_maps(xsk); close(ctx->prog_fd); if (ctx->has_bpf_link) close(ctx->link_fd); } + xsk_put_ctx(ctx, true); + err = xsk_get_mmap_offsets(xsk->fd, &off); if (!err) { if (xsk->rx) { diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index 14b4737b223c..ef33309bbe49 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -99,6 +99,8 @@ #include <stdatomic.h> #include "xsk.h" #include "xskxceiver.h" +#include <bpf/bpf.h> +#include <linux/filter.h> #include "../kselftest.h" /* AF_XDP APIs were moved into libxdp and marked as deprecated in libbpf. 
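Not part of the patch: the xsk.c hunk above reads ctx->refcount before calling xsk_put_ctx(), because the put may free the context and nothing may be dereferenced afterwards. A generic sketch of that ordering rule, with made-up obj/obj_put() names, is:

/* Illustrative sketch: decide whether you are the last user *before*
 * dropping your reference, since the put may free the object.
 */
#include <stdbool.h>
#include <stdlib.h>

struct obj {
	int refcount;
	/* per-object resources (fds, maps, ...) would live here */
};

static void obj_put(struct obj *o)
{
	if (--o->refcount == 0)
		free(o);
}

static void obj_teardown(struct obj *o)
{
	bool last_user = (o->refcount == 1);

	if (last_user) {
		/* tear down resources owned by the last user here */
	}

	obj_put(o);	/* o must not be touched after this call */
}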
@@ -122,9 +124,20 @@ static void __exit_with_error(int error, const char *file, const char *func, int } #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__) - -#define mode_string(test) (test)->ifobj_tx->xdp_flags & XDP_FLAGS_SKB_MODE ? "SKB" : "DRV" #define busy_poll_string(test) (test)->ifobj_tx->busy_poll ? "BUSY-POLL " : "" +static char *mode_string(struct test_spec *test) +{ + switch (test->mode) { + case TEST_MODE_SKB: + return "SKB"; + case TEST_MODE_DRV: + return "DRV"; + case TEST_MODE_ZC: + return "ZC"; + default: + return "BOGUS"; + } +} static void report_failure(struct test_spec *test) { @@ -299,8 +312,8 @@ static void enable_busy_poll(struct xsk_socket_info *xsk) exit_with_error(errno); } -static int xsk_configure_socket(struct xsk_socket_info *xsk, struct xsk_umem_info *umem, - struct ifobject *ifobject, bool shared) +static int __xsk_configure_socket(struct xsk_socket_info *xsk, struct xsk_umem_info *umem, + struct ifobject *ifobject, bool shared) { struct xsk_socket_config cfg = {}; struct xsk_ring_cons *rxr; @@ -320,6 +333,51 @@ static int xsk_configure_socket(struct xsk_socket_info *xsk, struct xsk_umem_inf return xsk_socket__create(&xsk->xsk, ifobject->ifname, 0, umem->umem, rxr, txr, &cfg); } +static bool ifobj_zc_avail(struct ifobject *ifobject) +{ + size_t umem_sz = DEFAULT_UMEM_BUFFERS * XSK_UMEM__DEFAULT_FRAME_SIZE; + int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE; + struct xsk_socket_info *xsk; + struct xsk_umem_info *umem; + bool zc_avail = false; + void *bufs; + int ret; + + bufs = mmap(NULL, umem_sz, PROT_READ | PROT_WRITE, mmap_flags, -1, 0); + if (bufs == MAP_FAILED) + exit_with_error(errno); + + umem = calloc(1, sizeof(struct xsk_umem_info)); + if (!umem) { + munmap(bufs, umem_sz); + exit_with_error(-ENOMEM); + } + umem->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; + ret = xsk_configure_umem(umem, bufs, umem_sz); + if (ret) + exit_with_error(-ret); + + xsk = calloc(1, sizeof(struct xsk_socket_info)); + if (!xsk) + goto out; + ifobject->xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; + ifobject->xdp_flags |= XDP_FLAGS_DRV_MODE; + ifobject->bind_flags = XDP_USE_NEED_WAKEUP | XDP_ZEROCOPY; + ifobject->rx_on = true; + xsk->rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; + ret = __xsk_configure_socket(xsk, umem, ifobject, false); + if (!ret) + zc_avail = true; + + xsk_socket__delete(xsk->xsk); + free(xsk); +out: + munmap(umem->buffer, umem_sz); + xsk_umem__delete(umem->umem); + free(umem); + return zc_avail; +} + static struct option long_options[] = { {"interface", required_argument, 0, 'i'}, {"busy-poll", no_argument, 0, 'b'}, @@ -431,20 +489,24 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, ifobj->use_poll = false; ifobj->use_fill_ring = true; ifobj->release_rx = true; - ifobj->pkt_stream = test->pkt_stream_default; ifobj->validation_func = NULL; if (i == 0) { ifobj->rx_on = false; ifobj->tx_on = true; + ifobj->pkt_stream = test->tx_pkt_stream_default; } else { ifobj->rx_on = true; ifobj->tx_on = false; + ifobj->pkt_stream = test->rx_pkt_stream_default; } memset(ifobj->umem, 0, sizeof(*ifobj->umem)); ifobj->umem->num_frames = DEFAULT_UMEM_BUFFERS; ifobj->umem->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; + if (ifobj->shared_umem && ifobj->rx_on) + ifobj->umem->base_addr = DEFAULT_UMEM_BUFFERS * + XSK_UMEM__DEFAULT_FRAME_SIZE; for (j = 0; j < MAX_SOCKETS; j++) { memset(&ifobj->xsk_arr[j], 0, sizeof(ifobj->xsk_arr[j])); @@ -463,12 +525,15 @@ static void __test_spec_init(struct test_spec *test, 
struct ifobject *ifobj_tx, static void test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, struct ifobject *ifobj_rx, enum test_mode mode) { - struct pkt_stream *pkt_stream; + struct pkt_stream *tx_pkt_stream; + struct pkt_stream *rx_pkt_stream; u32 i; - pkt_stream = test->pkt_stream_default; + tx_pkt_stream = test->tx_pkt_stream_default; + rx_pkt_stream = test->rx_pkt_stream_default; memset(test, 0, sizeof(*test)); - test->pkt_stream_default = pkt_stream; + test->tx_pkt_stream_default = tx_pkt_stream; + test->rx_pkt_stream_default = rx_pkt_stream; for (i = 0; i < MAX_INTERFACES; i++) { struct ifobject *ifobj = i ? ifobj_rx : ifobj_tx; @@ -479,9 +544,14 @@ static void test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, else ifobj->xdp_flags |= XDP_FLAGS_DRV_MODE; - ifobj->bind_flags = XDP_USE_NEED_WAKEUP | XDP_COPY; + ifobj->bind_flags = XDP_USE_NEED_WAKEUP; + if (mode == TEST_MODE_ZC) + ifobj->bind_flags |= XDP_ZEROCOPY; + else + ifobj->bind_flags |= XDP_COPY; } + test->mode = mode; __test_spec_init(test, ifobj_tx, ifobj_rx); } @@ -529,16 +599,17 @@ static void pkt_stream_delete(struct pkt_stream *pkt_stream) static void pkt_stream_restore_default(struct test_spec *test) { struct pkt_stream *tx_pkt_stream = test->ifobj_tx->pkt_stream; + struct pkt_stream *rx_pkt_stream = test->ifobj_rx->pkt_stream; - if (tx_pkt_stream != test->pkt_stream_default) { + if (tx_pkt_stream != test->tx_pkt_stream_default) { pkt_stream_delete(test->ifobj_tx->pkt_stream); - test->ifobj_tx->pkt_stream = test->pkt_stream_default; + test->ifobj_tx->pkt_stream = test->tx_pkt_stream_default; } - if (test->ifobj_rx->pkt_stream != test->pkt_stream_default && - test->ifobj_rx->pkt_stream != tx_pkt_stream) + if (rx_pkt_stream != test->rx_pkt_stream_default) { pkt_stream_delete(test->ifobj_rx->pkt_stream); - test->ifobj_rx->pkt_stream = test->pkt_stream_default; + test->ifobj_rx->pkt_stream = test->rx_pkt_stream_default; + } } static struct pkt_stream *__pkt_stream_alloc(u32 nb_pkts) @@ -561,7 +632,7 @@ static struct pkt_stream *__pkt_stream_alloc(u32 nb_pkts) static void pkt_set(struct xsk_umem_info *umem, struct pkt *pkt, u64 addr, u32 len) { - pkt->addr = addr; + pkt->addr = addr + umem->base_addr; pkt->len = len; if (len > umem->frame_size - XDP_PACKET_HEADROOM - MIN_PKT_SIZE * 2 - umem->frame_headroom) pkt->valid = false; @@ -600,22 +671,29 @@ static void pkt_stream_replace(struct test_spec *test, u32 nb_pkts, u32 pkt_len) pkt_stream = pkt_stream_generate(test->ifobj_tx->umem, nb_pkts, pkt_len); test->ifobj_tx->pkt_stream = pkt_stream; + pkt_stream = pkt_stream_generate(test->ifobj_rx->umem, nb_pkts, pkt_len); test->ifobj_rx->pkt_stream = pkt_stream; } -static void pkt_stream_replace_half(struct test_spec *test, u32 pkt_len, int offset) +static void __pkt_stream_replace_half(struct ifobject *ifobj, u32 pkt_len, + int offset) { - struct xsk_umem_info *umem = test->ifobj_tx->umem; + struct xsk_umem_info *umem = ifobj->umem; struct pkt_stream *pkt_stream; u32 i; - pkt_stream = pkt_stream_clone(umem, test->pkt_stream_default); - for (i = 1; i < test->pkt_stream_default->nb_pkts; i += 2) + pkt_stream = pkt_stream_clone(umem, ifobj->pkt_stream); + for (i = 1; i < ifobj->pkt_stream->nb_pkts; i += 2) pkt_set(umem, &pkt_stream->pkts[i], (i % umem->num_frames) * umem->frame_size + offset, pkt_len); - test->ifobj_tx->pkt_stream = pkt_stream; - test->ifobj_rx->pkt_stream = pkt_stream; + ifobj->pkt_stream = pkt_stream; +} + +static void pkt_stream_replace_half(struct test_spec *test, u32 pkt_len, int 
offset) +{ + __pkt_stream_replace_half(test->ifobj_tx, pkt_len, offset); + __pkt_stream_replace_half(test->ifobj_rx, pkt_len, offset); } static void pkt_stream_receive_half(struct test_spec *test) @@ -657,7 +735,8 @@ static struct pkt *pkt_generate(struct ifobject *ifobject, u32 pkt_nb) return pkt; } -static void pkt_stream_generate_custom(struct test_spec *test, struct pkt *pkts, u32 nb_pkts) +static void __pkt_stream_generate_custom(struct ifobject *ifobj, + struct pkt *pkts, u32 nb_pkts) { struct pkt_stream *pkt_stream; u32 i; @@ -666,15 +745,20 @@ static void pkt_stream_generate_custom(struct test_spec *test, struct pkt *pkts, if (!pkt_stream) exit_with_error(ENOMEM); - test->ifobj_tx->pkt_stream = pkt_stream; - test->ifobj_rx->pkt_stream = pkt_stream; - for (i = 0; i < nb_pkts; i++) { - pkt_stream->pkts[i].addr = pkts[i].addr; + pkt_stream->pkts[i].addr = pkts[i].addr + ifobj->umem->base_addr; pkt_stream->pkts[i].len = pkts[i].len; pkt_stream->pkts[i].payload = i; pkt_stream->pkts[i].valid = pkts[i].valid; } + + ifobj->pkt_stream = pkt_stream; +} + +static void pkt_stream_generate_custom(struct test_spec *test, struct pkt *pkts, u32 nb_pkts) +{ + __pkt_stream_generate_custom(test->ifobj_tx, pkts, nb_pkts); + __pkt_stream_generate_custom(test->ifobj_rx, pkts, nb_pkts); } static void pkt_dump(void *pkt, u32 len) @@ -1126,6 +1210,70 @@ static int validate_tx_invalid_descs(struct ifobject *ifobject) return TEST_PASS; } +static void xsk_configure_socket(struct test_spec *test, struct ifobject *ifobject, + struct xsk_umem_info *umem, bool tx) +{ + int i, ret; + + for (i = 0; i < test->nb_sockets; i++) { + bool shared = (ifobject->shared_umem && tx) ? true : !!i; + u32 ctr = 0; + + while (ctr++ < SOCK_RECONF_CTR) { + ret = __xsk_configure_socket(&ifobject->xsk_arr[i], umem, + ifobject, shared); + if (!ret) + break; + + /* Retry if it fails as xsk_socket__create() is asynchronous */ + if (ctr >= SOCK_RECONF_CTR) + exit_with_error(-ret); + usleep(USLEEP_MAX); + } + if (ifobject->busy_poll) + enable_busy_poll(&ifobject->xsk_arr[i]); + } +} + +static void thread_common_ops_tx(struct test_spec *test, struct ifobject *ifobject) +{ + xsk_configure_socket(test, ifobject, test->ifobj_rx->umem, true); + ifobject->xsk = &ifobject->xsk_arr[0]; + ifobject->xsk_map_fd = test->ifobj_rx->xsk_map_fd; + memcpy(ifobject->umem, test->ifobj_rx->umem, sizeof(struct xsk_umem_info)); +} + +static void xsk_populate_fill_ring(struct xsk_umem_info *umem, struct pkt_stream *pkt_stream) +{ + u32 idx = 0, i, buffers_to_fill; + int ret; + + if (umem->num_frames < XSK_RING_PROD__DEFAULT_NUM_DESCS) + buffers_to_fill = umem->num_frames; + else + buffers_to_fill = XSK_RING_PROD__DEFAULT_NUM_DESCS; + + ret = xsk_ring_prod__reserve(&umem->fq, buffers_to_fill, &idx); + if (ret != buffers_to_fill) + exit_with_error(ENOSPC); + for (i = 0; i < buffers_to_fill; i++) { + u64 addr; + + if (pkt_stream->use_addr_for_fill) { + struct pkt *pkt = pkt_stream_get_pkt(pkt_stream, i); + + if (!pkt) + break; + addr = pkt->addr; + } else { + addr = i * umem->frame_size; + } + + *xsk_ring_prod__fill_addr(&umem->fq, idx++) = addr; + } + xsk_ring_prod__submit(&umem->fq, buffers_to_fill); +} + static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject) { u64 umem_sz = ifobject->umem->num_frames * ifobject->umem->frame_size; @@ -1133,13 +1281,15 @@ static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject) LIBBPF_OPTS(bpf_xdp_query_opts, opts); int ret, ifindex; void *bufs; - u32 i; ifobject->ns_fd = 
switch_namespace(ifobject->nsname); if (ifobject->umem->unaligned_mode) mmap_flags |= MAP_HUGETLB; + if (ifobject->shared_umem) + umem_sz *= 2; + bufs = mmap(NULL, umem_sz, PROT_READ | PROT_WRITE, mmap_flags, -1, 0); if (bufs == MAP_FAILED) exit_with_error(errno); @@ -1148,24 +1298,9 @@ static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject) if (ret) exit_with_error(-ret); - for (i = 0; i < test->nb_sockets; i++) { - u32 ctr = 0; - - while (ctr++ < SOCK_RECONF_CTR) { - ret = xsk_configure_socket(&ifobject->xsk_arr[i], ifobject->umem, - ifobject, !!i); - if (!ret) - break; - - /* Retry if it fails as xsk_socket__create() is asynchronous */ - if (ctr >= SOCK_RECONF_CTR) - exit_with_error(-ret); - usleep(USLEEP_MAX); - } + xsk_populate_fill_ring(ifobject->umem, ifobject->pkt_stream); - if (ifobject->busy_poll) - enable_busy_poll(&ifobject->xsk_arr[i]); - } + xsk_configure_socket(test, ifobject, ifobject->umem, false); ifobject->xsk = &ifobject->xsk_arr[0]; @@ -1201,22 +1336,18 @@ static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject) exit_with_error(-ret); } -static void testapp_cleanup_xsk_res(struct ifobject *ifobj) -{ - print_verbose("Destroying socket\n"); - xsk_socket__delete(ifobj->xsk->xsk); - munmap(ifobj->umem->buffer, ifobj->umem->num_frames * ifobj->umem->frame_size); - xsk_umem__delete(ifobj->umem->umem); -} - static void *worker_testapp_validate_tx(void *arg) { struct test_spec *test = (struct test_spec *)arg; struct ifobject *ifobject = test->ifobj_tx; int err; - if (test->current_step == 1) - thread_common_ops(test, ifobject); + if (test->current_step == 1) { + if (!ifobject->shared_umem) + thread_common_ops(test, ifobject); + else + thread_common_ops_tx(test, ifobject); + } print_verbose("Sending %d packets on interface %s\n", ifobject->pkt_stream->nb_pkts, ifobject->ifname); @@ -1227,53 +1358,23 @@ static void *worker_testapp_validate_tx(void *arg) if (err) report_failure(test); - if (test->total_steps == test->current_step || err) - testapp_cleanup_xsk_res(ifobject); pthread_exit(NULL); } -static void xsk_populate_fill_ring(struct xsk_umem_info *umem, struct pkt_stream *pkt_stream) -{ - u32 idx = 0, i, buffers_to_fill; - int ret; - - if (umem->num_frames < XSK_RING_PROD__DEFAULT_NUM_DESCS) - buffers_to_fill = umem->num_frames; - else - buffers_to_fill = XSK_RING_PROD__DEFAULT_NUM_DESCS; - - ret = xsk_ring_prod__reserve(&umem->fq, buffers_to_fill, &idx); - if (ret != buffers_to_fill) - exit_with_error(ENOSPC); - for (i = 0; i < buffers_to_fill; i++) { - u64 addr; - - if (pkt_stream->use_addr_for_fill) { - struct pkt *pkt = pkt_stream_get_pkt(pkt_stream, i); - - if (!pkt) - break; - addr = pkt->addr; - } else { - addr = i * umem->frame_size; - } - - *xsk_ring_prod__fill_addr(&umem->fq, idx++) = addr; - } - xsk_ring_prod__submit(&umem->fq, buffers_to_fill); -} - static void *worker_testapp_validate_rx(void *arg) { struct test_spec *test = (struct test_spec *)arg; struct ifobject *ifobject = test->ifobj_rx; struct pollfd fds = { }; + int id = 0; int err; - if (test->current_step == 1) + if (test->current_step == 1) { thread_common_ops(test, ifobject); - - xsk_populate_fill_ring(ifobject->umem, ifobject->pkt_stream); + } else { + bpf_map_delete_elem(ifobject->xsk_map_fd, &id); + xsk_socket__update_xskmap(ifobject->xsk->xsk, ifobject->xsk_map_fd); + } fds.fd = xsk_socket__fd(ifobject->xsk->xsk); fds.events = POLLIN; @@ -1291,25 +1392,44 @@ static void *worker_testapp_validate_rx(void *arg) pthread_mutex_unlock(&pacing_mutex); } - 
@@ -1291,25 +1392,44 @@ static void *worker_testapp_validate_rx(void *arg)
 		pthread_mutex_unlock(&pacing_mutex);
 	}
 
-	if (test->total_steps == test->current_step || err)
-		testapp_cleanup_xsk_res(ifobject);
+	pthread_exit(NULL);
+}
+
+static void testapp_clean_xsk_umem(struct ifobject *ifobj)
+{
+	u64 umem_sz = ifobj->umem->num_frames * ifobj->umem->frame_size;
+
+	if (ifobj->shared_umem)
+		umem_sz *= 2;
+
+	xsk_umem__delete(ifobj->umem->umem);
+	munmap(ifobj->umem->buffer, umem_sz);
+}
+
+static void handler(int signum)
+{
 	pthread_exit(NULL);
 }
 
 static int testapp_validate_traffic_single_thread(struct test_spec *test, struct ifobject *ifobj,
 						  enum test_type type)
 {
+	bool old_shared_umem = ifobj->shared_umem;
 	pthread_t t0;
 
 	if (pthread_barrier_init(&barr, NULL, 2))
 		exit_with_error(errno);
 
 	test->current_step++;
-	if (type == TEST_TYPE_POLL_RXQ_TMOUT)
+	if (type == TEST_TYPE_POLL_RXQ_TMOUT)
 		pkt_stream_reset(ifobj->pkt_stream);
 	pkts_in_flight = 0;
 
-	/*Spawn thread */
+	test->ifobj_rx->shared_umem = false;
+	test->ifobj_tx->shared_umem = false;
+
+	signal(SIGUSR1, handler);
+	/* Spawn thread */
 	pthread_create(&t0, NULL, ifobj->func_ptr, test);
 
 	if (type != TEST_TYPE_POLL_TXQ_TMOUT)
@@ -1318,8 +1438,17 @@ static int testapp_validate_traffic_single_thread(struct test_spec *test, struct
 	if (pthread_barrier_destroy(&barr))
 		exit_with_error(errno);
 
+	pthread_kill(t0, SIGUSR1);
 	pthread_join(t0, NULL);
 
+	if (test->total_steps == test->current_step || test->fail) {
+		xsk_socket__delete(ifobj->xsk->xsk);
+		testapp_clean_xsk_umem(ifobj);
+	}
+
+	test->ifobj_rx->shared_umem = old_shared_umem;
+	test->ifobj_tx->shared_umem = old_shared_umem;
+
 	return !!test->fail;
 }
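The signal()/pthread_kill() pairing above exists because the single-thread timeout tests deliberately leave the worker parked in a blocking receive with no traffic to wake it; the SIGUSR1 handler exits the thread so pthread_join() can reap it. A reduced sketch of that pattern, with illustrative names and a poll() stand-in for the worker's blocking call (not the selftest code itself):

#include <poll.h>
#include <pthread.h>
#include <signal.h>

/* Handler runs in the worker thread and simply terminates it. */
static void kick_handler(int signum)
{
	pthread_exit(NULL);
}

/* Stand-in for the rx worker blocking with nothing on the wire. */
static void *blocked_worker(void *arg)
{
	poll(NULL, 0, -1);	/* sleeps until a signal interrupts it */
	return NULL;
}

static int run_and_reap(void)
{
	pthread_t t;

	signal(SIGUSR1, kick_handler);
	pthread_create(&t, NULL, blocked_worker, NULL);
	/* ... run the test step ... */
	pthread_kill(t, SIGUSR1);	/* wake the worker even if no traffic arrived */
	return pthread_join(t, NULL);
}

signal() installs a process-wide disposition, but pthread_kill() delivers the signal to the one thread that has to go away.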
@@ -1349,6 +1478,14 @@ static int testapp_validate_traffic(struct test_spec *test)
 	pthread_join(t1, NULL);
 	pthread_join(t0, NULL);
 
+	if (test->total_steps == test->current_step || test->fail) {
+		xsk_socket__delete(ifobj_tx->xsk->xsk);
+		xsk_socket__delete(ifobj_rx->xsk->xsk);
+		testapp_clean_xsk_umem(ifobj_rx);
+		if (!ifobj_tx->shared_umem)
+			testapp_clean_xsk_umem(ifobj_tx);
+	}
+
 	return !!test->fail;
 }
 
@@ -1428,9 +1565,9 @@ static void testapp_headroom(struct test_spec *test)
 static void testapp_stats_rx_dropped(struct test_spec *test)
 {
 	test_spec_set_name(test, "STAT_RX_DROPPED");
+	pkt_stream_replace_half(test, MIN_PKT_SIZE * 4, 0);
 	test->ifobj_rx->umem->frame_headroom = test->ifobj_rx->umem->frame_size -
 		XDP_PACKET_HEADROOM - MIN_PKT_SIZE * 3;
-	pkt_stream_replace_half(test, MIN_PKT_SIZE * 4, 0);
 	pkt_stream_receive_half(test);
 	test->ifobj_rx->validation_func = validate_rx_dropped;
 	testapp_validate_traffic(test);
@@ -1553,6 +1690,11 @@ static void testapp_invalid_desc(struct test_spec *test)
 		pkts[7].valid = false;
 	}
 
+	if (test->ifobj_tx->shared_umem) {
+		pkts[4].addr += UMEM_SIZE;
+		pkts[5].addr += UMEM_SIZE;
+	}
+
 	pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts));
 	testapp_validate_traffic(test);
 	pkt_stream_restore_default(test);
@@ -1583,6 +1725,10 @@ static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_
 {
 	switch (type) {
 	case TEST_TYPE_STATS_RX_DROPPED:
+		if (mode == TEST_MODE_ZC) {
+			ksft_test_result_skip("Can not run RX_DROPPED test for ZC mode\n");
+			return;
+		}
 		testapp_stats_rx_dropped(test);
 		break;
 	case TEST_TYPE_STATS_TX_INVALID_DESCS:
@@ -1712,12 +1858,44 @@ static void ifobject_delete(struct ifobject *ifobj)
 	free(ifobj);
 }
 
+static bool is_xdp_supported(struct ifobject *ifobject)
+{
+	int flags = XDP_FLAGS_DRV_MODE;
+
+	LIBBPF_OPTS(bpf_link_create_opts, opts, .flags = flags);
+	struct bpf_insn insns[2] = {
+		BPF_MOV64_IMM(BPF_REG_0, XDP_PASS),
+		BPF_EXIT_INSN()
+	};
+	int ifindex = if_nametoindex(ifobject->ifname);
+	int prog_fd, insn_cnt = ARRAY_SIZE(insns);
+	int err;
+
+	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL);
+	if (prog_fd < 0)
+		return false;
+
+	err = bpf_xdp_attach(ifindex, prog_fd, flags, NULL);
+	if (err) {
+		close(prog_fd);
+		return false;
+	}
+
+	bpf_xdp_detach(ifindex, flags, NULL);
+	close(prog_fd);
+
+	return true;
+}
+
 int main(int argc, char **argv)
 {
-	struct pkt_stream *pkt_stream_default;
+	struct pkt_stream *rx_pkt_stream_default;
+	struct pkt_stream *tx_pkt_stream_default;
 	struct ifobject *ifobj_tx, *ifobj_rx;
+	int modes = TEST_MODE_SKB + 1;
 	u32 i, j, failed_tests = 0;
 	struct test_spec test;
+	bool shared_umem;
 
 	/* Use libbpf 1.0 API mode */
 	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
@@ -1732,6 +1910,10 @@ int main(int argc, char **argv)
 	setlocale(LC_ALL, "");
 
 	parse_command_line(ifobj_tx, ifobj_rx, argc, argv);
+	shared_umem = !strcmp(ifobj_tx->ifname, ifobj_rx->ifname);
+
+	ifobj_tx->shared_umem = shared_umem;
+	ifobj_rx->shared_umem = shared_umem;
 
 	if (!validate_interface(ifobj_tx) || !validate_interface(ifobj_rx)) {
 		usage(basename(argv[0]));
@@ -1743,15 +1925,23 @@ int main(int argc, char **argv)
 	init_iface(ifobj_rx, MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1,
 		   worker_testapp_validate_rx);
 
+	if (is_xdp_supported(ifobj_tx)) {
+		modes++;
+		if (ifobj_zc_avail(ifobj_tx))
+			modes++;
+	}
+
 	test_spec_init(&test, ifobj_tx, ifobj_rx, 0);
-	pkt_stream_default = pkt_stream_generate(ifobj_tx->umem, DEFAULT_PKT_CNT, PKT_SIZE);
-	if (!pkt_stream_default)
+	tx_pkt_stream_default = pkt_stream_generate(ifobj_tx->umem, DEFAULT_PKT_CNT, PKT_SIZE);
+	rx_pkt_stream_default = pkt_stream_generate(ifobj_rx->umem, DEFAULT_PKT_CNT, PKT_SIZE);
+	if (!tx_pkt_stream_default || !rx_pkt_stream_default)
 		exit_with_error(ENOMEM);
-	test.pkt_stream_default = pkt_stream_default;
+	test.tx_pkt_stream_default = tx_pkt_stream_default;
+	test.rx_pkt_stream_default = rx_pkt_stream_default;
 
-	ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX);
+	ksft_set_plan(modes * TEST_TYPE_MAX);
 
-	for (i = 0; i < TEST_MODE_MAX; i++)
+	for (i = 0; i < modes; i++)
 		for (j = 0; j < TEST_TYPE_MAX; j++) {
 			test_spec_init(&test, ifobj_tx, ifobj_rx, i);
 			run_pkt_test(&test, i, j);
@@ -1761,7 +1951,11 @@ int main(int argc, char **argv)
 				failed_tests++;
 		}
 
-	pkt_stream_delete(pkt_stream_default);
+	pkt_stream_delete(tx_pkt_stream_default);
+	pkt_stream_delete(rx_pkt_stream_default);
+	free(ifobj_rx->umem);
+	if (!ifobj_tx->shared_umem)
+		free(ifobj_tx->umem);
 	ifobject_delete(ifobj_tx);
 	ifobject_delete(ifobj_rx);
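The BPF_MOV64_IMM()/BPF_EXIT_INSN() macros used by is_xdp_supported() above are kernel-tree helpers rather than part of libbpf's public headers. For reference, the same two-instruction probe can be spelled out with plain struct bpf_insn initializers and exported libbpf calls only; a sketch under that assumption (function and interface names are illustrative, error unwinding kept minimal):

#include <stdbool.h>
#include <unistd.h>
#include <net/if.h>
#include <linux/bpf.h>
#include <linux/if_link.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

/* Probe whether the interface accepts a native (driver-mode) XDP program:
 * load "return XDP_PASS", try to attach it with XDP_FLAGS_DRV_MODE, detach.
 */
static bool probe_xdp_drv_mode(const char *ifname)
{
	struct bpf_insn insns[] = {
		{ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = XDP_PASS },
		{ .code = BPF_JMP | BPF_EXIT },
	};
	int ifindex = if_nametoindex(ifname);
	int prog_fd, err;

	if (!ifindex)
		return false;

	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, 2, NULL);
	if (prog_fd < 0)
		return false;

	err = bpf_xdp_attach(ifindex, prog_fd, XDP_FLAGS_DRV_MODE, NULL);
	if (!err)
		bpf_xdp_detach(ifindex, XDP_FLAGS_DRV_MODE, NULL);
	close(prog_fd);

	return !err;
}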
diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h
index ee97576757a9..edb76d2def9f 100644
--- a/tools/testing/selftests/bpf/xskxceiver.h
+++ b/tools/testing/selftests/bpf/xskxceiver.h
@@ -29,8 +29,8 @@
 #define TEST_FAILURE -1
 #define TEST_CONTINUE 1
 #define MAX_INTERFACES 2
-#define MAX_INTERFACE_NAME_CHARS 7
-#define MAX_INTERFACES_NAMESPACE_CHARS 10
+#define MAX_INTERFACE_NAME_CHARS 16
+#define MAX_INTERFACES_NAMESPACE_CHARS 16
 #define MAX_SOCKETS 2
 #define MAX_TEST_NAME_SIZE 32
 #define MAX_TEARDOWN_ITER 10
@@ -62,6 +62,7 @@
 enum test_mode {
 	TEST_MODE_SKB,
 	TEST_MODE_DRV,
+	TEST_MODE_ZC,
 	TEST_MODE_MAX
 };
@@ -99,6 +100,7 @@ struct xsk_umem_info {
 	u32 frame_headroom;
 	void *buffer;
 	u32 frame_size;
+	u32 base_addr;
 	bool unaligned_mode;
 };
@@ -152,6 +154,7 @@ struct ifobject {
 	bool busy_poll;
 	bool use_fill_ring;
 	bool release_rx;
+	bool shared_umem;
 	u8 dst_mac[ETH_ALEN];
 	u8 src_mac[ETH_ALEN];
 };
@@ -159,11 +162,13 @@ struct ifobject {
 
 struct test_spec {
 	struct ifobject *ifobj_tx;
 	struct ifobject *ifobj_rx;
-	struct pkt_stream *pkt_stream_default;
+	struct pkt_stream *tx_pkt_stream_default;
+	struct pkt_stream *rx_pkt_stream_default;
 	u16 total_steps;
 	u16 current_step;
 	u16 nb_sockets;
 	bool fail;
+	enum test_mode mode;
 	char name[MAX_TEST_NAME_SIZE];
 };
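The new shared_umem flag (together with base_addr in xsk_umem_info) hooks the harness into AF_XDP's shared-umem mechanism: one mapped region, one struct xsk_umem, and further sockets created against it, each with its own fill and completion rings. The selftest's own xsk_configure_socket()/thread_common_ops_tx() are not part of this excerpt, so the following is only a generic sketch of that mechanism using the xsk.h helpers that live next to xskxceiver.c (interface name, queue ids, and sizes are illustrative; error unwinding omitted):

#include <errno.h>
#include <linux/types.h>
#include <sys/mman.h>
#include "xsk.h"	/* ring/umem/socket helpers bundled with the BPF selftests */

#define NUM_FRAMES 4096
#define FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE

/* Two sockets, one umem: the region is sized for both, the umem is created
 * once, and the second socket attaches to it with its own fill/completion
 * rings via xsk_socket__create_shared().
 */
static int setup_shared_umem(const char *ifname)
{
	struct xsk_ring_prod fill0, fill1, tx0, tx1;
	struct xsk_ring_cons comp0, comp1, rx0, rx1;
	struct xsk_socket *xsk0, *xsk1;
	struct xsk_umem *umem;
	__u64 size = 2ULL * NUM_FRAMES * FRAME_SIZE;
	void *buf;
	int err;

	buf = mmap(NULL, size, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return -errno;

	/* fill0/comp0 become the umem's default rings, used by the first socket. */
	err = xsk_umem__create(&umem, buf, size, &fill0, &comp0, NULL);
	if (err)
		return err;

	err = xsk_socket__create_shared(&xsk0, ifname, 0, umem,
					&rx0, &tx0, &fill0, &comp0, NULL);
	if (err)
		return err;

	/* Second socket on queue 1 shares the umem but gets fresh rings. */
	return xsk_socket__create_shared(&xsk1, ifname, 1, umem,
					 &rx1, &tx1, &fill1, &comp1, NULL);
}

Doubling the mmap()ed region, as thread_common_ops() now does, leaves room for a second socket's frames in the upper half; the UMEM_SIZE offsets added in testapp_invalid_desc() appear to account for that layout.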