summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/mips/net/bpf_jit_comp32.c10
-rw-r--r--arch/mips/net/bpf_jit_comp64.c10
-rw-r--r--include/linux/bpf-cgroup.h17
-rw-r--r--include/linux/bpf.h22
-rw-r--r--include/linux/bpf_mem_alloc.h28
-rw-r--r--include/linux/bpf_verifier.h11
-rw-r--r--include/linux/filter.h3
-rw-r--r--include/linux/igmp.h4
-rw-r--r--include/linux/mroute.h6
-rw-r--r--include/linux/mroute6.h4
-rw-r--r--include/linux/skbuff.h4
-rw-r--r--include/linux/sockptr.h5
-rw-r--r--include/linux/tnum.h20
-rw-r--r--include/net/ip.h4
-rw-r--r--include/net/ipv6.h6
-rw-r--r--include/net/ipv6_stubs.h4
-rw-r--r--include/net/sock.h9
-rw-r--r--include/net/tcp.h4
-rw-r--r--include/uapi/linux/bpf.h83
-rw-r--r--kernel/bpf/Makefile5
-rw-r--r--kernel/bpf/bpf_iter.c5
-rw-r--r--kernel/bpf/bpf_local_storage.c4
-rw-r--r--kernel/bpf/bpf_lsm.c23
-rw-r--r--kernel/bpf/bpf_task_storage.c8
-rw-r--r--kernel/bpf/cgroup.c157
-rw-r--r--kernel/bpf/cgroup_iter.c282
-rw-r--r--kernel/bpf/hashtab.c168
-rw-r--r--kernel/bpf/helpers.c48
-rw-r--r--kernel/bpf/memalloc.c634
-rw-r--r--kernel/bpf/syscall.c15
-rw-r--r--kernel/bpf/trampoline.c8
-rw-r--r--kernel/bpf/verifier.c94
-rw-r--r--kernel/cgroup/rstat.c48
-rw-r--r--net/core/filter.c635
-rw-r--r--net/core/flow_dissector.c16
-rw-r--r--net/core/sock.c134
-rw-r--r--net/ipv4/igmp.c22
-rw-r--r--net/ipv4/ip_sockglue.c114
-rw-r--r--net/ipv4/ipmr.c9
-rw-r--r--net/ipv4/tcp.c116
-rw-r--r--net/ipv6/af_inet6.c2
-rw-r--r--net/ipv6/ip6mr.c10
-rw-r--r--net/ipv6/ipv6_sockglue.c113
-rw-r--r--net/ipv6/mcast.c8
-rw-r--r--samples/bpf/map_perf_test_kern.c44
-rw-r--r--samples/bpf/map_perf_test_user.c2
-rwxr-xr-xscripts/bpf_doc.py78
-rw-r--r--tools/bpf/bpftool/btf_dumper.c2
-rw-r--r--tools/bpf/bpftool/link.c35
-rw-r--r--tools/include/uapi/linux/bpf.h83
-rw-r--r--tools/lib/bpf/bpf_helpers.h19
-rw-r--r--tools/lib/bpf/skel_internal.h23
-rw-r--r--tools/testing/selftests/bpf/DENYLIST.s390x4
-rw-r--r--tools/testing/selftests/bpf/Makefile37
-rw-r--r--tools/testing/selftests/bpf/README.rst8
-rw-r--r--tools/testing/selftests/bpf/cgroup_getset_retval_hooks.h25
-rw-r--r--tools/testing/selftests/bpf/cgroup_helpers.c202
-rw-r--r--tools/testing/selftests/bpf/cgroup_helpers.h19
-rw-r--r--tools/testing/selftests/bpf/get_cgroup_id_user.c2
-rw-r--r--tools/testing/selftests/bpf/map_tests/task_storage_map.c122
-rw-r--r--tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c54
-rw-r--r--tools/testing/selftests/bpf/prog_tests/btf.c4
-rw-r--r--tools/testing/selftests/bpf/prog_tests/btf_dump.c10
-rw-r--r--tools/testing/selftests/bpf/prog_tests/btf_endian.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/cb_refs.c48
-rw-r--r--tools/testing/selftests/bpf/prog_tests/cgroup_getset_retval.c48
-rw-r--r--tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c357
-rw-r--r--tools/testing/selftests/bpf/prog_tests/cgroup_iter.c224
-rw-r--r--tools/testing/selftests/bpf/prog_tests/connect_force_port.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/core_reloc.c74
-rw-r--r--tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c44
-rw-r--r--tools/testing/selftests/bpf/prog_tests/flow_dissector.c44
-rw-r--r--tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c4
-rw-r--r--tools/testing/selftests/bpf/prog_tests/global_data.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/global_data_init.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/global_func_args.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/htab_update.c126
-rw-r--r--tools/testing/selftests/bpf/prog_tests/kfree_skb.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/kfunc_call.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/l4lb_all.c4
-rw-r--r--tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c4
-rw-r--r--tools/testing/selftests/bpf/prog_tests/map_lock.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/pinning.c4
-rw-r--r--tools/testing/selftests/bpf/prog_tests/pkt_access.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/pkt_md_access.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/probe_user.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/queue_stack_map.c4
-rw-r--r--tools/testing/selftests/bpf/prog_tests/rdonly_maps.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/reference_tracking.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/resolve_btfids.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/select_reuseport.c4
-rw-r--r--tools/testing/selftests/bpf/prog_tests/setget_sockopt.c125
-rw-r--r--tools/testing/selftests/bpf/prog_tests/sk_assign.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/skb_ctx.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/skb_helpers.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/sockopt_multi.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/spinlock.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/stacktrace_map.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/tailcalls.c36
-rw-r--r--tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/tcp_estats.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c10
-rw-r--r--tools/testing/selftests/bpf/prog_tests/test_global_funcs.c34
-rw-r--r--tools/testing/selftests/bpf/prog_tests/test_local_storage.c10
-rw-r--r--tools/testing/selftests/bpf/prog_tests/test_overhead.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/tp_attach_query.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/trampoline_count.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp_adjust_frags.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c10
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp_attach.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp_info.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp_perf.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c2
-rw-r--r--tools/testing/selftests/bpf/progs/bind4_prog.c2
-rw-r--r--tools/testing/selftests/bpf/progs/bind6_prog.c2
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_flow.c15
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter.h7
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_tracing_net.h32
-rw-r--r--tools/testing/selftests/bpf/progs/cb_refs.c116
-rw-r--r--tools/testing/selftests/bpf/progs/cgroup_getset_retval_hooks.c16
-rw-r--r--tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c226
-rw-r--r--tools/testing/selftests/bpf/progs/cgroup_iter.c39
-rw-r--r--tools/testing/selftests/bpf/progs/connect4_prog.c5
-rw-r--r--tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c8
-rw-r--r--tools/testing/selftests/bpf/progs/htab_update.c29
-rw-r--r--tools/testing/selftests/bpf/progs/read_bpf_task_storage_busy.c39
-rw-r--r--tools/testing/selftests/bpf/progs/setget_sockopt.c395
-rw-r--r--tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c2
-rw-r--r--tools/testing/selftests/bpf/progs/test_tc_dtime.c1
-rw-r--r--tools/testing/selftests/bpf/progs/test_tunnel_kern.c24
-rw-r--r--tools/testing/selftests/bpf/progs/timer.c11
-rw-r--r--tools/testing/selftests/bpf/task_local_storage_helpers.h18
-rw-r--r--tools/testing/selftests/bpf/test_dev_cgroup.c2
-rwxr-xr-xtools/testing/selftests/bpf/test_flow_dissector.sh8
-rw-r--r--tools/testing/selftests/bpf/test_lirc_mode2_user.c2
-rw-r--r--tools/testing/selftests/bpf/test_maps.c48
-rwxr-xr-xtools/testing/selftests/bpf/test_offload.py22
-rwxr-xr-xtools/testing/selftests/bpf/test_skb_cgroup_id.sh2
-rw-r--r--tools/testing/selftests/bpf/test_sock_addr.c16
-rw-r--r--tools/testing/selftests/bpf/test_sockmap.c4
-rw-r--r--tools/testing/selftests/bpf/test_sysctl.c6
-rwxr-xr-xtools/testing/selftests/bpf/test_tcp_check_syncookie.sh2
-rw-r--r--tools/testing/selftests/bpf/test_tcpnotify_user.c2
-rwxr-xr-xtools/testing/selftests/bpf/test_xdp_redirect.sh8
-rwxr-xr-xtools/testing/selftests/bpf/test_xdp_redirect_multi.sh2
-rwxr-xr-xtools/testing/selftests/bpf/test_xdp_veth.sh8
-rwxr-xr-xtools/testing/selftests/bpf/test_xsk.sh52
-rw-r--r--tools/testing/selftests/bpf/xdp_redirect_multi.c2
-rw-r--r--tools/testing/selftests/bpf/xdp_synproxy.c2
-rw-r--r--tools/testing/selftests/bpf/xdping.c2
-rw-r--r--tools/testing/selftests/bpf/xsk.c6
-rw-r--r--tools/testing/selftests/bpf/xskxceiver.c398
-rw-r--r--tools/testing/selftests/bpf/xskxceiver.h11
159 files changed, 5225 insertions, 1358 deletions
diff --git a/arch/mips/net/bpf_jit_comp32.c b/arch/mips/net/bpf_jit_comp32.c
index 83c975d5cca2..ace5db3fbd17 100644
--- a/arch/mips/net/bpf_jit_comp32.c
+++ b/arch/mips/net/bpf_jit_comp32.c
@@ -1377,11 +1377,19 @@ void build_prologue(struct jit_context *ctx)
int stack, saved, locals, reserved;
/*
+ * In the unlikely event that the TCC limit is raised to more
+ * than 16 bits, it is clamped to the maximum value allowed for
+ * the generated code (0xffff). It is better fail to compile
+ * instead of degrading gracefully.
+ */
+ BUILD_BUG_ON(MAX_TAIL_CALL_CNT > 0xffff);
+
+ /*
* The first two instructions initialize TCC in the reserved (for us)
* 16-byte area in the parent's stack frame. On a tail call, the
* calling function jumps into the prologue after these instructions.
*/
- emit(ctx, ori, MIPS_R_T9, MIPS_R_ZERO, min(MAX_TAIL_CALL_CNT, 0xffff));
+ emit(ctx, ori, MIPS_R_T9, MIPS_R_ZERO, MAX_TAIL_CALL_CNT);
emit(ctx, sw, MIPS_R_T9, 0, MIPS_R_SP);
/*
diff --git a/arch/mips/net/bpf_jit_comp64.c b/arch/mips/net/bpf_jit_comp64.c
index 6475828ffb36..0e7c1bdcf914 100644
--- a/arch/mips/net/bpf_jit_comp64.c
+++ b/arch/mips/net/bpf_jit_comp64.c
@@ -548,11 +548,19 @@ void build_prologue(struct jit_context *ctx)
int stack, saved, locals, reserved;
/*
+ * In the unlikely event that the TCC limit is raised to more
+ * than 16 bits, it is clamped to the maximum value allowed for
+ * the generated code (0xffff). It is better fail to compile
+ * instead of degrading gracefully.
+ */
+ BUILD_BUG_ON(MAX_TAIL_CALL_CNT > 0xffff);
+
+ /*
* The first instruction initializes the tail call count register.
* On a tail call, the calling function jumps into the prologue
* after this instruction.
*/
- emit(ctx, ori, tc, MIPS_R_ZERO, min(MAX_TAIL_CALL_CNT, 0xffff));
+ emit(ctx, ori, tc, MIPS_R_ZERO, MAX_TAIL_CALL_CNT);
/* === Entry-point for tail calls === */
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 2bd1b5f8de9b..57e9e109257e 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -414,6 +414,11 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr,
int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
int cgroup_bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr);
+
+const struct bpf_func_proto *
+cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
+const struct bpf_func_proto *
+cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
#else
static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
@@ -444,6 +449,18 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
return -EINVAL;
}
+static inline const struct bpf_func_proto *
+cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ return NULL;
+}
+
+static inline const struct bpf_func_proto *
+cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ return NULL;
+}
+
static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux,
struct bpf_map *map) { return 0; }
static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a627a02cf8ab..9c1674973e03 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -48,6 +48,7 @@ struct mem_cgroup;
struct module;
struct bpf_func_state;
struct ftrace_ops;
+struct cgroup;
extern struct idr btf_idr;
extern spinlock_t btf_idr_lock;
@@ -1730,7 +1731,14 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
int __init bpf_iter_ ## target(args) { return 0; }
struct bpf_iter_aux_info {
+ /* for map_elem iter */
struct bpf_map *map;
+
+ /* for cgroup iter */
+ struct {
+ struct cgroup *start; /* starting cgroup */
+ enum bpf_cgroup_iter_order order;
+ } cgroup;
};
typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
@@ -1966,6 +1974,15 @@ static inline bool unprivileged_ebpf_enabled(void)
return !sysctl_unprivileged_bpf_disabled;
}
+/* Not all bpf prog type has the bpf_ctx.
+ * For the bpf prog type that has initialized the bpf_ctx,
+ * this function can be used to decide if a kernel function
+ * is called by a bpf program.
+ */
+static inline bool has_current_bpf_ctx(void)
+{
+ return !!current->bpf_ctx;
+}
#else /* !CONFIG_BPF_SYSCALL */
static inline struct bpf_prog *bpf_prog_get(u32 ufd)
{
@@ -2175,6 +2192,10 @@ static inline bool unprivileged_ebpf_enabled(void)
return false;
}
+static inline bool has_current_bpf_ctx(void)
+{
+ return false;
+}
#endif /* CONFIG_BPF_SYSCALL */
void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
@@ -2362,6 +2383,7 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto;
extern const struct bpf_func_proto bpf_sock_hash_update_proto;
extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
extern const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto;
+extern const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto;
extern const struct bpf_func_proto bpf_msg_redirect_hash_proto;
extern const struct bpf_func_proto bpf_msg_redirect_map_proto;
extern const struct bpf_func_proto bpf_sk_redirect_hash_proto;
diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
new file mode 100644
index 000000000000..3e164b8efaa9
--- /dev/null
+++ b/include/linux/bpf_mem_alloc.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+#ifndef _BPF_MEM_ALLOC_H
+#define _BPF_MEM_ALLOC_H
+#include <linux/compiler_types.h>
+#include <linux/workqueue.h>
+
+struct bpf_mem_cache;
+struct bpf_mem_caches;
+
+struct bpf_mem_alloc {
+ struct bpf_mem_caches __percpu *caches;
+ struct bpf_mem_cache __percpu *cache;
+ struct work_struct work;
+};
+
+int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu);
+void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
+
+/* kmalloc/kfree equivalent: */
+void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size);
+void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
+
+/* kmem_cache_alloc/free equivalent: */
+void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma);
+void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr);
+
+#endif /* _BPF_MEM_ALLOC_H */
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 2e3bad8640dc..1fdddbf3546b 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -212,6 +212,17 @@ struct bpf_reference_state {
* is used purely to inform the user of a reference leak.
*/
int insn_idx;
+ /* There can be a case like:
+ * main (frame 0)
+ * cb (frame 1)
+ * func (frame 3)
+ * cb (frame 4)
+ * Hence for frame 4, if callback_ref just stored boolean, it would be
+ * impossible to distinguish nested callback refs. Hence store the
+ * frameno and compare that to callback_ref in check_reference_leak when
+ * exiting a callback function.
+ */
+ int callback_ref;
};
/* state of the program:
diff --git a/include/linux/filter.h b/include/linux/filter.h
index a5f21dc3c432..527ae1d64e27 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -900,8 +900,7 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk);
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk);
void sk_reuseport_prog_free(struct bpf_prog *prog);
int sk_detach_filter(struct sock *sk);
-int sk_get_filter(struct sock *sk, struct sock_filter __user *filter,
- unsigned int len);
+int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len);
bool sk_filter_charge(struct sock *sk, struct sk_filter *fp);
void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index 93c262ecbdc9..78890143f079 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -118,9 +118,9 @@ extern int ip_mc_source(int add, int omode, struct sock *sk,
struct ip_mreq_source *mreqs, int ifindex);
extern int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf,int ifindex);
extern int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
- struct ip_msfilter __user *optval, int __user *optlen);
+ sockptr_t optval, sockptr_t optlen);
extern int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
- struct sockaddr_storage __user *p);
+ sockptr_t optval, size_t offset);
extern int ip_mc_sf_allow(struct sock *sk, __be32 local, __be32 rmt,
int dif, int sdif);
extern void ip_mc_init_dev(struct in_device *);
diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index 6cbbfe94348c..80b8400ab8b2 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -17,7 +17,7 @@ static inline int ip_mroute_opt(int opt)
}
int ip_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int);
-int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *);
+int ip_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t);
int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg);
int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
int ip_mr_init(void);
@@ -29,8 +29,8 @@ static inline int ip_mroute_setsockopt(struct sock *sock, int optname,
return -ENOPROTOOPT;
}
-static inline int ip_mroute_getsockopt(struct sock *sock, int optname,
- char __user *optval, int __user *optlen)
+static inline int ip_mroute_getsockopt(struct sock *sk, int optname,
+ sockptr_t optval, sockptr_t optlen)
{
return -ENOPROTOOPT;
}
diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index bc351a85ce9b..8f2b307fb124 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -27,7 +27,7 @@ struct sock;
#ifdef CONFIG_IPV6_MROUTE
extern int ip6_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int);
-extern int ip6_mroute_getsockopt(struct sock *, int, char __user *, int __user *);
+extern int ip6_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t);
extern int ip6_mr_input(struct sk_buff *skb);
extern int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg);
extern int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
@@ -42,7 +42,7 @@ static inline int ip6_mroute_setsockopt(struct sock *sock, int optname,
static inline
int ip6_mroute_getsockopt(struct sock *sock,
- int optname, char __user *optval, int __user *optlen)
+ int optname, sockptr_t optval, sockptr_t optlen)
{
return -ENOPROTOOPT;
}
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b51d07a727c9..43c37385f1e9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1461,8 +1461,8 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
unsigned int key_count);
struct bpf_flow_dissector;
-bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
- __be16 proto, int nhoff, int hlen, unsigned int flags);
+u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
+ __be16 proto, int nhoff, int hlen, unsigned int flags);
bool __skb_flow_dissect(const struct net *net,
const struct sk_buff *skb,
diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h
index d45902fb4cad..bae5e2369b4f 100644
--- a/include/linux/sockptr.h
+++ b/include/linux/sockptr.h
@@ -64,6 +64,11 @@ static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset,
return 0;
}
+static inline int copy_to_sockptr(sockptr_t dst, const void *src, size_t size)
+{
+ return copy_to_sockptr_offset(dst, 0, src, size);
+}
+
static inline void *memdup_sockptr(sockptr_t src, size_t len)
{
void *p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
diff --git a/include/linux/tnum.h b/include/linux/tnum.h
index 498dbcedb451..1c3948a1d6ad 100644
--- a/include/linux/tnum.h
+++ b/include/linux/tnum.h
@@ -21,7 +21,12 @@ struct tnum {
struct tnum tnum_const(u64 value);
/* A completely unknown value */
extern const struct tnum tnum_unknown;
-/* A value that's unknown except that @min <= value <= @max */
+/* An unknown value that is a superset of @min <= value <= @max.
+ *
+ * Could include values outside the range of [@min, @max].
+ * For example tnum_range(0, 2) is represented by {0, 1, 2, *3*},
+ * rather than the intended set of {0, 1, 2}.
+ */
struct tnum tnum_range(u64 min, u64 max);
/* Arithmetic and logical ops */
@@ -73,7 +78,18 @@ static inline bool tnum_is_unknown(struct tnum a)
*/
bool tnum_is_aligned(struct tnum a, u64 size);
-/* Returns true if @b represents a subset of @a. */
+/* Returns true if @b represents a subset of @a.
+ *
+ * Note that using tnum_range() as @a requires extra cautions as tnum_in() may
+ * return true unexpectedly due to tnum limited ability to represent tight
+ * range, e.g.
+ *
+ * tnum_in(tnum_range(0, 2), tnum_const(3)) == true
+ *
+ * As a rule of thumb, if @a is explicitly coded rather than coming from
+ * reg->var_off, it should be in form of tnum_const(), tnum_range(0, 2**n - 1),
+ * or tnum_range(2**n, 2**(n+1) - 1).
+ */
bool tnum_in(struct tnum a, struct tnum b);
/* Formatting functions. These have snprintf-like semantics: they will write
diff --git a/include/net/ip.h b/include/net/ip.h
index 1c979fd1904c..038097c2a152 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -743,8 +743,12 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
int ip_cmsg_send(struct sock *sk, struct msghdr *msg,
struct ipcm_cookie *ipc, bool allow_ipv6);
DECLARE_STATIC_KEY_FALSE(ip4_min_ttl);
+int do_ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+ unsigned int optlen);
int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
unsigned int optlen);
+int do_ip_getsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, sockptr_t optlen);
int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
int __user *optlen);
int ip_ra_control(struct sock *sk, unsigned char on,
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index de9dcc5652c4..d664ba5812d8 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1156,8 +1156,12 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
*/
DECLARE_STATIC_KEY_FALSE(ip6_min_hopcount);
+int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+ unsigned int optlen);
int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
unsigned int optlen);
+int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, sockptr_t optlen);
int ipv6_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen);
@@ -1207,7 +1211,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf,
struct sockaddr_storage *list);
int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
- struct sockaddr_storage __user *p);
+ sockptr_t optval, size_t ss_offset);
#ifdef CONFIG_PROC_FS
int ac6_proc_init(struct net *net);
diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h
index 45e0339be6fa..c48186bf4737 100644
--- a/include/net/ipv6_stubs.h
+++ b/include/net/ipv6_stubs.h
@@ -81,6 +81,10 @@ struct ipv6_bpf_stub {
const struct in6_addr *daddr, __be16 dport,
int dif, int sdif, struct udp_table *tbl,
struct sk_buff *skb);
+ int (*ipv6_setsockopt)(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen);
+ int (*ipv6_getsockopt)(struct sock *sk, int level, int optname,
+ sockptr_t optval, sockptr_t optlen);
};
extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
diff --git a/include/net/sock.h b/include/net/sock.h
index ca469980e006..96a31026e35d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1788,6 +1788,11 @@ static inline void unlock_sock_fast(struct sock *sk, bool slow)
}
}
+void sockopt_lock_sock(struct sock *sk);
+void sockopt_release_sock(struct sock *sk);
+bool sockopt_ns_capable(struct user_namespace *ns, int cap);
+bool sockopt_capable(int cap);
+
/* Used by processes to "lock" a socket state, so that
* interrupts and bottom half handlers won't change it
* from under us. It essentially blocks any incoming
@@ -1862,9 +1867,13 @@ void sock_pfree(struct sk_buff *skb);
#define sock_edemux sock_efree
#endif
+int sk_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen);
int sock_setsockopt(struct socket *sock, int level, int op,
sockptr_t optval, unsigned int optlen);
+int sk_getsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, sockptr_t optlen);
int sock_getsockopt(struct socket *sock, int level, int op,
char __user *optval, int __user *optlen);
int sock_gettstamp(struct socket *sock, void __user *userstamp,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d10962b9f0d0..735e957f7f4b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -402,9 +402,13 @@ void tcp_init_sock(struct sock *sk);
void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb);
__poll_t tcp_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait);
+int do_tcp_getsockopt(struct sock *sk, int level,
+ int optname, sockptr_t optval, sockptr_t optlen);
int tcp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen);
bool tcp_bpf_bypass_getsockopt(int level, int optname);
+int do_tcp_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen);
int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
unsigned int optlen);
void tcp_set_keepalive(struct sock *sk, int val);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1d6085e15fc8..793103b10eab 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -87,10 +87,29 @@ struct bpf_cgroup_storage_key {
__u32 attach_type; /* program attach type (enum bpf_attach_type) */
};
+enum bpf_cgroup_iter_order {
+ BPF_CGROUP_ITER_ORDER_UNSPEC = 0,
+ BPF_CGROUP_ITER_SELF_ONLY, /* process only a single object. */
+ BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */
+ BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */
+ BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */
+};
+
union bpf_iter_link_info {
struct {
__u32 map_fd;
} map;
+ struct {
+ enum bpf_cgroup_iter_order order;
+
+ /* At most one of cgroup_fd and cgroup_id can be non-zero. If
+ * both are zero, the walk starts from the default cgroup v2
+ * root. For walking v1 hierarchy, one should always explicitly
+ * specify cgroup_fd.
+ */
+ __u32 cgroup_fd;
+ __u64 cgroup_id;
+ } cgroup;
};
/* BPF syscall commands, see bpf(2) man-page for more details. */
@@ -4437,7 +4456,7 @@ union bpf_attr {
*
* **-EEXIST** if the option already exists.
*
- * **-EFAULT** on failrue to parse the existing header options.
+ * **-EFAULT** on failure to parse the existing header options.
*
* **-EPERM** if the helper cannot be used under the current
* *skops*\ **->op**.
@@ -4646,7 +4665,7 @@ union bpf_attr {
* a *map* with *task* as the **key**. From this
* perspective, the usage is not much different from
* **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this
- * helper enforces the key must be an task_struct and the map must also
+ * helper enforces the key must be a task_struct and the map must also
* be a **BPF_MAP_TYPE_TASK_STORAGE**.
*
* Underneath, the value is stored locally at *task* instead of
@@ -4704,7 +4723,7 @@ union bpf_attr {
*
* long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size)
* Description
- * Returns the stored IMA hash of the *inode* (if it's avaialable).
+ * Returns the stored IMA hash of the *inode* (if it's available).
* If the hash is larger than *size*, then only *size*
* bytes will be copied to *dst*
* Return
@@ -4728,12 +4747,12 @@ union bpf_attr {
*
* The argument *len_diff* can be used for querying with a planned
* size change. This allows to check MTU prior to changing packet
- * ctx. Providing an *len_diff* adjustment that is larger than the
+ * ctx. Providing a *len_diff* adjustment that is larger than the
* actual packet size (resulting in negative packet size) will in
- * principle not exceed the MTU, why it is not considered a
- * failure. Other BPF-helpers are needed for performing the
- * planned size change, why the responsability for catch a negative
- * packet size belong in those helpers.
+ * principle not exceed the MTU, which is why it is not considered
+ * a failure. Other BPF helpers are needed for performing the
+ * planned size change; therefore the responsibility for catching
+ * a negative packet size belongs in those helpers.
*
* Specifying *ifindex* zero means the MTU check is performed
* against the current net device. This is practical if this isn't
@@ -5085,17 +5104,29 @@ union bpf_attr {
*
* int bpf_get_retval(void)
* Description
- * Get the syscall's return value that will be returned to userspace.
+ * Get the BPF program's return value that will be returned to the upper layers.
*
- * This helper is currently supported by cgroup programs only.
+ * This helper is currently supported by cgroup programs and only by the hooks
+ * where BPF program's return value is returned to the userspace via errno.
* Return
- * The syscall's return value.
+ * The BPF program's return value.
*
* int bpf_set_retval(int retval)
* Description
- * Set the syscall's return value that will be returned to userspace.
+ * Set the BPF program's return value that will be returned to the upper layers.
+ *
+ * This helper is currently supported by cgroup programs and only by the hooks
+ * where BPF program's return value is returned to the userspace via errno.
+ *
+ * Note that there is the following corner case where the program exports an error
+ * via bpf_set_retval but signals success via 'return 1':
+ *
+ * bpf_set_retval(-EPERM);
+ * return 1;
+ *
+ * In this case, the BPF program's return value will use helper's -EPERM. This
+ * still holds true for cgroup/bind{4,6} which supports extra 'return 3' success case.
*
- * This helper is currently supported by cgroup programs only.
* Return
* 0 on success, or a negative error in case of failure.
*
@@ -5628,6 +5659,11 @@ enum {
BPF_F_SEQ_NUMBER = (1ULL << 3),
};
+/* BPF_FUNC_skb_get_tunnel_key flags. */
+enum {
+ BPF_F_TUNINFO_FLAGS = (1ULL << 4),
+};
+
/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and
* BPF_FUNC_perf_event_read_value flags.
*/
@@ -5817,7 +5853,10 @@ struct bpf_tunnel_key {
};
__u8 tunnel_tos;
__u8 tunnel_ttl;
- __u16 tunnel_ext; /* Padding, future use. */
+ union {
+ __u16 tunnel_ext; /* compat */
+ __be16 tunnel_flags;
+ };
__u32 tunnel_label;
union {
__u32 local_ipv4;
@@ -5861,6 +5900,11 @@ enum bpf_ret_code {
* represented by BPF_REDIRECT above).
*/
BPF_LWT_REROUTE = 128,
+ /* BPF_FLOW_DISSECTOR_CONTINUE: used by BPF_PROG_TYPE_FLOW_DISSECTOR
+ * to indicate that no custom dissection was performed, and
+ * fallback to standard dissector is requested.
+ */
+ BPF_FLOW_DISSECTOR_CONTINUE = 129,
};
struct bpf_sock {
@@ -6159,11 +6203,22 @@ struct bpf_link_info {
struct {
__aligned_u64 target_name; /* in/out: target_name buffer ptr */
__u32 target_name_len; /* in/out: target_name buffer len */
+
+ /* If the iter specific field is 32 bits, it can be put
+ * in the first or second union. Otherwise it should be
+ * put in the second union.
+ */
union {
struct {
__u32 map_id;
} map;
};
+ union {
+ struct {
+ __u64 cgroup_id;
+ __u32 order;
+ } cgroup;
+ };
} iter;
struct {
__u32 netns_ino;
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 057ba8e01e70..341c94f208f4 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -13,7 +13,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
-obj-$(CONFIG_BPF_SYSCALL) += btf.o
+obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o
obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
@@ -24,6 +24,9 @@ endif
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
+ifeq ($(CONFIG_CGROUPS),y)
+obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o
+endif
obj-$(CONFIG_CGROUP_BPF) += cgroup.o
ifeq ($(CONFIG_INET),y)
obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 97bb57493ed5..5dc307bdeaeb 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -694,19 +694,24 @@ struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
{
+ struct bpf_run_ctx run_ctx, *old_run_ctx;
int ret;
if (prog->aux->sleepable) {
rcu_read_lock_trace();
migrate_disable();
might_fault();
+ old_run_ctx = bpf_set_run_ctx(&run_ctx);
ret = bpf_prog_run(prog, ctx);
+ bpf_reset_run_ctx(old_run_ctx);
migrate_enable();
rcu_read_unlock_trace();
} else {
rcu_read_lock();
migrate_disable();
+ old_run_ctx = bpf_set_run_ctx(&run_ctx);
ret = bpf_prog_run(prog, ctx);
+ bpf_reset_run_ctx(old_run_ctx);
migrate_enable();
rcu_read_unlock();
}
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 4ee2e7286c23..802fc15b0d73 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -555,11 +555,11 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
struct bpf_local_storage_elem, map_node))) {
if (busy_counter) {
migrate_disable();
- __this_cpu_inc(*busy_counter);
+ this_cpu_inc(*busy_counter);
}
bpf_selem_unlink(selem, false);
if (busy_counter) {
- __this_cpu_dec(*busy_counter);
+ this_cpu_dec(*busy_counter);
migrate_enable();
}
cond_resched_rcu();
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index fa71d58b7ded..4fd845bc5a12 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -41,17 +41,21 @@ BTF_SET_END(bpf_lsm_hooks)
*/
BTF_SET_START(bpf_lsm_current_hooks)
/* operate on freshly allocated sk without any cgroup association */
+#ifdef CONFIG_SECURITY_NETWORK
BTF_ID(func, bpf_lsm_sk_alloc_security)
BTF_ID(func, bpf_lsm_sk_free_security)
+#endif
BTF_SET_END(bpf_lsm_current_hooks)
/* List of LSM hooks that trigger while the socket is properly locked.
*/
BTF_SET_START(bpf_lsm_locked_sockopt_hooks)
+#ifdef CONFIG_SECURITY_NETWORK
BTF_ID(func, bpf_lsm_socket_sock_rcv_skb)
BTF_ID(func, bpf_lsm_sock_graft)
BTF_ID(func, bpf_lsm_inet_csk_clone)
BTF_ID(func, bpf_lsm_inet_conn_established)
+#endif
BTF_SET_END(bpf_lsm_locked_sockopt_hooks)
/* List of LSM hooks that trigger while the socket is _not_ locked,
@@ -59,8 +63,10 @@ BTF_SET_END(bpf_lsm_locked_sockopt_hooks)
* in the early init phase.
*/
BTF_SET_START(bpf_lsm_unlocked_sockopt_hooks)
+#ifdef CONFIG_SECURITY_NETWORK
BTF_ID(func, bpf_lsm_socket_post_create)
BTF_ID(func, bpf_lsm_socket_socketpair)
+#endif
BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks)
#ifdef CONFIG_CGROUP_BPF
@@ -189,6 +195,14 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto = {
static const struct bpf_func_proto *
bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ if (prog->expected_attach_type == BPF_LSM_CGROUP) {
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+ }
+
switch (func_id) {
case BPF_FUNC_inode_storage_get:
return &bpf_inode_storage_get_proto;
@@ -212,15 +226,6 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL;
case BPF_FUNC_get_attach_cookie:
return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL;
- case BPF_FUNC_get_local_storage:
- return prog->expected_attach_type == BPF_LSM_CGROUP ?
- &bpf_get_local_storage_proto : NULL;
- case BPF_FUNC_set_retval:
- return prog->expected_attach_type == BPF_LSM_CGROUP ?
- &bpf_set_retval_proto : NULL;
- case BPF_FUNC_get_retval:
- return prog->expected_attach_type == BPF_LSM_CGROUP ?
- &bpf_get_retval_proto : NULL;
#ifdef CONFIG_NET
case BPF_FUNC_setsockopt:
if (prog->expected_attach_type != BPF_LSM_CGROUP)
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index e9014dc62682..6f290623347e 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -26,20 +26,20 @@ static DEFINE_PER_CPU(int, bpf_task_storage_busy);
static void bpf_task_storage_lock(void)
{
migrate_disable();
- __this_cpu_inc(bpf_task_storage_busy);
+ this_cpu_inc(bpf_task_storage_busy);
}
static void bpf_task_storage_unlock(void)
{
- __this_cpu_dec(bpf_task_storage_busy);
+ this_cpu_dec(bpf_task_storage_busy);
migrate_enable();
}
static bool bpf_task_storage_trylock(void)
{
migrate_disable();
- if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
- __this_cpu_dec(bpf_task_storage_busy);
+ if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
+ this_cpu_dec(bpf_task_storage_busy);
migrate_enable();
return false;
}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 4a400cd63731..00c7f864900e 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1529,6 +1529,37 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
return ret;
}
+BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
+{
+ /* flags argument is not used now,
+ * but provides an ability to extend the API.
+ * verifier checks that its value is correct.
+ */
+ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
+ struct bpf_cgroup_storage *storage;
+ struct bpf_cg_run_ctx *ctx;
+ void *ptr;
+
+ /* get current cgroup storage from BPF run context */
+ ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+ storage = ctx->prog_item->cgroup_storage[stype];
+
+ if (stype == BPF_CGROUP_STORAGE_SHARED)
+ ptr = &READ_ONCE(storage->buf)->data[0];
+ else
+ ptr = this_cpu_ptr(storage->percpu_buf);
+
+ return (unsigned long)ptr;
+}
+
+const struct bpf_func_proto bpf_get_local_storage_proto = {
+ .func = bpf_get_local_storage,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
+
BPF_CALL_0(bpf_get_retval)
{
struct bpf_cg_run_ctx *ctx =
@@ -1560,32 +1591,26 @@ const struct bpf_func_proto bpf_set_retval_proto = {
};
static const struct bpf_func_proto *
-cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
- case BPF_FUNC_get_current_cgroup_id:
- return &bpf_get_current_cgroup_id_proto;
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
- case BPF_FUNC_get_retval:
- return &bpf_get_retval_proto;
- case BPF_FUNC_set_retval:
- return &bpf_set_retval_proto;
default:
return bpf_base_func_proto(func_id);
}
}
-static const struct bpf_func_proto *
-cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
-{
- return cgroup_base_func_proto(func_id, prog);
-}
-
static bool cgroup_dev_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
@@ -2098,11 +2123,17 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
static const struct bpf_func_proto *
sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- case BPF_FUNC_strtol:
- return &bpf_strtol_proto;
- case BPF_FUNC_strtoul:
- return &bpf_strtoul_proto;
case BPF_FUNC_sysctl_get_name:
return &bpf_sysctl_get_name_proto;
case BPF_FUNC_sysctl_get_current_value:
@@ -2113,8 +2144,10 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sysctl_set_new_value_proto;
case BPF_FUNC_ktime_get_coarse_ns:
return &bpf_ktime_get_coarse_ns_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
default:
- return cgroup_base_func_proto(func_id, prog);
+ return bpf_base_func_proto(func_id);
}
}
@@ -2235,6 +2268,16 @@ static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
static const struct bpf_func_proto *
cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
#ifdef CONFIG_NET
case BPF_FUNC_get_netns_cookie:
@@ -2256,8 +2299,10 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto;
#endif
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
default:
- return cgroup_base_func_proto(func_id, prog);
+ return bpf_base_func_proto(func_id);
}
}
@@ -2422,3 +2467,69 @@ const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
const struct bpf_prog_ops cg_sockopt_prog_ops = {
};
+
+/* Common helpers for cgroup hooks. */
+const struct bpf_func_proto *
+cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_get_local_storage:
+ return &bpf_get_local_storage_proto;
+ case BPF_FUNC_get_retval:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_SOCK_OPS:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ return NULL;
+ default:
+ return &bpf_get_retval_proto;
+ }
+ case BPF_FUNC_set_retval:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_SOCK_OPS:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ return NULL;
+ default:
+ return &bpf_set_retval_proto;
+ }
+ default:
+ return NULL;
+ }
+}
+
+/* Common helpers for cgroup hooks with valid process context. */
+const struct bpf_func_proto *
+cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_comm:
+ return &bpf_get_current_comm_proto;
+ case BPF_FUNC_get_current_cgroup_id:
+ return &bpf_get_current_cgroup_id_proto;
+ case BPF_FUNC_get_current_ancestor_cgroup_id:
+ return &bpf_get_current_ancestor_cgroup_id_proto;
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_curr_proto;
+#endif
+ default:
+ return NULL;
+ }
+}
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
new file mode 100644
index 000000000000..0d200a993489
--- /dev/null
+++ b/kernel/bpf/cgroup_iter.c
@@ -0,0 +1,282 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Google */
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/cgroup.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+
+#include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */
+
+/* cgroup_iter provides four modes of traversal to the cgroup hierarchy.
+ *
+ * 1. Walk the descendants of a cgroup in pre-order.
+ * 2. Walk the descendants of a cgroup in post-order.
+ * 3. Walk the ancestors of a cgroup.
+ * 4. Show the given cgroup only.
+ *
+ * For walking descendants, cgroup_iter can walk in either pre-order or
+ * post-order. For walking ancestors, the iter walks up from a cgroup to
+ * the root.
+ *
+ * The iter program can terminate the walk early by returning 1. Walk
+ * continues if prog returns 0.
+ *
+ * The prog can check (seq->num == 0) to determine whether this is
+ * the first element. The prog may also be passed a NULL cgroup,
+ * which means the walk has completed and the prog has a chance to
+ * do post-processing, such as outputting an epilogue.
+ *
+ * Note: the iter_prog is called with cgroup_mutex held.
+ *
+ * Currently only one session is supported, which means, depending on the
+ * volume of data bpf program intends to send to user space, the number
+ * of cgroups that can be walked is limited. For example, given the current
+ * buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each
+ * cgroup, assuming PAGE_SIZE is 4kb, the total number of cgroups that can
+ * be walked is 512. This is a limitation of cgroup_iter. If the output data
+ * is larger than the kernel buffer size, after all data in the kernel buffer
+ * is consumed by user space, the subsequent read() syscall will signal
+ * EOPNOTSUPP. In order to work around, the user may have to update their
+ * program to reduce the volume of data sent to output. For example, skip
+ * some uninteresting cgroups.
+ */
+
+struct bpf_iter__cgroup {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct cgroup *, cgroup);
+};
+
+struct cgroup_iter_priv {
+ struct cgroup_subsys_state *start_css;
+ bool visited_all;
+ bool terminate;
+ int order;
+};
+
+static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct cgroup_iter_priv *p = seq->private;
+
+ mutex_lock(&cgroup_mutex);
+
+ /* cgroup_iter doesn't support read across multiple sessions. */
+ if (*pos > 0) {
+ if (p->visited_all)
+ return NULL;
+
+ /* Haven't visited all, but because cgroup_mutex has dropped,
+ * return -EOPNOTSUPP to indicate incomplete iteration.
+ */
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ ++*pos;
+ p->terminate = false;
+ p->visited_all = false;
+ if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ return css_next_descendant_pre(NULL, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ return css_next_descendant_post(NULL, p->start_css);
+ else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */
+ return p->start_css;
+}
+
+static int __cgroup_iter_seq_show(struct seq_file *seq,
+ struct cgroup_subsys_state *css, int in_stop);
+
+static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
+{
+ struct cgroup_iter_priv *p = seq->private;
+
+ mutex_unlock(&cgroup_mutex);
+
+ /* pass NULL to the prog for post-processing */
+ if (!v) {
+ __cgroup_iter_seq_show(seq, NULL, true);
+ p->visited_all = true;
+ }
+}
+
+static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
+ struct cgroup_iter_priv *p = seq->private;
+
+ ++*pos;
+ if (p->terminate)
+ return NULL;
+
+ if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ return css_next_descendant_pre(curr, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ return css_next_descendant_post(curr, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP)
+ return curr->parent;
+ else /* BPF_CGROUP_ITER_SELF_ONLY */
+ return NULL;
+}
+
+static int __cgroup_iter_seq_show(struct seq_file *seq,
+ struct cgroup_subsys_state *css, int in_stop)
+{
+ struct cgroup_iter_priv *p = seq->private;
+ struct bpf_iter__cgroup ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ /* cgroup is dead, skip this element */
+ if (css && cgroup_is_dead(css->cgroup))
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.cgroup = css ? css->cgroup : NULL;
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ /* if prog returns > 0, terminate after this element. */
+ if (ret != 0)
+ p->terminate = true;
+
+ return 0;
+}
+
+static int cgroup_iter_seq_show(struct seq_file *seq, void *v)
+{
+ return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v,
+ false);
+}
+
+static const struct seq_operations cgroup_iter_seq_ops = {
+ .start = cgroup_iter_seq_start,
+ .next = cgroup_iter_seq_next,
+ .stop = cgroup_iter_seq_stop,
+ .show = cgroup_iter_seq_show,
+};
+
+BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
+
+static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
+{
+ struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
+ struct cgroup *cgrp = aux->cgroup.start;
+
+ p->start_css = &cgrp->self;
+ p->terminate = false;
+ p->visited_all = false;
+ p->order = aux->cgroup.order;
+ return 0;
+}
+
+static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
+ .seq_ops = &cgroup_iter_seq_ops,
+ .init_seq_private = cgroup_iter_seq_init,
+ .seq_priv_size = sizeof(struct cgroup_iter_priv),
+};
+
+static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ int fd = linfo->cgroup.cgroup_fd;
+ u64 id = linfo->cgroup.cgroup_id;
+ int order = linfo->cgroup.order;
+ struct cgroup *cgrp;
+
+ if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE &&
+ order != BPF_CGROUP_ITER_DESCENDANTS_POST &&
+ order != BPF_CGROUP_ITER_ANCESTORS_UP &&
+ order != BPF_CGROUP_ITER_SELF_ONLY)
+ return -EINVAL;
+
+ if (fd && id)
+ return -EINVAL;
+
+ if (fd)
+ cgrp = cgroup_get_from_fd(fd);
+ else if (id)
+ cgrp = cgroup_get_from_id(id);
+ else /* walk the entire hierarchy by default. */
+ cgrp = cgroup_get_from_path("/");
+
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ aux->cgroup.start = cgrp;
+ aux->cgroup.order = order;
+ return 0;
+}
+
+static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux)
+{
+ cgroup_put(aux->cgroup.start);
+}
+
+static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq)
+{
+ char *buf;
+
+ buf = kzalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf) {
+ seq_puts(seq, "cgroup_path:\t<unknown>\n");
+ goto show_order;
+ }
+
+ /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path
+ * will print nothing.
+ *
+ * Path is in the calling process's cgroup namespace.
+ */
+ cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
+ seq_printf(seq, "cgroup_path:\t%s\n", buf);
+ kfree(buf);
+
+show_order:
+ if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ seq_puts(seq, "order: descendants_pre\n");
+ else if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ seq_puts(seq, "order: descendants_post\n");
+ else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP)
+ seq_puts(seq, "order: ancestors_up\n");
+ else /* BPF_CGROUP_ITER_SELF_ONLY */
+ seq_puts(seq, "order: self_only\n");
+}
+
+static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux,
+ struct bpf_link_info *info)
+{
+ info->iter.cgroup.order = aux->cgroup.order;
+ info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start);
+ return 0;
+}
+
+DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta,
+ struct cgroup *cgroup)
+
+static struct bpf_iter_reg bpf_cgroup_reg_info = {
+ .target = "cgroup",
+ .feature = BPF_ITER_RESCHED,
+ .attach_target = bpf_iter_attach_cgroup,
+ .detach_target = bpf_iter_detach_cgroup,
+ .show_fdinfo = bpf_iter_cgroup_show_fdinfo,
+ .fill_link_info = bpf_iter_cgroup_fill_link_info,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__cgroup, cgroup),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &cgroup_iter_seq_info,
+};
+
+static int __init bpf_cgroup_iter_init(void)
+{
+ bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0];
+ return bpf_iter_reg_target(&bpf_cgroup_reg_info);
+}
+
+late_initcall(bpf_cgroup_iter_init);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index b301a63afa2f..0fe3f136cbbe 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -14,6 +14,7 @@
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
+#include <linux/bpf_mem_alloc.h>
#define HTAB_CREATE_FLAG_MASK \
(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
@@ -92,6 +93,8 @@ struct bucket {
struct bpf_htab {
struct bpf_map map;
+ struct bpf_mem_alloc ma;
+ struct bpf_mem_alloc pcpu_ma;
struct bucket *buckets;
void *elems;
union {
@@ -99,7 +102,12 @@ struct bpf_htab {
struct bpf_lru lru;
};
struct htab_elem *__percpu *extra_elems;
- atomic_t count; /* number of elements in this hashtable */
+ /* number of elements in non-preallocated hashtable are kept
+ * in either pcount or count
+ */
+ struct percpu_counter pcount;
+ atomic_t count;
+ bool use_percpu_counter;
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
u32 hashrnd;
@@ -114,14 +122,14 @@ struct htab_elem {
struct {
void *padding;
union {
- struct bpf_htab *htab;
struct pcpu_freelist_node fnode;
struct htab_elem *batch_flink;
};
};
};
union {
- struct rcu_head rcu;
+ /* pointer to per-cpu pointer */
+ void *ptr_to_pptr;
struct bpf_lru_node lru_node;
};
u32 hash;
@@ -162,17 +170,25 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab,
unsigned long *pflags)
{
unsigned long flags;
+ bool use_raw_lock;
hash = hash & HASHTAB_MAP_LOCK_MASK;
- migrate_disable();
+ use_raw_lock = htab_use_raw_lock(htab);
+ if (use_raw_lock)
+ preempt_disable();
+ else
+ migrate_disable();
if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
__this_cpu_dec(*(htab->map_locked[hash]));
- migrate_enable();
+ if (use_raw_lock)
+ preempt_enable();
+ else
+ migrate_enable();
return -EBUSY;
}
- if (htab_use_raw_lock(htab))
+ if (use_raw_lock)
raw_spin_lock_irqsave(&b->raw_lock, flags);
else
spin_lock_irqsave(&b->lock, flags);
@@ -185,13 +201,18 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab,
struct bucket *b, u32 hash,
unsigned long flags)
{
+ bool use_raw_lock = htab_use_raw_lock(htab);
+
hash = hash & HASHTAB_MAP_LOCK_MASK;
- if (htab_use_raw_lock(htab))
+ if (use_raw_lock)
raw_spin_unlock_irqrestore(&b->raw_lock, flags);
else
spin_unlock_irqrestore(&b->lock, flags);
__this_cpu_dec(*(htab->map_locked[hash]));
- migrate_enable();
+ if (use_raw_lock)
+ preempt_enable();
+ else
+ migrate_enable();
}
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
@@ -428,8 +449,6 @@ static int htab_map_alloc_check(union bpf_attr *attr)
bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
int numa_node = bpf_map_attr_numa_node(attr);
- BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
- offsetof(struct htab_elem, hash_node.pprev));
BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
offsetof(struct htab_elem, hash_node.pprev));
@@ -550,6 +569,29 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
htab_init_buckets(htab);
+/* compute_batch_value() computes batch value as num_online_cpus() * 2
+ * and __percpu_counter_compare() needs
+ * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
+ * for percpu_counter to be faster than atomic_t. In practice the average bpf
+ * hash map size is 10k, which means that a system with 64 cpus will fill
+ * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
+ * define our own batch count as 32 then 10k hash map can be filled up to 80%:
+ * 10k - 8k > 32 _batch_ * 64 _cpus_
+ * and __percpu_counter_compare() will still be fast. At that point hash map
+ * collisions will dominate its performance anyway. Assume that hash map filled
+ * to 50+% isn't going to be O(1) and use the following formula to choose
+ * between percpu_counter and atomic_t.
+ */
+#define PERCPU_COUNTER_BATCH 32
+ if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
+ htab->use_percpu_counter = true;
+
+ if (htab->use_percpu_counter) {
+ err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
+ if (err)
+ goto free_map_locked;
+ }
+
if (prealloc) {
err = prealloc_init(htab);
if (err)
@@ -563,6 +605,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (err)
goto free_prealloc;
}
+ } else {
+ err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
+ if (err)
+ goto free_map_locked;
+ if (percpu) {
+ err = bpf_mem_alloc_init(&htab->pcpu_ma,
+ round_up(htab->map.value_size, 8), true);
+ if (err)
+ goto free_map_locked;
+ }
}
return &htab->map;
@@ -573,6 +625,8 @@ free_map_locked:
for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
free_percpu(htab->map_locked[i]);
bpf_map_area_free(htab->buckets);
+ bpf_mem_alloc_destroy(&htab->pcpu_ma);
+ bpf_mem_alloc_destroy(&htab->ma);
free_htab:
lockdep_unregister_key(&htab->lockdep_key);
bpf_map_area_free(htab);
@@ -847,17 +901,9 @@ find_first_elem:
static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
- free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
+ bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
check_and_free_fields(htab, l);
- kfree(l);
-}
-
-static void htab_elem_free_rcu(struct rcu_head *head)
-{
- struct htab_elem *l = container_of(head, struct htab_elem, rcu);
- struct bpf_htab *htab = l->htab;
-
- htab_elem_free(htab, l);
+ bpf_mem_cache_free(&htab->ma, l);
}
static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
@@ -871,6 +917,31 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
}
}
+static bool is_map_full(struct bpf_htab *htab)
+{
+ if (htab->use_percpu_counter)
+ return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
+ PERCPU_COUNTER_BATCH) >= 0;
+ return atomic_read(&htab->count) >= htab->map.max_entries;
+}
+
+static void inc_elem_count(struct bpf_htab *htab)
+{
+ if (htab->use_percpu_counter)
+ percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
+ else
+ atomic_inc(&htab->count);
+}
+
+static void dec_elem_count(struct bpf_htab *htab)
+{
+ if (htab->use_percpu_counter)
+ percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
+ else
+ atomic_dec(&htab->count);
+}
+
+
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
htab_put_fd_value(htab, l);
@@ -879,9 +950,8 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
check_and_free_fields(htab, l);
__pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
- atomic_dec(&htab->count);
- l->htab = htab;
- call_rcu(&l->rcu, htab_elem_free_rcu);
+ dec_elem_count(htab);
+ htab_elem_free(htab, l);
}
}
@@ -906,13 +976,12 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
void *value, bool onallcpus)
{
- /* When using prealloc and not setting the initial value on all cpus,
- * zero-fill element values for other cpus (just as what happens when
- * not using prealloc). Otherwise, bpf program has no way to ensure
+ /* When not setting the initial value on all cpus, zero-fill element
+ * values for other cpus. Otherwise, bpf program has no way to ensure
* known initial values for cpus other than current one
* (onallcpus=false always when coming from bpf prog).
*/
- if (htab_is_prealloc(htab) && !onallcpus) {
+ if (!onallcpus) {
u32 size = round_up(htab->map.value_size, 8);
int current_cpu = raw_smp_processor_id();
int cpu;
@@ -963,19 +1032,16 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new = container_of(l, struct htab_elem, fnode);
}
} else {
- if (atomic_inc_return(&htab->count) > htab->map.max_entries)
- if (!old_elem) {
+ if (is_map_full(htab))
+ if (!old_elem)
/* when map is full and update() is replacing
* old element, it's ok to allocate, since
* old element will be freed immediately.
* Otherwise return an error
*/
- l_new = ERR_PTR(-E2BIG);
- goto dec_count;
- }
- l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size,
- GFP_NOWAIT | __GFP_NOWARN,
- htab->map.numa_node);
+ return ERR_PTR(-E2BIG);
+ inc_elem_count(htab);
+ l_new = bpf_mem_cache_alloc(&htab->ma);
if (!l_new) {
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
@@ -986,18 +1052,18 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
memcpy(l_new->key, key, key_size);
if (percpu) {
- size = round_up(size, 8);
if (prealloc) {
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
/* alloc_percpu zero-fills */
- pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
- GFP_NOWAIT | __GFP_NOWARN);
+ pptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
if (!pptr) {
- kfree(l_new);
+ bpf_mem_cache_free(&htab->ma, l_new);
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
+ l_new->ptr_to_pptr = pptr;
+ pptr = *(void **)pptr;
}
pcpu_init_value(htab, pptr, value, onallcpus);
@@ -1016,7 +1082,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new->hash = hash;
return l_new;
dec_count:
- atomic_dec(&htab->count);
+ dec_elem_count(htab);
return l_new;
}
@@ -1416,6 +1482,10 @@ static void delete_all_elements(struct bpf_htab *htab)
{
int i;
+ /* It's called from a worker thread, so disable migration here,
+ * since bpf_mem_cache_free() relies on that.
+ */
+ migrate_disable();
for (i = 0; i < htab->n_buckets; i++) {
struct hlist_nulls_head *head = select_bucket(htab, i);
struct hlist_nulls_node *n;
@@ -1426,6 +1496,7 @@ static void delete_all_elements(struct bpf_htab *htab)
htab_elem_free(htab, l);
}
}
+ migrate_enable();
}
static void htab_free_malloced_timers(struct bpf_htab *htab)
@@ -1475,10 +1546,10 @@ static void htab_map_free(struct bpf_map *map)
* There is no need to synchronize_rcu() here to protect map elements.
*/
- /* some of free_htab_elem() callbacks for elements of this map may
- * not have executed. Wait for them.
+ /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
+ * underneath and is reponsible for waiting for callbacks to finish
+ * during bpf_mem_alloc_destroy().
*/
- rcu_barrier();
if (!htab_is_prealloc(htab)) {
delete_all_elements(htab);
} else {
@@ -1489,6 +1560,10 @@ static void htab_map_free(struct bpf_map *map)
bpf_map_free_kptr_off_tab(map);
free_percpu(htab->extra_elems);
bpf_map_area_free(htab->buckets);
+ bpf_mem_alloc_destroy(&htab->pcpu_ma);
+ bpf_mem_alloc_destroy(&htab->ma);
+ if (htab->use_percpu_counter)
+ percpu_counter_destroy(&htab->pcount);
for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
free_percpu(htab->map_locked[i]);
lockdep_unregister_key(&htab->lockdep_key);
@@ -1691,8 +1766,11 @@ again_nocopy:
/* do not grab the lock unless need it (bucket_cnt > 0). */
if (locked) {
ret = htab_lock_bucket(htab, b, batch, &flags);
- if (ret)
- goto next_batch;
+ if (ret) {
+ rcu_read_unlock();
+ bpf_enable_instrumentation();
+ goto after_loop;
+ }
}
bucket_cnt = 0;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 3c1b9bbcf971..fc08035f14ed 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -427,40 +427,7 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
};
-
-#ifdef CONFIG_CGROUP_BPF
-
-BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
-{
- /* flags argument is not used now,
- * but provides an ability to extend the API.
- * verifier checks that its value is correct.
- */
- enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
- struct bpf_cgroup_storage *storage;
- struct bpf_cg_run_ctx *ctx;
- void *ptr;
-
- /* get current cgroup storage from BPF run context */
- ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
- storage = ctx->prog_item->cgroup_storage[stype];
-
- if (stype == BPF_CGROUP_STORAGE_SHARED)
- ptr = &READ_ONCE(storage->buf)->data[0];
- else
- ptr = this_cpu_ptr(storage->percpu_buf);
-
- return (unsigned long)ptr;
-}
-
-const struct bpf_func_proto bpf_get_local_storage_proto = {
- .func = bpf_get_local_storage,
- .gpl_only = false,
- .ret_type = RET_PTR_TO_MAP_VALUE,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_ANYTHING,
-};
-#endif
+#endif /* CONFIG_CGROUPS */
#define BPF_STRTOX_BASE_MASK 0x1F
@@ -589,7 +556,6 @@ const struct bpf_func_proto bpf_strtoul_proto = {
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_LONG,
};
-#endif
BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
{
@@ -1647,12 +1613,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ringbuf_submit_dynptr_proto;
case BPF_FUNC_ringbuf_discard_dynptr:
return &bpf_ringbuf_discard_dynptr_proto;
- case BPF_FUNC_for_each_map_elem:
- return &bpf_for_each_map_elem_proto;
- case BPF_FUNC_loop:
- return &bpf_loop_proto;
case BPF_FUNC_strncmp:
return &bpf_strncmp_proto;
+ case BPF_FUNC_strtol:
+ return &bpf_strtol_proto;
+ case BPF_FUNC_strtoul:
+ return &bpf_strtoul_proto;
case BPF_FUNC_dynptr_from_mem:
return &bpf_dynptr_from_mem_proto;
case BPF_FUNC_dynptr_read:
@@ -1689,6 +1655,10 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_timer_cancel_proto;
case BPF_FUNC_kptr_xchg:
return &bpf_kptr_xchg_proto;
+ case BPF_FUNC_for_each_map_elem:
+ return &bpf_for_each_map_elem_proto;
+ case BPF_FUNC_loop:
+ return &bpf_loop_proto;
default:
break;
}
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
new file mode 100644
index 000000000000..5cc952da7d41
--- /dev/null
+++ b/kernel/bpf/memalloc.c
@@ -0,0 +1,634 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+#include <linux/mm.h>
+#include <linux/llist.h>
+#include <linux/bpf.h>
+#include <linux/irq_work.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/memcontrol.h>
+#include <asm/local.h>
+
+/* Any context (including NMI) BPF specific memory allocator.
+ *
+ * Tracing BPF programs can attach to kprobe and fentry. Hence they
+ * run in unknown context where calling plain kmalloc() might not be safe.
+ *
+ * Front-end kmalloc() with per-cpu per-bucket cache of free elements.
+ * Refill this cache asynchronously from irq_work.
+ *
+ * CPU_0 buckets
+ * 16 32 64 96 128 196 256 512 1024 2048 4096
+ * ...
+ * CPU_N buckets
+ * 16 32 64 96 128 196 256 512 1024 2048 4096
+ *
+ * The buckets are prefilled at the start.
+ * BPF programs always run with migration disabled.
+ * It's safe to allocate from cache of the current cpu with irqs disabled.
+ * Free-ing is always done into bucket of the current cpu as well.
+ * irq_work trims extra free elements from buckets with kfree
+ * and refills them with kmalloc, so global kmalloc logic takes care
+ * of freeing objects allocated by one cpu and freed on another.
+ *
+ * Every allocated objected is padded with extra 8 bytes that contains
+ * struct llist_node.
+ */
+#define LLIST_NODE_SZ sizeof(struct llist_node)
+
+/* similar to kmalloc, but sizeof == 8 bucket is gone */
+static u8 size_index[24] __ro_after_init = {
+ 3, /* 8 */
+ 3, /* 16 */
+ 4, /* 24 */
+ 4, /* 32 */
+ 5, /* 40 */
+ 5, /* 48 */
+ 5, /* 56 */
+ 5, /* 64 */
+ 1, /* 72 */
+ 1, /* 80 */
+ 1, /* 88 */
+ 1, /* 96 */
+ 6, /* 104 */
+ 6, /* 112 */
+ 6, /* 120 */
+ 6, /* 128 */
+ 2, /* 136 */
+ 2, /* 144 */
+ 2, /* 152 */
+ 2, /* 160 */
+ 2, /* 168 */
+ 2, /* 176 */
+ 2, /* 184 */
+ 2 /* 192 */
+};
+
+static int bpf_mem_cache_idx(size_t size)
+{
+ if (!size || size > 4096)
+ return -1;
+
+ if (size <= 192)
+ return size_index[(size - 1) / 8] - 1;
+
+ return fls(size - 1) - 1;
+}
+
+#define NUM_CACHES 11
+
+struct bpf_mem_cache {
+ /* per-cpu list of free objects of size 'unit_size'.
+ * All accesses are done with interrupts disabled and 'active' counter
+ * protection with __llist_add() and __llist_del_first().
+ */
+ struct llist_head free_llist;
+ local_t active;
+
+ /* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
+ * are sequenced by per-cpu 'active' counter. But unit_free() cannot
+ * fail. When 'active' is busy the unit_free() will add an object to
+ * free_llist_extra.
+ */
+ struct llist_head free_llist_extra;
+
+ struct irq_work refill_work;
+ struct obj_cgroup *objcg;
+ int unit_size;
+ /* count of objects in free_llist */
+ int free_cnt;
+ int low_watermark, high_watermark, batch;
+ int percpu_size;
+
+ struct rcu_head rcu;
+ struct llist_head free_by_rcu;
+ struct llist_head waiting_for_gp;
+ atomic_t call_rcu_in_progress;
+};
+
+struct bpf_mem_caches {
+ struct bpf_mem_cache cache[NUM_CACHES];
+};
+
+static struct llist_node notrace *__llist_del_first(struct llist_head *head)
+{
+ struct llist_node *entry, *next;
+
+ entry = head->first;
+ if (!entry)
+ return NULL;
+ next = entry->next;
+ head->first = next;
+ return entry;
+}
+
+static void *__alloc(struct bpf_mem_cache *c, int node)
+{
+ /* Allocate, but don't deplete atomic reserves that typical
+ * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
+ * will allocate from the current numa node which is what we
+ * want here.
+ */
+ gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT;
+
+ if (c->percpu_size) {
+ void **obj = kmalloc_node(c->percpu_size, flags, node);
+ void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
+
+ if (!obj || !pptr) {
+ free_percpu(pptr);
+ kfree(obj);
+ return NULL;
+ }
+ obj[1] = pptr;
+ return obj;
+ }
+
+ return kmalloc_node(c->unit_size, flags, node);
+}
+
+static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
+{
+#ifdef CONFIG_MEMCG_KMEM
+ if (c->objcg)
+ return get_mem_cgroup_from_objcg(c->objcg);
+#endif
+
+#ifdef CONFIG_MEMCG
+ return root_mem_cgroup;
+#else
+ return NULL;
+#endif
+}
+
+/* Mostly runs from irq_work except __init phase. */
+static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
+{
+ struct mem_cgroup *memcg = NULL, *old_memcg;
+ unsigned long flags;
+ void *obj;
+ int i;
+
+ memcg = get_memcg(c);
+ old_memcg = set_active_memcg(memcg);
+ for (i = 0; i < cnt; i++) {
+ obj = __alloc(c, node);
+ if (!obj)
+ break;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ /* In RT irq_work runs in per-cpu kthread, so disable
+ * interrupts to avoid preemption and interrupts and
+ * reduce the chance of bpf prog executing on this cpu
+ * when active counter is busy.
+ */
+ local_irq_save(flags);
+ /* alloc_bulk runs from irq_work which will not preempt a bpf
+ * program that does unit_alloc/unit_free since IRQs are
+ * disabled there. There is no race to increment 'active'
+ * counter. It protects free_llist from corruption in case NMI
+ * bpf prog preempted this loop.
+ */
+ WARN_ON_ONCE(local_inc_return(&c->active) != 1);
+ __llist_add(obj, &c->free_llist);
+ c->free_cnt++;
+ local_dec(&c->active);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_restore(flags);
+ }
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+}
+
+static void free_one(struct bpf_mem_cache *c, void *obj)
+{
+ if (c->percpu_size) {
+ free_percpu(((void **)obj)[1]);
+ kfree(obj);
+ return;
+ }
+
+ kfree(obj);
+}
+
+static void __free_rcu(struct rcu_head *head)
+{
+ struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
+ struct llist_node *llnode = llist_del_all(&c->waiting_for_gp);
+ struct llist_node *pos, *t;
+
+ llist_for_each_safe(pos, t, llnode)
+ free_one(c, pos);
+ atomic_set(&c->call_rcu_in_progress, 0);
+}
+
+static void __free_rcu_tasks_trace(struct rcu_head *head)
+{
+ struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
+
+ call_rcu(&c->rcu, __free_rcu);
+}
+
+static void enque_to_free(struct bpf_mem_cache *c, void *obj)
+{
+ struct llist_node *llnode = obj;
+
+ /* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
+ * Nothing races to add to free_by_rcu list.
+ */
+ __llist_add(llnode, &c->free_by_rcu);
+}
+
+static void do_call_rcu(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+
+ if (atomic_xchg(&c->call_rcu_in_progress, 1))
+ return;
+
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
+ llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
+ /* There is no concurrent __llist_add(waiting_for_gp) access.
+ * It doesn't race with llist_del_all either.
+ * But there could be two concurrent llist_del_all(waiting_for_gp):
+ * from __free_rcu() and from drain_mem_cache().
+ */
+ __llist_add(llnode, &c->waiting_for_gp);
+ /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
+ * Then use call_rcu() to wait for normal progs to finish
+ * and finally do free_one() on each element.
+ */
+ call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace);
+}
+
+static void free_bulk(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+ unsigned long flags;
+ int cnt;
+
+ do {
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_save(flags);
+ WARN_ON_ONCE(local_inc_return(&c->active) != 1);
+ llnode = __llist_del_first(&c->free_llist);
+ if (llnode)
+ cnt = --c->free_cnt;
+ else
+ cnt = 0;
+ local_dec(&c->active);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_restore(flags);
+ enque_to_free(c, llnode);
+ } while (cnt > (c->high_watermark + c->low_watermark) / 2);
+
+ /* and drain free_llist_extra */
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
+ enque_to_free(c, llnode);
+ do_call_rcu(c);
+}
+
+static void bpf_mem_refill(struct irq_work *work)
+{
+ struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
+ int cnt;
+
+ /* Racy access to free_cnt. It doesn't need to be 100% accurate */
+ cnt = c->free_cnt;
+ if (cnt < c->low_watermark)
+ /* irq_work runs on this cpu and kmalloc will allocate
+ * from the current numa node which is what we want here.
+ */
+ alloc_bulk(c, c->batch, NUMA_NO_NODE);
+ else if (cnt > c->high_watermark)
+ free_bulk(c);
+}
+
+static void notrace irq_work_raise(struct bpf_mem_cache *c)
+{
+ irq_work_queue(&c->refill_work);
+}
+
+/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket
+ * the freelist cache will be elem_size * 64 (or less) on each cpu.
+ *
+ * For bpf programs that don't have statically known allocation sizes and
+ * assuming (low_mark + high_mark) / 2 as an average number of elements per
+ * bucket and all buckets are used the total amount of memory in freelists
+ * on each cpu will be:
+ * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
+ * == ~ 116 Kbyte using below heuristic.
+ * Initialized, but unused bpf allocator (not bpf map specific one) will
+ * consume ~ 11 Kbyte per cpu.
+ * Typical case will be between 11K and 116K closer to 11K.
+ * bpf progs can and should share bpf_mem_cache when possible.
+ */
+
+static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
+{
+ init_irq_work(&c->refill_work, bpf_mem_refill);
+ if (c->unit_size <= 256) {
+ c->low_watermark = 32;
+ c->high_watermark = 96;
+ } else {
+ /* When page_size == 4k, order-0 cache will have low_mark == 2
+ * and high_mark == 6 with batch alloc of 3 individual pages at
+ * a time.
+ * 8k allocs and above low == 1, high == 3, batch == 1.
+ */
+ c->low_watermark = max(32 * 256 / c->unit_size, 1);
+ c->high_watermark = max(96 * 256 / c->unit_size, 3);
+ }
+ c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
+
+ /* To avoid consuming memory assume that 1st run of bpf
+ * prog won't be doing more than 4 map_update_elem from
+ * irq disabled region
+ */
+ alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu));
+}
+
+/* When size != 0 bpf_mem_cache for each cpu.
+ * This is typical bpf hash map use case when all elements have equal size.
+ *
+ * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
+ * kmalloc/kfree. Max allocation size is 4096 in this case.
+ * This is bpf_dynptr and bpf_kptr use case.
+ */
+int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
+{
+ static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
+ struct bpf_mem_caches *cc, __percpu *pcc;
+ struct bpf_mem_cache *c, __percpu *pc;
+ struct obj_cgroup *objcg = NULL;
+ int cpu, i, unit_size, percpu_size = 0;
+
+ if (size) {
+ pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
+ if (!pc)
+ return -ENOMEM;
+
+ if (percpu)
+ /* room for llist_node and per-cpu pointer */
+ percpu_size = LLIST_NODE_SZ + sizeof(void *);
+ else
+ size += LLIST_NODE_SZ; /* room for llist_node */
+ unit_size = size;
+
+#ifdef CONFIG_MEMCG_KMEM
+ objcg = get_obj_cgroup_from_current();
+#endif
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(pc, cpu);
+ c->unit_size = unit_size;
+ c->objcg = objcg;
+ c->percpu_size = percpu_size;
+ prefill_mem_cache(c, cpu);
+ }
+ ma->cache = pc;
+ return 0;
+ }
+
+ /* size == 0 && percpu is an invalid combination */
+ if (WARN_ON_ONCE(percpu))
+ return -EINVAL;
+
+ pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
+ if (!pcc)
+ return -ENOMEM;
+#ifdef CONFIG_MEMCG_KMEM
+ objcg = get_obj_cgroup_from_current();
+#endif
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(pcc, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ c->unit_size = sizes[i];
+ c->objcg = objcg;
+ prefill_mem_cache(c, cpu);
+ }
+ }
+ ma->caches = pcc;
+ return 0;
+}
+
+static void drain_mem_cache(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+
+ /* No progs are using this bpf_mem_cache, but htab_map_free() called
+ * bpf_mem_cache_free() for all remaining elements and they can be in
+ * free_by_rcu or in waiting_for_gp lists, so drain those lists now.
+ */
+ llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
+ free_one(c, llnode);
+ llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp))
+ free_one(c, llnode);
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist))
+ free_one(c, llnode);
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
+ free_one(c, llnode);
+}
+
+static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
+{
+ free_percpu(ma->cache);
+ free_percpu(ma->caches);
+ ma->cache = NULL;
+ ma->caches = NULL;
+}
+
+static void free_mem_alloc(struct bpf_mem_alloc *ma)
+{
+ /* waiting_for_gp lists was drained, but __free_rcu might
+ * still execute. Wait for it now before we freeing percpu caches.
+ */
+ rcu_barrier_tasks_trace();
+ rcu_barrier();
+ free_mem_alloc_no_barrier(ma);
+}
+
+static void free_mem_alloc_deferred(struct work_struct *work)
+{
+ struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work);
+
+ free_mem_alloc(ma);
+ kfree(ma);
+}
+
+static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
+{
+ struct bpf_mem_alloc *copy;
+
+ if (!rcu_in_progress) {
+ /* Fast path. No callbacks are pending, hence no need to do
+ * rcu_barrier-s.
+ */
+ free_mem_alloc_no_barrier(ma);
+ return;
+ }
+
+ copy = kmalloc(sizeof(*ma), GFP_KERNEL);
+ if (!copy) {
+ /* Slow path with inline barrier-s */
+ free_mem_alloc(ma);
+ return;
+ }
+
+ /* Defer barriers into worker to let the rest of map memory to be freed */
+ copy->cache = ma->cache;
+ ma->cache = NULL;
+ copy->caches = ma->caches;
+ ma->caches = NULL;
+ INIT_WORK(&copy->work, free_mem_alloc_deferred);
+ queue_work(system_unbound_wq, &copy->work);
+}
+
+void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
+{
+ struct bpf_mem_caches *cc;
+ struct bpf_mem_cache *c;
+ int cpu, i, rcu_in_progress;
+
+ if (ma->cache) {
+ rcu_in_progress = 0;
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(ma->cache, cpu);
+ drain_mem_cache(c);
+ rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
+ }
+ /* objcg is the same across cpus */
+ if (c->objcg)
+ obj_cgroup_put(c->objcg);
+ destroy_mem_alloc(ma, rcu_in_progress);
+ }
+ if (ma->caches) {
+ rcu_in_progress = 0;
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(ma->caches, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ drain_mem_cache(c);
+ rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
+ }
+ }
+ if (c->objcg)
+ obj_cgroup_put(c->objcg);
+ destroy_mem_alloc(ma, rcu_in_progress);
+ }
+}
+
+/* notrace is necessary here and in other functions to make sure
+ * bpf programs cannot attach to them and cause llist corruptions.
+ */
+static void notrace *unit_alloc(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode = NULL;
+ unsigned long flags;
+ int cnt = 0;
+
+ /* Disable irqs to prevent the following race for majority of prog types:
+ * prog_A
+ * bpf_mem_alloc
+ * preemption or irq -> prog_B
+ * bpf_mem_alloc
+ *
+ * but prog_B could be a perf_event NMI prog.
+ * Use per-cpu 'active' counter to order free_list access between
+ * unit_alloc/unit_free/bpf_mem_refill.
+ */
+ local_irq_save(flags);
+ if (local_inc_return(&c->active) == 1) {
+ llnode = __llist_del_first(&c->free_llist);
+ if (llnode)
+ cnt = --c->free_cnt;
+ }
+ local_dec(&c->active);
+ local_irq_restore(flags);
+
+ WARN_ON(cnt < 0);
+
+ if (cnt < c->low_watermark)
+ irq_work_raise(c);
+ return llnode;
+}
+
+/* Though 'ptr' object could have been allocated on a different cpu
+ * add it to the free_llist of the current cpu.
+ * Let kfree() logic deal with it when it's later called from irq_work.
+ */
+static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
+{
+ struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+ unsigned long flags;
+ int cnt = 0;
+
+ BUILD_BUG_ON(LLIST_NODE_SZ > 8);
+
+ local_irq_save(flags);
+ if (local_inc_return(&c->active) == 1) {
+ __llist_add(llnode, &c->free_llist);
+ cnt = ++c->free_cnt;
+ } else {
+ /* unit_free() cannot fail. Therefore add an object to atomic
+ * llist. free_bulk() will drain it. Though free_llist_extra is
+ * a per-cpu list we have to use atomic llist_add here, since
+ * it also can be interrupted by bpf nmi prog that does another
+ * unit_free() into the same free_llist_extra.
+ */
+ llist_add(llnode, &c->free_llist_extra);
+ }
+ local_dec(&c->active);
+ local_irq_restore(flags);
+
+ if (cnt > c->high_watermark)
+ /* free few objects from current cpu into global kmalloc pool */
+ irq_work_raise(c);
+}
+
+/* Called from BPF program or from sys_bpf syscall.
+ * In both cases migration is disabled.
+ */
+void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
+{
+ int idx;
+ void *ret;
+
+ if (!size)
+ return ZERO_SIZE_PTR;
+
+ idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ);
+ if (idx < 0)
+ return NULL;
+
+ ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
+
+void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
+{
+ int idx;
+
+ if (!ptr)
+ return;
+
+ idx = bpf_mem_cache_idx(__ksize(ptr - LLIST_NODE_SZ));
+ if (idx < 0)
+ return;
+
+ unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
+}
+
+void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
+{
+ void *ret;
+
+ ret = unit_alloc(this_cpu_ptr(ma->cache));
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
+
+void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
+{
+ if (!ptr)
+ return;
+
+ unit_free(this_cpu_ptr(ma->cache), ptr);
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 27760627370d..4fb08c43420d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -638,7 +638,10 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
bpf_map_free_id(map, do_idr_lock);
btf_put(map->btf);
INIT_WORK(&map->work, bpf_map_free_deferred);
- schedule_work(&map->work);
+ /* Avoid spawning kworkers, since they all might contend
+ * for the same mutex like slab_mutex.
+ */
+ queue_work(system_unbound_wq, &map->work);
}
}
@@ -1437,9 +1440,9 @@ err_put:
#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
-static int map_delete_elem(union bpf_attr *attr)
+static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
{
- void __user *ukey = u64_to_user_ptr(attr->key);
+ bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
int ufd = attr->map_fd;
struct bpf_map *map;
struct fd f;
@@ -1459,7 +1462,7 @@ static int map_delete_elem(union bpf_attr *attr)
goto err_put;
}
- key = __bpf_copy_key(ukey, map->key_size);
+ key = ___bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
@@ -4941,7 +4944,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
err = map_update_elem(&attr, uattr);
break;
case BPF_MAP_DELETE_ELEM:
- err = map_delete_elem(&attr);
+ err = map_delete_elem(&attr, uattr);
break;
case BPF_MAP_GET_NEXT_KEY:
err = map_get_next_key(&attr);
@@ -5073,8 +5076,10 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
{
switch (cmd) {
case BPF_MAP_CREATE:
+ case BPF_MAP_DELETE_ELEM:
case BPF_MAP_UPDATE_ELEM:
case BPF_MAP_FREEZE:
+ case BPF_MAP_GET_FD_BY_ID:
case BPF_PROG_LOAD:
case BPF_BTF_LOAD:
case BPF_LINK_CREATE:
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index ff87e38af8a7..ad76940b02cc 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -895,7 +895,7 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *ru
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
- if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
+ if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
inc_misses_counter(prog);
return 0;
}
@@ -930,7 +930,7 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
update_prog_stats(prog, start);
- __this_cpu_dec(*(prog->active));
+ this_cpu_dec(*(prog->active));
migrate_enable();
rcu_read_unlock();
}
@@ -966,7 +966,7 @@ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_r
migrate_disable();
might_fault();
- if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
+ if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
inc_misses_counter(prog);
return 0;
}
@@ -982,7 +982,7 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
update_prog_stats(prog, start);
- __this_cpu_dec(*(prog->active));
+ this_cpu_dec(*(prog->active));
migrate_enable();
rcu_read_unlock_trace();
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8d07493a477c..003f7ba19558 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1092,6 +1092,7 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
id = ++env->id_gen;
state->refs[new_ofs].id = id;
state->refs[new_ofs].insn_idx = insn_idx;
+ state->refs[new_ofs].callback_ref = state->in_callback_fn ? state->frameno : 0;
return id;
}
@@ -1104,6 +1105,9 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id)
last_idx = state->acquired_refs - 1;
for (i = 0; i < state->acquired_refs; i++) {
if (state->refs[i].id == ptr_id) {
+ /* Cannot release caller references in callbacks */
+ if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
+ return -EINVAL;
if (last_idx && i != last_idx)
memcpy(&state->refs[i], &state->refs[last_idx],
sizeof(*state->refs));
@@ -6918,10 +6922,17 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
caller->regs[BPF_REG_0] = *r0;
}
- /* Transfer references to the caller */
- err = copy_reference_state(caller, callee);
- if (err)
- return err;
+ /* callback_fn frame should have released its own additions to parent's
+ * reference state at this point, or check_reference_leak would
+ * complain, hence it must be the same as the caller. There is no need
+ * to copy it back.
+ */
+ if (!callee->in_callback_fn) {
+ /* Transfer references to the caller */
+ err = copy_reference_state(caller, callee);
+ if (err)
+ return err;
+ }
*insn_idx = callee->callsite + 1;
if (env->log.level & BPF_LOG_LEVEL) {
@@ -7043,13 +7054,20 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
static int check_reference_leak(struct bpf_verifier_env *env)
{
struct bpf_func_state *state = cur_func(env);
+ bool refs_lingering = false;
int i;
+ if (state->frameno && !state->in_callback_fn)
+ return 0;
+
for (i = 0; i < state->acquired_refs; i++) {
+ if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
+ continue;
verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
state->refs[i].id, state->refs[i].insn_idx);
+ refs_lingering = true;
}
- return state->acquired_refs ? -EINVAL : 0;
+ return refs_lingering ? -EINVAL : 0;
}
static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
@@ -12338,6 +12356,16 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
+ /* We must do check_reference_leak here before
+ * prepare_func_exit to handle the case when
+ * state->curframe > 0, it may be a callback
+ * function, for which reference_state must
+ * match caller reference state when it exits.
+ */
+ err = check_reference_leak(env);
+ if (err)
+ return err;
+
if (state->curframe) {
/* exit from nested function */
err = prepare_func_exit(env, &env->insn_idx);
@@ -12347,10 +12375,6 @@ static int do_check(struct bpf_verifier_env *env)
continue;
}
- err = check_reference_leak(env);
- if (err)
- return err;
-
err = check_return_code(env);
if (err)
return err;
@@ -12563,14 +12587,6 @@ err_put:
return err;
}
-static int check_map_prealloc(struct bpf_map *map)
-{
- return (map->map_type != BPF_MAP_TYPE_HASH &&
- map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
- map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) ||
- !(map->map_flags & BPF_F_NO_PREALLOC);
-}
-
static bool is_tracing_prog_type(enum bpf_prog_type type)
{
switch (type) {
@@ -12585,50 +12601,12 @@ static bool is_tracing_prog_type(enum bpf_prog_type type)
}
}
-static bool is_preallocated_map(struct bpf_map *map)
-{
- if (!check_map_prealloc(map))
- return false;
- if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta))
- return false;
- return true;
-}
-
static int check_map_prog_compatibility(struct bpf_verifier_env *env,
struct bpf_map *map,
struct bpf_prog *prog)
{
enum bpf_prog_type prog_type = resolve_prog_type(prog);
- /*
- * Validate that trace type programs use preallocated hash maps.
- *
- * For programs attached to PERF events this is mandatory as the
- * perf NMI can hit any arbitrary code sequence.
- *
- * All other trace types using preallocated hash maps are unsafe as
- * well because tracepoint or kprobes can be inside locked regions
- * of the memory allocator or at a place where a recursion into the
- * memory allocator would see inconsistent state.
- *
- * On RT enabled kernels run-time allocation of all trace type
- * programs is strictly prohibited due to lock type constraints. On
- * !RT kernels it is allowed for backwards compatibility reasons for
- * now, but warnings are emitted so developers are made aware of
- * the unsafety and can fix their programs before this is enforced.
- */
- if (is_tracing_prog_type(prog_type) && !is_preallocated_map(map)) {
- if (prog_type == BPF_PROG_TYPE_PERF_EVENT) {
- verbose(env, "perf_event programs can only use preallocated hash map\n");
- return -EINVAL;
- }
- if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
- verbose(env, "trace type programs can only use preallocated hash map\n");
- return -EINVAL;
- }
- WARN_ONCE(1, "trace type BPF program uses run-time allocation\n");
- verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n");
- }
if (map_value_has_spin_lock(map)) {
if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
@@ -12675,12 +12653,6 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_LRU_PERCPU_HASH:
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH_OF_MAPS:
- if (!is_preallocated_map(map)) {
- verbose(env,
- "Sleepable programs can only use preallocated maps\n");
- return -EINVAL;
- }
- break;
case BPF_MAP_TYPE_RINGBUF:
case BPF_MAP_TYPE_INODE_STORAGE:
case BPF_MAP_TYPE_SK_STORAGE:
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index feb59380c896..793ecff29038 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -3,6 +3,10 @@
#include <linux/sched/cputime.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+
static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
@@ -141,6 +145,31 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
return pos;
}
+/*
+ * A hook for bpf stat collectors to attach to and flush their stats.
+ * Together with providing bpf kfuncs for cgroup_rstat_updated() and
+ * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
+ * collect cgroup stats can integrate with rstat for efficient flushing.
+ *
+ * A static noinline declaration here could cause the compiler to optimize away
+ * the function. A global noinline declaration will keep the definition, but may
+ * optimize away the callsite. Therefore, __weak is needed to ensure that the
+ * call is still emitted, by telling the compiler that we don't know what the
+ * function might eventually be.
+ *
+ * __diag_* below are needed to dismiss the missing prototype warning.
+ */
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "kfuncs which will be used in BPF programs");
+
+__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
+ struct cgroup *parent, int cpu)
+{
+}
+
+__diag_pop();
+
/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
@@ -168,6 +197,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
struct cgroup_subsys_state *css;
cgroup_base_stat_flush(pos, cpu);
+ bpf_rstat_flush(pos, cgroup_parent(pos), cpu);
rcu_read_lock();
list_for_each_entry_rcu(css, &pos->rstat_css_list,
@@ -501,3 +531,21 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
#endif
}
+
+/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
+BTF_SET8_START(bpf_rstat_kfunc_ids)
+BTF_ID_FLAGS(func, cgroup_rstat_updated)
+BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
+BTF_SET8_END(bpf_rstat_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_rstat_kfunc_ids,
+};
+
+static int __init bpf_rstat_kfunc_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+ &bpf_rstat_kfunc_set);
+}
+late_initcall(bpf_rstat_kfunc_init);
diff --git a/net/core/filter.c b/net/core/filter.c
index c191db80ce93..e872f45399b0 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3010,7 +3010,7 @@ BPF_CALL_0(bpf_get_cgroup_classid_curr)
return __task_get_classid(current);
}
-static const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = {
+const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = {
.func = bpf_get_cgroup_classid_curr,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -4489,7 +4489,8 @@ BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key
void *to_orig = to;
int err;
- if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) {
+ if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 |
+ BPF_F_TUNINFO_FLAGS)))) {
err = -EINVAL;
goto err_clear;
}
@@ -4521,7 +4522,10 @@ set_compat:
to->tunnel_id = be64_to_cpu(info->key.tun_id);
to->tunnel_tos = info->key.tos;
to->tunnel_ttl = info->key.ttl;
- to->tunnel_ext = 0;
+ if (flags & BPF_F_TUNINFO_FLAGS)
+ to->tunnel_flags = info->key.tun_flags;
+ else
+ to->tunnel_ext = 0;
if (flags & BPF_F_TUNINFO_IPV6) {
memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
@@ -5014,359 +5018,259 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-static int __bpf_setsockopt(struct sock *sk, int level, int optname,
- char *optval, int optlen)
-{
- char devname[IFNAMSIZ];
- int val, valbool;
- struct net *net;
- int ifindex;
- int ret = 0;
-
- if (!sk_fullsock(sk))
+static int sol_socket_sockopt(struct sock *sk, int optname,
+ char *optval, int *optlen,
+ bool getopt)
+{
+ switch (optname) {
+ case SO_REUSEADDR:
+ case SO_SNDBUF:
+ case SO_RCVBUF:
+ case SO_KEEPALIVE:
+ case SO_PRIORITY:
+ case SO_REUSEPORT:
+ case SO_RCVLOWAT:
+ case SO_MARK:
+ case SO_MAX_PACING_RATE:
+ case SO_BINDTOIFINDEX:
+ case SO_TXREHASH:
+ if (*optlen != sizeof(int))
+ return -EINVAL;
+ break;
+ case SO_BINDTODEVICE:
+ break;
+ default:
return -EINVAL;
+ }
- if (level == SOL_SOCKET) {
- if (optlen != sizeof(int) && optname != SO_BINDTODEVICE)
+ if (getopt) {
+ if (optname == SO_BINDTODEVICE)
return -EINVAL;
- val = *((int *)optval);
- valbool = val ? 1 : 0;
-
- /* Only some socketops are supported */
- switch (optname) {
- case SO_RCVBUF:
- val = min_t(u32, val, READ_ONCE(sysctl_rmem_max));
- val = min_t(int, val, INT_MAX / 2);
- sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
- WRITE_ONCE(sk->sk_rcvbuf,
- max_t(int, val * 2, SOCK_MIN_RCVBUF));
- break;
- case SO_SNDBUF:
- val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
- val = min_t(int, val, INT_MAX / 2);
- sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
- WRITE_ONCE(sk->sk_sndbuf,
- max_t(int, val * 2, SOCK_MIN_SNDBUF));
- break;
- case SO_MAX_PACING_RATE: /* 32bit version */
- if (val != ~0U)
- cmpxchg(&sk->sk_pacing_status,
- SK_PACING_NONE,
- SK_PACING_NEEDED);
- sk->sk_max_pacing_rate = (val == ~0U) ?
- ~0UL : (unsigned int)val;
- sk->sk_pacing_rate = min(sk->sk_pacing_rate,
- sk->sk_max_pacing_rate);
- break;
- case SO_PRIORITY:
- sk->sk_priority = val;
- break;
- case SO_RCVLOWAT:
- if (val < 0)
- val = INT_MAX;
- if (sk->sk_socket && sk->sk_socket->ops->set_rcvlowat)
- ret = sk->sk_socket->ops->set_rcvlowat(sk, val);
- else
- WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
- break;
- case SO_MARK:
- if (sk->sk_mark != val) {
- sk->sk_mark = val;
- sk_dst_reset(sk);
- }
- break;
- case SO_BINDTODEVICE:
- optlen = min_t(long, optlen, IFNAMSIZ - 1);
- strncpy(devname, optval, optlen);
- devname[optlen] = 0;
+ return sk_getsockopt(sk, SOL_SOCKET, optname,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
+ }
- ifindex = 0;
- if (devname[0] != '\0') {
- struct net_device *dev;
+ return sk_setsockopt(sk, SOL_SOCKET, optname,
+ KERNEL_SOCKPTR(optval), *optlen);
+}
- ret = -ENODEV;
+static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
+ char *optval, int optlen)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long timeout;
+ int val;
- net = sock_net(sk);
- dev = dev_get_by_name(net, devname);
- if (!dev)
- break;
- ifindex = dev->ifindex;
- dev_put(dev);
- }
- fallthrough;
- case SO_BINDTOIFINDEX:
- if (optname == SO_BINDTOIFINDEX)
- ifindex = val;
- ret = sock_bindtoindex(sk, ifindex, false);
- break;
- case SO_KEEPALIVE:
- if (sk->sk_prot->keepalive)
- sk->sk_prot->keepalive(sk, valbool);
- sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
- break;
- case SO_REUSEPORT:
- sk->sk_reuseport = valbool;
- break;
- case SO_TXREHASH:
- if (val < -1 || val > 1) {
- ret = -EINVAL;
- break;
- }
- sk->sk_txrehash = (u8)val;
- break;
- default:
- ret = -EINVAL;
- }
-#ifdef CONFIG_INET
- } else if (level == SOL_IP) {
- if (optlen != sizeof(int) || sk->sk_family != AF_INET)
- return -EINVAL;
+ if (optlen != sizeof(int))
+ return -EINVAL;
- val = *((int *)optval);
- /* Only some options are supported */
- switch (optname) {
- case IP_TOS:
- if (val < -1 || val > 0xff) {
- ret = -EINVAL;
- } else {
- struct inet_sock *inet = inet_sk(sk);
+ val = *(int *)optval;
- if (val == -1)
- val = 0;
- inet->tos = val;
- }
- break;
- default:
- ret = -EINVAL;
- }
-#if IS_ENABLED(CONFIG_IPV6)
- } else if (level == SOL_IPV6) {
- if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
+ /* Only some options are supported */
+ switch (optname) {
+ case TCP_BPF_IW:
+ if (val <= 0 || tp->data_segs_out > tp->syn_data)
+ return -EINVAL;
+ tcp_snd_cwnd_set(tp, val);
+ break;
+ case TCP_BPF_SNDCWND_CLAMP:
+ if (val <= 0)
+ return -EINVAL;
+ tp->snd_cwnd_clamp = val;
+ tp->snd_ssthresh = val;
+ break;
+ case TCP_BPF_DELACK_MAX:
+ timeout = usecs_to_jiffies(val);
+ if (timeout > TCP_DELACK_MAX ||
+ timeout < TCP_TIMEOUT_MIN)
+ return -EINVAL;
+ inet_csk(sk)->icsk_delack_max = timeout;
+ break;
+ case TCP_BPF_RTO_MIN:
+ timeout = usecs_to_jiffies(val);
+ if (timeout > TCP_RTO_MIN ||
+ timeout < TCP_TIMEOUT_MIN)
return -EINVAL;
+ inet_csk(sk)->icsk_rto_min = timeout;
+ break;
+ default:
+ return -EINVAL;
+ }
- val = *((int *)optval);
- /* Only some options are supported */
- switch (optname) {
- case IPV6_TCLASS:
- if (val < -1 || val > 0xff) {
- ret = -EINVAL;
- } else {
- struct ipv6_pinfo *np = inet6_sk(sk);
+ return 0;
+}
- if (val == -1)
- val = 0;
- np->tclass = val;
- }
- break;
- default:
- ret = -EINVAL;
- }
-#endif
- } else if (level == SOL_TCP &&
- sk->sk_prot->setsockopt == tcp_setsockopt) {
- if (optname == TCP_CONGESTION) {
- char name[TCP_CA_NAME_MAX];
+static int sol_tcp_sockopt(struct sock *sk, int optname,
+ char *optval, int *optlen,
+ bool getopt)
+{
+ if (sk->sk_prot->setsockopt != tcp_setsockopt)
+ return -EINVAL;
- strncpy(name, optval, min_t(long, optlen,
- TCP_CA_NAME_MAX-1));
- name[TCP_CA_NAME_MAX-1] = 0;
- ret = tcp_set_congestion_control(sk, name, false, true);
- } else {
- struct inet_connection_sock *icsk = inet_csk(sk);
+ switch (optname) {
+ case TCP_NODELAY:
+ case TCP_MAXSEG:
+ case TCP_KEEPIDLE:
+ case TCP_KEEPINTVL:
+ case TCP_KEEPCNT:
+ case TCP_SYNCNT:
+ case TCP_WINDOW_CLAMP:
+ case TCP_THIN_LINEAR_TIMEOUTS:
+ case TCP_USER_TIMEOUT:
+ case TCP_NOTSENT_LOWAT:
+ case TCP_SAVE_SYN:
+ if (*optlen != sizeof(int))
+ return -EINVAL;
+ break;
+ case TCP_CONGESTION:
+ if (*optlen < 2)
+ return -EINVAL;
+ break;
+ case TCP_SAVED_SYN:
+ if (*optlen < 1)
+ return -EINVAL;
+ break;
+ default:
+ if (getopt)
+ return -EINVAL;
+ return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen);
+ }
+
+ if (getopt) {
+ if (optname == TCP_SAVED_SYN) {
struct tcp_sock *tp = tcp_sk(sk);
- unsigned long timeout;
- if (optlen != sizeof(int))
+ if (!tp->saved_syn ||
+ *optlen > tcp_saved_syn_len(tp->saved_syn))
return -EINVAL;
+ memcpy(optval, tp->saved_syn->data, *optlen);
+ /* It cannot free tp->saved_syn here because it
+ * does not know if the user space still needs it.
+ */
+ return 0;
+ }
- val = *((int *)optval);
- /* Only some options are supported */
- switch (optname) {
- case TCP_BPF_IW:
- if (val <= 0 || tp->data_segs_out > tp->syn_data)
- ret = -EINVAL;
- else
- tcp_snd_cwnd_set(tp, val);
- break;
- case TCP_BPF_SNDCWND_CLAMP:
- if (val <= 0) {
- ret = -EINVAL;
- } else {
- tp->snd_cwnd_clamp = val;
- tp->snd_ssthresh = val;
- }
- break;
- case TCP_BPF_DELACK_MAX:
- timeout = usecs_to_jiffies(val);
- if (timeout > TCP_DELACK_MAX ||
- timeout < TCP_TIMEOUT_MIN)
- return -EINVAL;
- inet_csk(sk)->icsk_delack_max = timeout;
- break;
- case TCP_BPF_RTO_MIN:
- timeout = usecs_to_jiffies(val);
- if (timeout > TCP_RTO_MIN ||
- timeout < TCP_TIMEOUT_MIN)
- return -EINVAL;
- inet_csk(sk)->icsk_rto_min = timeout;
- break;
- case TCP_SAVE_SYN:
- if (val < 0 || val > 1)
- ret = -EINVAL;
- else
- tp->save_syn = val;
- break;
- case TCP_KEEPIDLE:
- ret = tcp_sock_set_keepidle_locked(sk, val);
- break;
- case TCP_KEEPINTVL:
- if (val < 1 || val > MAX_TCP_KEEPINTVL)
- ret = -EINVAL;
- else
- tp->keepalive_intvl = val * HZ;
- break;
- case TCP_KEEPCNT:
- if (val < 1 || val > MAX_TCP_KEEPCNT)
- ret = -EINVAL;
- else
- tp->keepalive_probes = val;
- break;
- case TCP_SYNCNT:
- if (val < 1 || val > MAX_TCP_SYNCNT)
- ret = -EINVAL;
- else
- icsk->icsk_syn_retries = val;
- break;
- case TCP_USER_TIMEOUT:
- if (val < 0)
- ret = -EINVAL;
- else
- icsk->icsk_user_timeout = val;
- break;
- case TCP_NOTSENT_LOWAT:
- tp->notsent_lowat = val;
- sk->sk_write_space(sk);
- break;
- case TCP_WINDOW_CLAMP:
- ret = tcp_set_window_clamp(sk, val);
- break;
- default:
- ret = -EINVAL;
- }
+ if (optname == TCP_CONGESTION) {
+ if (!inet_csk(sk)->icsk_ca_ops)
+ return -EINVAL;
+ /* BPF expects NULL-terminated tcp-cc string */
+ optval[--(*optlen)] = '\0';
}
-#endif
- } else {
- ret = -EINVAL;
+
+ return do_tcp_getsockopt(sk, SOL_TCP, optname,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
}
- return ret;
+
+ return do_tcp_setsockopt(sk, SOL_TCP, optname,
+ KERNEL_SOCKPTR(optval), *optlen);
}
-static int _bpf_setsockopt(struct sock *sk, int level, int optname,
- char *optval, int optlen)
+static int sol_ip_sockopt(struct sock *sk, int optname,
+ char *optval, int *optlen,
+ bool getopt)
{
- if (sk_fullsock(sk))
- sock_owned_by_me(sk);
- return __bpf_setsockopt(sk, level, optname, optval, optlen);
+ if (sk->sk_family != AF_INET)
+ return -EINVAL;
+
+ switch (optname) {
+ case IP_TOS:
+ if (*optlen != sizeof(int))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (getopt)
+ return do_ip_getsockopt(sk, SOL_IP, optname,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
+
+ return do_ip_setsockopt(sk, SOL_IP, optname,
+ KERNEL_SOCKPTR(optval), *optlen);
}
-static int __bpf_getsockopt(struct sock *sk, int level, int optname,
- char *optval, int optlen)
+static int sol_ipv6_sockopt(struct sock *sk, int optname,
+ char *optval, int *optlen,
+ bool getopt)
{
- if (!sk_fullsock(sk))
- goto err_clear;
+ if (sk->sk_family != AF_INET6)
+ return -EINVAL;
- if (level == SOL_SOCKET) {
- if (optlen != sizeof(int))
- goto err_clear;
+ switch (optname) {
+ case IPV6_TCLASS:
+ case IPV6_AUTOFLOWLABEL:
+ if (*optlen != sizeof(int))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
- switch (optname) {
- case SO_RCVBUF:
- *((int *)optval) = sk->sk_rcvbuf;
- break;
- case SO_SNDBUF:
- *((int *)optval) = sk->sk_sndbuf;
- break;
- case SO_MARK:
- *((int *)optval) = sk->sk_mark;
- break;
- case SO_PRIORITY:
- *((int *)optval) = sk->sk_priority;
- break;
- case SO_BINDTOIFINDEX:
- *((int *)optval) = sk->sk_bound_dev_if;
- break;
- case SO_REUSEPORT:
- *((int *)optval) = sk->sk_reuseport;
- break;
- case SO_TXREHASH:
- *((int *)optval) = sk->sk_txrehash;
- break;
- default:
- goto err_clear;
- }
-#ifdef CONFIG_INET
- } else if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) {
- struct inet_connection_sock *icsk;
- struct tcp_sock *tp;
+ if (getopt)
+ return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
- switch (optname) {
- case TCP_CONGESTION:
- icsk = inet_csk(sk);
+ return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname,
+ KERNEL_SOCKPTR(optval), *optlen);
+}
- if (!icsk->icsk_ca_ops || optlen <= 1)
- goto err_clear;
- strncpy(optval, icsk->icsk_ca_ops->name, optlen);
- optval[optlen - 1] = 0;
- break;
- case TCP_SAVED_SYN:
- tp = tcp_sk(sk);
+static int __bpf_setsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen)
+{
+ if (!sk_fullsock(sk))
+ return -EINVAL;
- if (optlen <= 0 || !tp->saved_syn ||
- optlen > tcp_saved_syn_len(tp->saved_syn))
- goto err_clear;
- memcpy(optval, tp->saved_syn->data, optlen);
- break;
- default:
- goto err_clear;
- }
- } else if (level == SOL_IP) {
- struct inet_sock *inet = inet_sk(sk);
+ if (level == SOL_SOCKET)
+ return sol_socket_sockopt(sk, optname, optval, &optlen, false);
+ else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP)
+ return sol_ip_sockopt(sk, optname, optval, &optlen, false);
+ else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6)
+ return sol_ipv6_sockopt(sk, optname, optval, &optlen, false);
+ else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP)
+ return sol_tcp_sockopt(sk, optname, optval, &optlen, false);
- if (optlen != sizeof(int) || sk->sk_family != AF_INET)
- goto err_clear;
+ return -EINVAL;
+}
- /* Only some options are supported */
- switch (optname) {
- case IP_TOS:
- *((int *)optval) = (int)inet->tos;
- break;
- default:
- goto err_clear;
- }
-#if IS_ENABLED(CONFIG_IPV6)
- } else if (level == SOL_IPV6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
+static int _bpf_setsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen)
+{
+ if (sk_fullsock(sk))
+ sock_owned_by_me(sk);
+ return __bpf_setsockopt(sk, level, optname, optval, optlen);
+}
- if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
- goto err_clear;
+static int __bpf_getsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen)
+{
+ int err, saved_optlen = optlen;
- /* Only some options are supported */
- switch (optname) {
- case IPV6_TCLASS:
- *((int *)optval) = (int)np->tclass;
- break;
- default:
- goto err_clear;
- }
-#endif
-#endif
- } else {
- goto err_clear;
+ if (!sk_fullsock(sk)) {
+ err = -EINVAL;
+ goto done;
}
- return 0;
-err_clear:
- memset(optval, 0, optlen);
- return -EINVAL;
+
+ if (level == SOL_SOCKET)
+ err = sol_socket_sockopt(sk, optname, optval, &optlen, true);
+ else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP)
+ err = sol_tcp_sockopt(sk, optname, optval, &optlen, true);
+ else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP)
+ err = sol_ip_sockopt(sk, optname, optval, &optlen, true);
+ else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6)
+ err = sol_ipv6_sockopt(sk, optname, optval, &optlen, true);
+ else
+ err = -EINVAL;
+
+done:
+ if (err)
+ optlen = 0;
+ if (optlen < saved_optlen)
+ memset(optval + optlen, 0, saved_optlen - optlen);
+ return err;
}
static int _bpf_getsockopt(struct sock *sk, int level, int optname,
@@ -7667,34 +7571,23 @@ const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak;
static const struct bpf_func_proto *
sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- /* inet and inet6 sockets are created in a process
- * context so there is always a valid uid/gid
- */
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_sock_proto;
case BPF_FUNC_get_netns_cookie:
return &bpf_get_netns_cookie_sock_proto;
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
- case BPF_FUNC_get_current_pid_tgid:
- return &bpf_get_current_pid_tgid_proto;
- case BPF_FUNC_get_current_comm:
- return &bpf_get_current_comm_proto;
-#ifdef CONFIG_CGROUPS
- case BPF_FUNC_get_current_cgroup_id:
- return &bpf_get_current_cgroup_id_proto;
- case BPF_FUNC_get_current_ancestor_cgroup_id:
- return &bpf_get_current_ancestor_cgroup_id_proto;
-#endif
-#ifdef CONFIG_CGROUP_NET_CLASSID
- case BPF_FUNC_get_cgroup_classid:
- return &bpf_get_cgroup_classid_curr_proto;
-#endif
case BPF_FUNC_sk_storage_get:
return &bpf_sk_storage_get_cg_sock_proto;
case BPF_FUNC_ktime_get_coarse_ns:
@@ -7707,12 +7600,17 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
static const struct bpf_func_proto *
sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- /* inet and inet6 sockets are created in a process
- * context so there is always a valid uid/gid
- */
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_bind:
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET4_CONNECT:
@@ -7725,24 +7623,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_socket_cookie_sock_addr_proto;
case BPF_FUNC_get_netns_cookie:
return &bpf_get_netns_cookie_sock_addr_proto;
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
- case BPF_FUNC_get_current_pid_tgid:
- return &bpf_get_current_pid_tgid_proto;
- case BPF_FUNC_get_current_comm:
- return &bpf_get_current_comm_proto;
-#ifdef CONFIG_CGROUPS
- case BPF_FUNC_get_current_cgroup_id:
- return &bpf_get_current_cgroup_id_proto;
- case BPF_FUNC_get_current_ancestor_cgroup_id:
- return &bpf_get_current_ancestor_cgroup_id_proto;
-#endif
-#ifdef CONFIG_CGROUP_NET_CLASSID
- case BPF_FUNC_get_cgroup_classid:
- return &bpf_get_cgroup_classid_curr_proto;
-#endif
#ifdef CONFIG_INET
case BPF_FUNC_sk_lookup_tcp:
return &bpf_sock_addr_sk_lookup_tcp_proto;
@@ -7823,9 +7705,13 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto __weak;
static const struct bpf_func_proto *
cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
case BPF_FUNC_sk_fullsock:
return &bpf_sk_fullsock_proto;
case BPF_FUNC_sk_storage_get:
@@ -8065,6 +7951,12 @@ const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
static const struct bpf_func_proto *
sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
case BPF_FUNC_setsockopt:
return &bpf_sock_ops_setsockopt_proto;
@@ -8078,8 +7970,6 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sock_hash_update_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_sock_ops_proto;
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
case BPF_FUNC_sk_storage_get:
@@ -10812,14 +10702,13 @@ int sk_detach_filter(struct sock *sk)
}
EXPORT_SYMBOL_GPL(sk_detach_filter);
-int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
- unsigned int len)
+int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len)
{
struct sock_fprog_kern *fprog;
struct sk_filter *filter;
int ret = 0;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
filter = rcu_dereference_protected(sk->sk_filter,
lockdep_sock_is_held(sk));
if (!filter)
@@ -10844,7 +10733,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
goto out;
ret = -EFAULT;
- if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog)))
+ if (copy_to_sockptr(optval, fprog->filter, bpf_classic_proglen(fprog)))
goto out;
/* Instead of bytes, the API requests to return the number
@@ -10852,7 +10741,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
*/
ret = fprog->len;
out:
- release_sock(sk);
+ sockopt_release_sock(sk);
return ret;
}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 764c4cb3fe8f..990429c69ccd 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -866,8 +866,8 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
}
}
-bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
- __be16 proto, int nhoff, int hlen, unsigned int flags)
+u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
+ __be16 proto, int nhoff, int hlen, unsigned int flags)
{
struct bpf_flow_keys *flow_keys = ctx->flow_keys;
u32 result;
@@ -892,7 +892,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
flow_keys->thoff = clamp_t(u16, flow_keys->thoff,
flow_keys->nhoff, hlen);
- return result == BPF_OK;
+ return result;
}
static bool is_pppoe_ses_hdr_valid(const struct pppoe_hdr *hdr)
@@ -1008,6 +1008,7 @@ bool __skb_flow_dissect(const struct net *net,
};
__be16 n_proto = proto;
struct bpf_prog *prog;
+ u32 result;
if (skb) {
ctx.skb = skb;
@@ -1019,13 +1020,16 @@ bool __skb_flow_dissect(const struct net *net,
}
prog = READ_ONCE(run_array->items[0].prog);
- ret = bpf_flow_dissect(prog, &ctx, n_proto, nhoff,
- hlen, flags);
+ result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff,
+ hlen, flags);
+ if (result == BPF_FLOW_DISSECTOR_CONTINUE)
+ goto dissect_continue;
__skb_flow_bpf_to_target(&flow_keys, flow_dissector,
target_container);
rcu_read_unlock();
- return ret;
+ return result == BPF_OK;
}
+dissect_continue:
rcu_read_unlock();
}
diff --git a/net/core/sock.c b/net/core/sock.c
index 788c1372663c..eeb6cbac6f49 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -703,15 +703,17 @@ static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
goto out;
}
- return sock_bindtoindex(sk, index, true);
+ sockopt_lock_sock(sk);
+ ret = sock_bindtoindex_locked(sk, index);
+ sockopt_release_sock(sk);
out:
#endif
return ret;
}
-static int sock_getbindtodevice(struct sock *sk, char __user *optval,
- int __user *optlen, int len)
+static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
+ sockptr_t optlen, int len)
{
int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
@@ -735,12 +737,12 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval,
len = strlen(devname) + 1;
ret = -EFAULT;
- if (copy_to_user(optval, devname, len))
+ if (copy_to_sockptr(optval, devname, len))
goto out;
zero:
ret = -EFAULT;
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
goto out;
ret = 0;
@@ -1036,17 +1038,51 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
return 0;
}
+void sockopt_lock_sock(struct sock *sk)
+{
+ /* When current->bpf_ctx is set, the setsockopt is called from
+ * a bpf prog. bpf has ensured the sk lock has been
+ * acquired before calling setsockopt().
+ */
+ if (has_current_bpf_ctx())
+ return;
+
+ lock_sock(sk);
+}
+EXPORT_SYMBOL(sockopt_lock_sock);
+
+void sockopt_release_sock(struct sock *sk)
+{
+ if (has_current_bpf_ctx())
+ return;
+
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sockopt_release_sock);
+
+bool sockopt_ns_capable(struct user_namespace *ns, int cap)
+{
+ return has_current_bpf_ctx() || ns_capable(ns, cap);
+}
+EXPORT_SYMBOL(sockopt_ns_capable);
+
+bool sockopt_capable(int cap)
+{
+ return has_current_bpf_ctx() || capable(cap);
+}
+EXPORT_SYMBOL(sockopt_capable);
+
/*
* This is meant for all protocols to use and covers goings on
* at the socket level. Everything here is generic.
*/
-int sock_setsockopt(struct socket *sock, int level, int optname,
- sockptr_t optval, unsigned int optlen)
+int sk_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct so_timestamping timestamping;
+ struct socket *sock = sk->sk_socket;
struct sock_txtime sk_txtime;
- struct sock *sk = sock->sk;
int val;
int valbool;
struct linger ling;
@@ -1067,11 +1103,11 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
valbool = val ? 1 : 0;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
switch (optname) {
case SO_DEBUG:
- if (val && !capable(CAP_NET_ADMIN))
+ if (val && !sockopt_capable(CAP_NET_ADMIN))
ret = -EACCES;
else
sock_valbool_flag(sk, SOCK_DBG, valbool);
@@ -1115,7 +1151,7 @@ set_sndbuf:
break;
case SO_SNDBUFFORCE:
- if (!capable(CAP_NET_ADMIN)) {
+ if (!sockopt_capable(CAP_NET_ADMIN)) {
ret = -EPERM;
break;
}
@@ -1137,7 +1173,7 @@ set_sndbuf:
break;
case SO_RCVBUFFORCE:
- if (!capable(CAP_NET_ADMIN)) {
+ if (!sockopt_capable(CAP_NET_ADMIN)) {
ret = -EPERM;
break;
}
@@ -1164,8 +1200,8 @@ set_sndbuf:
case SO_PRIORITY:
if ((val >= 0 && val <= 6) ||
- ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
- ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
sk->sk_priority = val;
else
ret = -EPERM;
@@ -1228,7 +1264,7 @@ set_sndbuf:
case SO_RCVLOWAT:
if (val < 0)
val = INT_MAX;
- if (sock->ops->set_rcvlowat)
+ if (sock && sock->ops->set_rcvlowat)
ret = sock->ops->set_rcvlowat(sk, val);
else
WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
@@ -1310,8 +1346,8 @@ set_sndbuf:
clear_bit(SOCK_PASSSEC, &sock->flags);
break;
case SO_MARK:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
- !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
ret = -EPERM;
break;
}
@@ -1319,8 +1355,8 @@ set_sndbuf:
__sock_set_mark(sk, val);
break;
case SO_RCVMARK:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
- !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
ret = -EPERM;
break;
}
@@ -1354,7 +1390,7 @@ set_sndbuf:
#ifdef CONFIG_NET_RX_BUSY_POLL
case SO_BUSY_POLL:
/* allow unprivileged users to decrease the value */
- if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
+ if ((val > sk->sk_ll_usec) && !sockopt_capable(CAP_NET_ADMIN))
ret = -EPERM;
else {
if (val < 0)
@@ -1364,13 +1400,13 @@ set_sndbuf:
}
break;
case SO_PREFER_BUSY_POLL:
- if (valbool && !capable(CAP_NET_ADMIN))
+ if (valbool && !sockopt_capable(CAP_NET_ADMIN))
ret = -EPERM;
else
WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
break;
case SO_BUSY_POLL_BUDGET:
- if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
+ if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
ret = -EPERM;
} else {
if (val < 0 || val > U16_MAX)
@@ -1441,7 +1477,7 @@ set_sndbuf:
* scheduler has enough safe guards.
*/
if (sk_txtime.clockid != CLOCK_MONOTONIC &&
- !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
ret = -EPERM;
break;
}
@@ -1496,9 +1532,16 @@ set_sndbuf:
ret = -ENOPROTOOPT;
break;
}
- release_sock(sk);
+ sockopt_release_sock(sk);
return ret;
}
+
+int sock_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ return sk_setsockopt(sock->sk, level, optname,
+ optval, optlen);
+}
EXPORT_SYMBOL(sock_setsockopt);
static const struct cred *sk_get_peer_cred(struct sock *sk)
@@ -1525,22 +1568,25 @@ static void cred_to_ucred(struct pid *pid, const struct cred *cred,
}
}
-static int groups_to_user(gid_t __user *dst, const struct group_info *src)
+static int groups_to_user(sockptr_t dst, const struct group_info *src)
{
struct user_namespace *user_ns = current_user_ns();
int i;
- for (i = 0; i < src->ngroups; i++)
- if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
+ for (i = 0; i < src->ngroups; i++) {
+ gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
+
+ if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
return -EFAULT;
+ }
return 0;
}
-int sock_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
+int sk_getsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, sockptr_t optlen)
{
- struct sock *sk = sock->sk;
+ struct socket *sock = sk->sk_socket;
union {
int val;
@@ -1557,7 +1603,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
int lv = sizeof(int);
int len;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
if (len < 0)
return -EINVAL;
@@ -1692,7 +1738,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
spin_unlock(&sk->sk_peer_lock);
- if (copy_to_user(optval, &peercred, len))
+ if (copy_to_sockptr(optval, &peercred, len))
return -EFAULT;
goto lenout;
}
@@ -1710,11 +1756,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
if (len < n * sizeof(gid_t)) {
len = n * sizeof(gid_t);
put_cred(cred);
- return put_user(len, optlen) ? -EFAULT : -ERANGE;
+ return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
}
len = n * sizeof(gid_t);
- ret = groups_to_user((gid_t __user *)optval, cred->group_info);
+ ret = groups_to_user(optval, cred->group_info);
put_cred(cred);
if (ret)
return ret;
@@ -1730,7 +1776,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
return -ENOTCONN;
if (lv < len)
return -EINVAL;
- if (copy_to_user(optval, address, len))
+ if (copy_to_sockptr(optval, address, len))
return -EFAULT;
goto lenout;
}
@@ -1747,7 +1793,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_PEERSEC:
- return security_socket_getpeersec_stream(sock, optval, optlen, len);
+ return security_socket_getpeersec_stream(sock, optval.user, optlen.user, len);
case SO_MARK:
v.val = sk->sk_mark;
@@ -1779,7 +1825,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
return sock_getbindtodevice(sk, optval, optlen, len);
case SO_GET_FILTER:
- len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
+ len = sk_get_filter(sk, optval, len);
if (len < 0)
return len;
@@ -1827,7 +1873,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
sk_get_meminfo(sk, meminfo);
len = min_t(unsigned int, len, sizeof(meminfo));
- if (copy_to_user(optval, &meminfo, len))
+ if (copy_to_sockptr(optval, &meminfo, len))
return -EFAULT;
goto lenout;
@@ -1896,14 +1942,22 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
if (len > lv)
len = lv;
- if (copy_to_user(optval, &v, len))
+ if (copy_to_sockptr(optval, &v, len))
return -EFAULT;
lenout:
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
return 0;
}
+int sock_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ return sk_getsockopt(sock->sk, level, optname,
+ USER_SOCKPTR(optval),
+ USER_SOCKPTR(optlen));
+}
+
/*
* Initialize an sk_lock.
*
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index e3ab0cb61624..df0660d818ac 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2529,11 +2529,10 @@ done:
err = ip_mc_leave_group(sk, &imr);
return err;
}
-
int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
- struct ip_msfilter __user *optval, int __user *optlen)
+ sockptr_t optval, sockptr_t optlen)
{
- int err, len, count, copycount;
+ int err, len, count, copycount, msf_size;
struct ip_mreqn imr;
__be32 addr = msf->imsf_multiaddr;
struct ip_mc_socklist *pmc;
@@ -2575,12 +2574,15 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc;
len = flex_array_size(psl, sl_addr, copycount);
msf->imsf_numsrc = count;
- if (put_user(IP_MSFILTER_SIZE(copycount), optlen) ||
- copy_to_user(optval, msf, IP_MSFILTER_SIZE(0))) {
+ msf_size = IP_MSFILTER_SIZE(copycount);
+ if (copy_to_sockptr(optlen, &msf_size, sizeof(int)) ||
+ copy_to_sockptr(optval, msf, IP_MSFILTER_SIZE(0))) {
return -EFAULT;
}
if (len &&
- copy_to_user(&optval->imsf_slist_flex[0], psl->sl_addr, len))
+ copy_to_sockptr_offset(optval,
+ offsetof(struct ip_msfilter, imsf_slist_flex),
+ psl->sl_addr, len))
return -EFAULT;
return 0;
done:
@@ -2588,7 +2590,7 @@ done:
}
int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
- struct sockaddr_storage __user *p)
+ sockptr_t optval, size_t ss_offset)
{
int i, count, copycount;
struct sockaddr_in *psin;
@@ -2618,15 +2620,17 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
count = psl ? psl->sl_count : 0;
copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
gsf->gf_numsrc = count;
- for (i = 0; i < copycount; i++, p++) {
+ for (i = 0; i < copycount; i++) {
struct sockaddr_storage ss;
psin = (struct sockaddr_in *)&ss;
memset(&ss, 0, sizeof(ss));
psin->sin_family = AF_INET;
psin->sin_addr.s_addr = psl->sl_addr[i];
- if (copy_to_user(p, &ss, sizeof(ss)))
+ if (copy_to_sockptr_offset(optval, ss_offset,
+ &ss, sizeof(ss)))
return -EFAULT;
+ ss_offset += sizeof(ss);
}
return 0;
}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index e49a61a053a6..6e19cad154f5 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -888,8 +888,8 @@ static int compat_ip_mcast_join_leave(struct sock *sk, int optname,
DEFINE_STATIC_KEY_FALSE(ip4_min_ttl);
-static int do_ip_setsockopt(struct sock *sk, int level, int optname,
- sockptr_t optval, unsigned int optlen)
+int do_ip_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
@@ -944,7 +944,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname,
err = 0;
if (needs_rtnl)
rtnl_lock();
- lock_sock(sk);
+ sockopt_lock_sock(sk);
switch (optname) {
case IP_OPTIONS:
@@ -1333,14 +1333,14 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname,
case IP_IPSEC_POLICY:
case IP_XFRM_POLICY:
err = -EPERM;
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
break;
err = xfrm_user_policy(sk, optname, optval, optlen);
break;
case IP_TRANSPARENT:
- if (!!val && !ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
- !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ if (!!val && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
err = -EPERM;
break;
}
@@ -1368,13 +1368,13 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname,
err = -ENOPROTOOPT;
break;
}
- release_sock(sk);
+ sockopt_release_sock(sk);
if (needs_rtnl)
rtnl_unlock();
return err;
e_inval:
- release_sock(sk);
+ sockopt_release_sock(sk);
if (needs_rtnl)
rtnl_unlock();
return -EINVAL;
@@ -1462,37 +1462,37 @@ static bool getsockopt_needs_rtnl(int optname)
return false;
}
-static int ip_get_mcast_msfilter(struct sock *sk, void __user *optval,
- int __user *optlen, int len)
+static int ip_get_mcast_msfilter(struct sock *sk, sockptr_t optval,
+ sockptr_t optlen, int len)
{
const int size0 = offsetof(struct group_filter, gf_slist_flex);
- struct group_filter __user *p = optval;
struct group_filter gsf;
- int num;
+ int num, gsf_size;
int err;
if (len < size0)
return -EINVAL;
- if (copy_from_user(&gsf, p, size0))
+ if (copy_from_sockptr(&gsf, optval, size0))
return -EFAULT;
num = gsf.gf_numsrc;
- err = ip_mc_gsfget(sk, &gsf, p->gf_slist_flex);
+ err = ip_mc_gsfget(sk, &gsf, optval,
+ offsetof(struct group_filter, gf_slist_flex));
if (err)
return err;
if (gsf.gf_numsrc < num)
num = gsf.gf_numsrc;
- if (put_user(GROUP_FILTER_SIZE(num), optlen) ||
- copy_to_user(p, &gsf, size0))
+ gsf_size = GROUP_FILTER_SIZE(num);
+ if (copy_to_sockptr(optlen, &gsf_size, sizeof(int)) ||
+ copy_to_sockptr(optval, &gsf, size0))
return -EFAULT;
return 0;
}
-static int compat_ip_get_mcast_msfilter(struct sock *sk, void __user *optval,
- int __user *optlen, int len)
+static int compat_ip_get_mcast_msfilter(struct sock *sk, sockptr_t optval,
+ sockptr_t optlen, int len)
{
const int size0 = offsetof(struct compat_group_filter, gf_slist_flex);
- struct compat_group_filter __user *p = optval;
struct compat_group_filter gf32;
struct group_filter gf;
int num;
@@ -1500,7 +1500,7 @@ static int compat_ip_get_mcast_msfilter(struct sock *sk, void __user *optval,
if (len < size0)
return -EINVAL;
- if (copy_from_user(&gf32, p, size0))
+ if (copy_from_sockptr(&gf32, optval, size0))
return -EFAULT;
gf.gf_interface = gf32.gf_interface;
@@ -1508,21 +1508,24 @@ static int compat_ip_get_mcast_msfilter(struct sock *sk, void __user *optval,
num = gf.gf_numsrc = gf32.gf_numsrc;
gf.gf_group = gf32.gf_group;
- err = ip_mc_gsfget(sk, &gf, p->gf_slist_flex);
+ err = ip_mc_gsfget(sk, &gf, optval,
+ offsetof(struct compat_group_filter, gf_slist_flex));
if (err)
return err;
if (gf.gf_numsrc < num)
num = gf.gf_numsrc;
len = GROUP_FILTER_SIZE(num) - (sizeof(gf) - sizeof(gf32));
- if (put_user(len, optlen) ||
- put_user(gf.gf_fmode, &p->gf_fmode) ||
- put_user(gf.gf_numsrc, &p->gf_numsrc))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)) ||
+ copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode),
+ &gf.gf_fmode, sizeof(gf.gf_fmode)) ||
+ copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc),
+ &gf.gf_numsrc, sizeof(gf.gf_numsrc)))
return -EFAULT;
return 0;
}
-static int do_ip_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
+int do_ip_getsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, sockptr_t optlen)
{
struct inet_sock *inet = inet_sk(sk);
bool needs_rtnl = getsockopt_needs_rtnl(optname);
@@ -1535,14 +1538,14 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
if (ip_mroute_opt(optname))
return ip_mroute_getsockopt(sk, optname, optval, optlen);
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
if (len < 0)
return -EINVAL;
if (needs_rtnl)
rtnl_lock();
- lock_sock(sk);
+ sockopt_lock_sock(sk);
switch (optname) {
case IP_OPTIONS:
@@ -1558,17 +1561,19 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
memcpy(optbuf, &inet_opt->opt,
sizeof(struct ip_options) +
inet_opt->opt.optlen);
- release_sock(sk);
+ sockopt_release_sock(sk);
- if (opt->optlen == 0)
- return put_user(0, optlen);
+ if (opt->optlen == 0) {
+ len = 0;
+ return copy_to_sockptr(optlen, &len, sizeof(int));
+ }
ip_options_undo(opt);
len = min_t(unsigned int, len, opt->optlen);
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, opt->__data, len))
+ if (copy_to_sockptr(optval, opt->__data, len))
return -EFAULT;
return 0;
}
@@ -1632,7 +1637,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
dst_release(dst);
}
if (!val) {
- release_sock(sk);
+ sockopt_release_sock(sk);
return -ENOTCONN;
}
break;
@@ -1657,11 +1662,11 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
struct in_addr addr;
len = min_t(unsigned int, len, sizeof(struct in_addr));
addr.s_addr = inet->mc_addr;
- release_sock(sk);
+ sockopt_release_sock(sk);
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &addr, len))
+ if (copy_to_sockptr(optval, &addr, len))
return -EFAULT;
return 0;
}
@@ -1673,12 +1678,11 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
err = -EINVAL;
goto out;
}
- if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
+ if (copy_from_sockptr(&msf, optval, IP_MSFILTER_SIZE(0))) {
err = -EFAULT;
goto out;
}
- err = ip_mc_msfget(sk, &msf,
- (struct ip_msfilter __user *)optval, optlen);
+ err = ip_mc_msfget(sk, &msf, optval, optlen);
goto out;
}
case MCAST_MSFILTER:
@@ -1695,13 +1699,18 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
{
struct msghdr msg;
- release_sock(sk);
+ sockopt_release_sock(sk);
if (sk->sk_type != SOCK_STREAM)
return -ENOPROTOOPT;
- msg.msg_control_is_user = true;
- msg.msg_control_user = optval;
+ if (optval.is_kernel) {
+ msg.msg_control_is_user = false;
+ msg.msg_control = optval.kernel;
+ } else {
+ msg.msg_control_is_user = true;
+ msg.msg_control_user = optval.user;
+ }
msg.msg_controllen = len;
msg.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0;
@@ -1722,7 +1731,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos);
}
len -= msg.msg_controllen;
- return put_user(len, optlen);
+ return copy_to_sockptr(optlen, &len, sizeof(int));
}
case IP_FREEBIND:
val = inet->freebind;
@@ -1734,29 +1743,29 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
val = inet->min_ttl;
break;
default:
- release_sock(sk);
+ sockopt_release_sock(sk);
return -ENOPROTOOPT;
}
- release_sock(sk);
+ sockopt_release_sock(sk);
if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) {
unsigned char ucval = (unsigned char)val;
len = 1;
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &ucval, 1))
+ if (copy_to_sockptr(optval, &ucval, 1))
return -EFAULT;
} else {
len = min_t(unsigned int, sizeof(int), len);
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &val, len))
+ if (copy_to_sockptr(optval, &val, len))
return -EFAULT;
}
return 0;
out:
- release_sock(sk);
+ sockopt_release_sock(sk);
if (needs_rtnl)
rtnl_unlock();
return err;
@@ -1767,7 +1776,8 @@ int ip_getsockopt(struct sock *sk, int level,
{
int err;
- err = do_ip_getsockopt(sk, level, optname, optval, optlen);
+ err = do_ip_getsockopt(sk, level, optname,
+ USER_SOCKPTR(optval), USER_SOCKPTR(optlen));
#if IS_ENABLED(CONFIG_BPFILTER_UMH)
if (optname >= BPFILTER_IPT_SO_GET_INFO &&
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 73651d17e51f..95eefbe2e142 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1546,7 +1546,8 @@ out:
}
/* Getsock opt support for the multicast routing system. */
-int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
+int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval,
+ sockptr_t optlen)
{
int olr;
int val;
@@ -1577,14 +1578,14 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
return -ENOPROTOOPT;
}
- if (get_user(olr, optlen))
+ if (copy_from_sockptr(&olr, optlen, sizeof(int)))
return -EFAULT;
olr = min_t(unsigned int, olr, sizeof(int));
if (olr < 0)
return -EINVAL;
- if (put_user(olr, optlen))
+ if (copy_to_sockptr(optlen, &olr, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &val, olr))
+ if (copy_to_sockptr(optval, &val, olr))
return -EFAULT;
return 0;
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 306b94dedc8d..52b8879e7d20 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3199,7 +3199,7 @@ EXPORT_SYMBOL(tcp_disconnect);
static inline bool tcp_can_repair_sock(const struct sock *sk)
{
- return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
+ return sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
(sk->sk_state != TCP_LISTEN);
}
@@ -3476,8 +3476,8 @@ int tcp_set_window_clamp(struct sock *sk, int val)
/*
* Socket option code for TCP.
*/
-static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
- sockptr_t optval, unsigned int optlen)
+int do_tcp_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -3499,11 +3499,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
return -EFAULT;
name[val] = 0;
- lock_sock(sk);
- err = tcp_set_congestion_control(sk, name, true,
- ns_capable(sock_net(sk)->user_ns,
- CAP_NET_ADMIN));
- release_sock(sk);
+ sockopt_lock_sock(sk);
+ err = tcp_set_congestion_control(sk, name, !has_current_bpf_ctx(),
+ sockopt_ns_capable(sock_net(sk)->user_ns,
+ CAP_NET_ADMIN));
+ sockopt_release_sock(sk);
return err;
}
case TCP_ULP: {
@@ -3519,9 +3519,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
return -EFAULT;
name[val] = 0;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
err = tcp_set_ulp(sk, name);
- release_sock(sk);
+ sockopt_release_sock(sk);
return err;
}
case TCP_FASTOPEN_KEY: {
@@ -3554,7 +3554,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
switch (optname) {
case TCP_MAXSEG:
@@ -3776,7 +3776,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
break;
}
- release_sock(sk);
+ sockopt_release_sock(sk);
return err;
}
@@ -4040,15 +4040,15 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
return stats;
}
-static int do_tcp_getsockopt(struct sock *sk, int level,
- int optname, char __user *optval, int __user *optlen)
+int do_tcp_getsockopt(struct sock *sk, int level,
+ int optname, sockptr_t optval, sockptr_t optlen)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
int val, len;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
len = min_t(unsigned int, len, sizeof(int));
@@ -4098,15 +4098,15 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_INFO: {
struct tcp_info info;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
tcp_get_info(sk, &info);
len = min_t(unsigned int, len, sizeof(info));
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &info, len))
+ if (copy_to_sockptr(optval, &info, len))
return -EFAULT;
return 0;
}
@@ -4116,7 +4116,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
size_t sz = 0;
int attr;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
ca_ops = icsk->icsk_ca_ops;
@@ -4124,9 +4124,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
sz = ca_ops->get_info(sk, ~0U, &attr, &info);
len = min_t(unsigned int, len, sz);
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &info, len))
+ if (copy_to_sockptr(optval, &info, len))
return -EFAULT;
return 0;
}
@@ -4135,27 +4135,28 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
break;
case TCP_CONGESTION:
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
+ if (copy_to_sockptr(optval, icsk->icsk_ca_ops->name, len))
return -EFAULT;
return 0;
case TCP_ULP:
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
if (!icsk->icsk_ulp_ops) {
- if (put_user(0, optlen))
+ len = 0;
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
return 0;
}
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len))
+ if (copy_to_sockptr(optval, icsk->icsk_ulp_ops->name, len))
return -EFAULT;
return 0;
@@ -4163,15 +4164,15 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)];
unsigned int key_len;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
key_len = tcp_fastopen_get_cipher(net, icsk, key) *
TCP_FASTOPEN_KEY_LENGTH;
len = min_t(unsigned int, len, key_len);
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, key, len))
+ if (copy_to_sockptr(optval, key, len))
return -EFAULT;
return 0;
}
@@ -4197,7 +4198,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_REPAIR_WINDOW: {
struct tcp_repair_window opt;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
if (len != sizeof(opt))
@@ -4212,7 +4213,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
opt.rcv_wnd = tp->rcv_wnd;
opt.rcv_wup = tp->rcv_wup;
- if (copy_to_user(optval, &opt, len))
+ if (copy_to_sockptr(optval, &opt, len))
return -EFAULT;
return 0;
}
@@ -4258,35 +4259,35 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = tp->save_syn;
break;
case TCP_SAVED_SYN: {
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
if (tp->saved_syn) {
if (len < tcp_saved_syn_len(tp->saved_syn)) {
- if (put_user(tcp_saved_syn_len(tp->saved_syn),
- optlen)) {
- release_sock(sk);
+ len = tcp_saved_syn_len(tp->saved_syn);
+ if (copy_to_sockptr(optlen, &len, sizeof(int))) {
+ sockopt_release_sock(sk);
return -EFAULT;
}
- release_sock(sk);
+ sockopt_release_sock(sk);
return -EINVAL;
}
len = tcp_saved_syn_len(tp->saved_syn);
- if (put_user(len, optlen)) {
- release_sock(sk);
+ if (copy_to_sockptr(optlen, &len, sizeof(int))) {
+ sockopt_release_sock(sk);
return -EFAULT;
}
- if (copy_to_user(optval, tp->saved_syn->data, len)) {
- release_sock(sk);
+ if (copy_to_sockptr(optval, tp->saved_syn->data, len)) {
+ sockopt_release_sock(sk);
return -EFAULT;
}
tcp_saved_syn_free(tp);
- release_sock(sk);
+ sockopt_release_sock(sk);
} else {
- release_sock(sk);
+ sockopt_release_sock(sk);
len = 0;
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
}
return 0;
@@ -4297,31 +4298,31 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
struct tcp_zerocopy_receive zc = {};
int err;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
if (len < 0 ||
len < offsetofend(struct tcp_zerocopy_receive, length))
return -EINVAL;
if (unlikely(len > sizeof(zc))) {
- err = check_zeroed_user(optval + sizeof(zc),
- len - sizeof(zc));
+ err = check_zeroed_sockptr(optval, sizeof(zc),
+ len - sizeof(zc));
if (err < 1)
return err == 0 ? -EINVAL : err;
len = sizeof(zc);
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
}
- if (copy_from_user(&zc, optval, len))
+ if (copy_from_sockptr(&zc, optval, len))
return -EFAULT;
if (zc.reserved)
return -EINVAL;
if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS))
return -EINVAL;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
err = tcp_zerocopy_receive(sk, &zc, &tss);
err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
&zc, &len, err);
- release_sock(sk);
+ sockopt_release_sock(sk);
if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
goto zerocopy_rcv_cmsg;
switch (len) {
@@ -4351,7 +4352,7 @@ zerocopy_rcv_sk_err:
zerocopy_rcv_inq:
zc.inq = tcp_inq_hint(sk);
zerocopy_rcv_out:
- if (!err && copy_to_user(optval, &zc, len))
+ if (!err && copy_to_sockptr(optval, &zc, len))
err = -EFAULT;
return err;
}
@@ -4360,9 +4361,9 @@ zerocopy_rcv_out:
return -ENOPROTOOPT;
}
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &val, len))
+ if (copy_to_sockptr(optval, &val, len))
return -EFAULT;
return 0;
}
@@ -4387,7 +4388,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
if (level != SOL_TCP)
return icsk->icsk_af_ops->getsockopt(sk, level, optname,
optval, optlen);
- return do_tcp_getsockopt(sk, level, optname, optval, optlen);
+ return do_tcp_getsockopt(sk, level, optname, USER_SOCKPTR(optval),
+ USER_SOCKPTR(optlen));
}
EXPORT_SYMBOL(tcp_getsockopt);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 2ce0c44d0081..19732b5dce23 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -1057,6 +1057,8 @@ static const struct ipv6_stub ipv6_stub_impl = {
static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = {
.inet6_bind = __inet6_bind,
.udp6_lib_lookup = __udp6_lib_lookup,
+ .ipv6_setsockopt = do_ipv6_setsockopt,
+ .ipv6_getsockopt = do_ipv6_getsockopt,
};
static int __init inet6_init(void)
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index a9ba41648e36..516e83b52f26 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1827,8 +1827,8 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval,
* Getsock opt support for the multicast routing system.
*/
-int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval,
- int __user *optlen)
+int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval,
+ sockptr_t optlen)
{
int olr;
int val;
@@ -1859,16 +1859,16 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval,
return -ENOPROTOOPT;
}
- if (get_user(olr, optlen))
+ if (copy_from_sockptr(&olr, optlen, sizeof(int)))
return -EFAULT;
olr = min_t(int, olr, sizeof(int));
if (olr < 0)
return -EINVAL;
- if (put_user(olr, optlen))
+ if (copy_to_sockptr(optlen, &olr, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &val, olr))
+ if (copy_to_sockptr(optval, &val, olr))
return -EFAULT;
return 0;
}
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index e0dcc7a193df..2d2f4dd9e5df 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -327,7 +327,7 @@ static int ipv6_set_opt_hdr(struct sock *sk, int optname, sockptr_t optval,
int err;
/* hop-by-hop / destination options are privileged option */
- if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW))
+ if (optname != IPV6_RTHDR && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW))
return -EPERM;
/* remove any sticky options header with a zero option
@@ -391,8 +391,8 @@ sticky_done:
return err;
}
-static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
- sockptr_t optval, unsigned int optlen)
+int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct net *net = sock_net(sk);
@@ -417,7 +417,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
if (needs_rtnl)
rtnl_lock();
- lock_sock(sk);
+ sockopt_lock_sock(sk);
switch (optname) {
@@ -634,8 +634,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
break;
case IPV6_TRANSPARENT:
- if (valbool && !ns_capable(net->user_ns, CAP_NET_RAW) &&
- !ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+ if (valbool && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW) &&
+ !sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) {
retv = -EPERM;
break;
}
@@ -946,7 +946,7 @@ done:
case IPV6_IPSEC_POLICY:
case IPV6_XFRM_POLICY:
retv = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ if (!sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN))
break;
retv = xfrm_user_policy(sk, optname, optval, optlen);
break;
@@ -994,14 +994,14 @@ done:
break;
}
- release_sock(sk);
+ sockopt_release_sock(sk);
if (needs_rtnl)
rtnl_unlock();
return retv;
e_inval:
- release_sock(sk);
+ sockopt_release_sock(sk);
if (needs_rtnl)
rtnl_unlock();
return -EINVAL;
@@ -1030,7 +1030,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
EXPORT_SYMBOL(ipv6_setsockopt);
static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt,
- int optname, char __user *optval, int len)
+ int optname, sockptr_t optval, int len)
{
struct ipv6_opt_hdr *hdr;
@@ -1058,56 +1058,53 @@ static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt,
return 0;
len = min_t(unsigned int, len, ipv6_optlen(hdr));
- if (copy_to_user(optval, hdr, len))
+ if (copy_to_sockptr(optval, hdr, len))
return -EFAULT;
return len;
}
-static int ipv6_get_msfilter(struct sock *sk, void __user *optval,
- int __user *optlen, int len)
+static int ipv6_get_msfilter(struct sock *sk, sockptr_t optval,
+ sockptr_t optlen, int len)
{
const int size0 = offsetof(struct group_filter, gf_slist_flex);
- struct group_filter __user *p = optval;
struct group_filter gsf;
int num;
int err;
if (len < size0)
return -EINVAL;
- if (copy_from_user(&gsf, p, size0))
+ if (copy_from_sockptr(&gsf, optval, size0))
return -EFAULT;
if (gsf.gf_group.ss_family != AF_INET6)
return -EADDRNOTAVAIL;
num = gsf.gf_numsrc;
- lock_sock(sk);
- err = ip6_mc_msfget(sk, &gsf, p->gf_slist_flex);
+ sockopt_lock_sock(sk);
+ err = ip6_mc_msfget(sk, &gsf, optval, size0);
if (!err) {
if (num > gsf.gf_numsrc)
num = gsf.gf_numsrc;
- if (put_user(GROUP_FILTER_SIZE(num), optlen) ||
- copy_to_user(p, &gsf, size0))
+ len = GROUP_FILTER_SIZE(num);
+ if (copy_to_sockptr(optlen, &len, sizeof(int)) ||
+ copy_to_sockptr(optval, &gsf, size0))
err = -EFAULT;
}
- release_sock(sk);
+ sockopt_release_sock(sk);
return err;
}
-static int compat_ipv6_get_msfilter(struct sock *sk, void __user *optval,
- int __user *optlen)
+static int compat_ipv6_get_msfilter(struct sock *sk, sockptr_t optval,
+ sockptr_t optlen, int len)
{
const int size0 = offsetof(struct compat_group_filter, gf_slist_flex);
- struct compat_group_filter __user *p = optval;
struct compat_group_filter gf32;
struct group_filter gf;
- int len, err;
+ int err;
int num;
- if (get_user(len, optlen))
- return -EFAULT;
if (len < size0)
return -EINVAL;
- if (copy_from_user(&gf32, p, size0))
+ if (copy_from_sockptr(&gf32, optval, size0))
return -EFAULT;
gf.gf_interface = gf32.gf_interface;
gf.gf_fmode = gf32.gf_fmode;
@@ -1117,23 +1114,25 @@ static int compat_ipv6_get_msfilter(struct sock *sk, void __user *optval,
if (gf.gf_group.ss_family != AF_INET6)
return -EADDRNOTAVAIL;
- lock_sock(sk);
- err = ip6_mc_msfget(sk, &gf, p->gf_slist_flex);
- release_sock(sk);
+ sockopt_lock_sock(sk);
+ err = ip6_mc_msfget(sk, &gf, optval, size0);
+ sockopt_release_sock(sk);
if (err)
return err;
if (num > gf.gf_numsrc)
num = gf.gf_numsrc;
len = GROUP_FILTER_SIZE(num) - (sizeof(gf)-sizeof(gf32));
- if (put_user(len, optlen) ||
- put_user(gf.gf_fmode, &p->gf_fmode) ||
- put_user(gf.gf_numsrc, &p->gf_numsrc))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)) ||
+ copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode),
+ &gf.gf_fmode, sizeof(gf32.gf_fmode)) ||
+ copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc),
+ &gf.gf_numsrc, sizeof(gf32.gf_numsrc)))
return -EFAULT;
return 0;
}
-static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen, unsigned int flags)
+int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, sockptr_t optlen)
{
struct ipv6_pinfo *np = inet6_sk(sk);
int len;
@@ -1142,7 +1141,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
if (ip6_mroute_opt(optname))
return ip6_mroute_getsockopt(sk, optname, optval, optlen);
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
switch (optname) {
case IPV6_ADDRFORM:
@@ -1156,7 +1155,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
break;
case MCAST_MSFILTER:
if (in_compat_syscall())
- return compat_ipv6_get_msfilter(sk, optval, optlen);
+ return compat_ipv6_get_msfilter(sk, optval, optlen, len);
return ipv6_get_msfilter(sk, optval, optlen, len);
case IPV6_2292PKTOPTIONS:
{
@@ -1166,16 +1165,21 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
if (sk->sk_type != SOCK_STREAM)
return -ENOPROTOOPT;
- msg.msg_control_user = optval;
+ if (optval.is_kernel) {
+ msg.msg_control_is_user = false;
+ msg.msg_control = optval.kernel;
+ } else {
+ msg.msg_control_is_user = true;
+ msg.msg_control_user = optval.user;
+ }
msg.msg_controllen = len;
- msg.msg_flags = flags;
- msg.msg_control_is_user = true;
+ msg.msg_flags = 0;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
skb = np->pktoptions;
if (skb)
ip6_datagram_recv_ctl(sk, &msg, skb);
- release_sock(sk);
+ sockopt_release_sock(sk);
if (!skb) {
if (np->rxopt.bits.rxinfo) {
struct in6_pktinfo src_info;
@@ -1212,7 +1216,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
}
}
len -= msg.msg_controllen;
- return put_user(len, optlen);
+ return copy_to_sockptr(optlen, &len, sizeof(int));
}
case IPV6_MTU:
{
@@ -1264,15 +1268,15 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
{
struct ipv6_txoptions *opt;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
opt = rcu_dereference_protected(np->opt,
lockdep_sock_is_held(sk));
len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len);
- release_sock(sk);
+ sockopt_release_sock(sk);
/* check if ipv6_getsockopt_sticky() returns err code */
if (len < 0)
return len;
- return put_user(len, optlen);
+ return copy_to_sockptr(optlen, &len, sizeof(int));
}
case IPV6_RECVHOPOPTS:
@@ -1326,9 +1330,9 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
if (!mtuinfo.ip6m_mtu)
return -ENOTCONN;
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &mtuinfo, len))
+ if (copy_to_sockptr(optval, &mtuinfo, len))
return -EFAULT;
return 0;
@@ -1405,7 +1409,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
if (len < sizeof(freq))
return -EINVAL;
- if (copy_from_user(&freq, optval, sizeof(freq)))
+ if (copy_from_sockptr(&freq, optval, sizeof(freq)))
return -EFAULT;
if (freq.flr_action != IPV6_FL_A_GET)
@@ -1420,9 +1424,9 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
if (val < 0)
return val;
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &freq, len))
+ if (copy_to_sockptr(optval, &freq, len))
return -EFAULT;
return 0;
@@ -1474,9 +1478,9 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
return -ENOPROTOOPT;
}
len = min_t(unsigned int, sizeof(int), len);
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &val, len))
+ if (copy_to_sockptr(optval, &val, len))
return -EFAULT;
return 0;
}
@@ -1492,7 +1496,8 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname,
if (level != SOL_IPV6)
return -ENOPROTOOPT;
- err = do_ipv6_getsockopt(sk, level, optname, optval, optlen, 0);
+ err = do_ipv6_getsockopt(sk, level, optname,
+ USER_SOCKPTR(optval), USER_SOCKPTR(optlen));
#ifdef CONFIG_NETFILTER
/* we need to exclude all possible ENOPROTOOPTs except default case */
if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) {
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 87c699d57b36..0566ab03ddbe 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -580,7 +580,7 @@ done:
}
int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
- struct sockaddr_storage __user *p)
+ sockptr_t optval, size_t ss_offset)
{
struct ipv6_pinfo *inet6 = inet6_sk(sk);
const struct in6_addr *group;
@@ -612,8 +612,7 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
gsf->gf_numsrc = count;
-
- for (i = 0; i < copycount; i++, p++) {
+ for (i = 0; i < copycount; i++) {
struct sockaddr_in6 *psin6;
struct sockaddr_storage ss;
@@ -621,8 +620,9 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
memset(&ss, 0, sizeof(ss));
psin6->sin6_family = AF_INET6;
psin6->sin6_addr = psl->sl_addr[i];
- if (copy_to_user(p, &ss, sizeof(ss)))
+ if (copy_to_sockptr_offset(optval, ss_offset, &ss, sizeof(ss)))
return -EFAULT;
+ ss_offset += sizeof(ss);
}
return 0;
}
diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c
index 8773f22b6a98..7342c5b2f278 100644
--- a/samples/bpf/map_perf_test_kern.c
+++ b/samples/bpf/map_perf_test_kern.c
@@ -108,11 +108,14 @@ int stress_hmap(struct pt_regs *ctx)
u32 key = bpf_get_current_pid_tgid();
long init_val = 1;
long *value;
+ int i;
- bpf_map_update_elem(&hash_map, &key, &init_val, BPF_ANY);
- value = bpf_map_lookup_elem(&hash_map, &key);
- if (value)
- bpf_map_delete_elem(&hash_map, &key);
+ for (i = 0; i < 10; i++) {
+ bpf_map_update_elem(&hash_map, &key, &init_val, BPF_ANY);
+ value = bpf_map_lookup_elem(&hash_map, &key);
+ if (value)
+ bpf_map_delete_elem(&hash_map, &key);
+ }
return 0;
}
@@ -123,11 +126,14 @@ int stress_percpu_hmap(struct pt_regs *ctx)
u32 key = bpf_get_current_pid_tgid();
long init_val = 1;
long *value;
+ int i;
- bpf_map_update_elem(&percpu_hash_map, &key, &init_val, BPF_ANY);
- value = bpf_map_lookup_elem(&percpu_hash_map, &key);
- if (value)
- bpf_map_delete_elem(&percpu_hash_map, &key);
+ for (i = 0; i < 10; i++) {
+ bpf_map_update_elem(&percpu_hash_map, &key, &init_val, BPF_ANY);
+ value = bpf_map_lookup_elem(&percpu_hash_map, &key);
+ if (value)
+ bpf_map_delete_elem(&percpu_hash_map, &key);
+ }
return 0;
}
@@ -137,11 +143,14 @@ int stress_hmap_alloc(struct pt_regs *ctx)
u32 key = bpf_get_current_pid_tgid();
long init_val = 1;
long *value;
+ int i;
- bpf_map_update_elem(&hash_map_alloc, &key, &init_val, BPF_ANY);
- value = bpf_map_lookup_elem(&hash_map_alloc, &key);
- if (value)
- bpf_map_delete_elem(&hash_map_alloc, &key);
+ for (i = 0; i < 10; i++) {
+ bpf_map_update_elem(&hash_map_alloc, &key, &init_val, BPF_ANY);
+ value = bpf_map_lookup_elem(&hash_map_alloc, &key);
+ if (value)
+ bpf_map_delete_elem(&hash_map_alloc, &key);
+ }
return 0;
}
@@ -151,11 +160,14 @@ int stress_percpu_hmap_alloc(struct pt_regs *ctx)
u32 key = bpf_get_current_pid_tgid();
long init_val = 1;
long *value;
+ int i;
- bpf_map_update_elem(&percpu_hash_map_alloc, &key, &init_val, BPF_ANY);
- value = bpf_map_lookup_elem(&percpu_hash_map_alloc, &key);
- if (value)
- bpf_map_delete_elem(&percpu_hash_map_alloc, &key);
+ for (i = 0; i < 10; i++) {
+ bpf_map_update_elem(&percpu_hash_map_alloc, &key, &init_val, BPF_ANY);
+ value = bpf_map_lookup_elem(&percpu_hash_map_alloc, &key);
+ if (value)
+ bpf_map_delete_elem(&percpu_hash_map_alloc, &key);
+ }
return 0;
}
diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c
index b6fc174ab1f2..1bb53f4b29e1 100644
--- a/samples/bpf/map_perf_test_user.c
+++ b/samples/bpf/map_perf_test_user.c
@@ -72,7 +72,7 @@ static int test_flags = ~0;
static uint32_t num_map_entries;
static uint32_t inner_lru_hash_size;
static int lru_hash_lookup_test_entries = 32;
-static uint32_t max_cnt = 1000000;
+static uint32_t max_cnt = 10000;
static int check_test_flags(enum test_type t)
{
diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py
index dfb260de17a8..d5c389df6045 100755
--- a/scripts/bpf_doc.py
+++ b/scripts/bpf_doc.py
@@ -10,6 +10,9 @@ from __future__ import print_function
import argparse
import re
import sys, os
+import subprocess
+
+helpersDocStart = 'Start of BPF helper function descriptions:'
class NoHelperFound(BaseException):
pass
@@ -47,6 +50,10 @@ class Helper(APIElement):
@desc: textual description of the helper function
@ret: description of the return value of the helper function
"""
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.enum_val = None
+
def proto_break_down(self):
"""
Break down helper function protocol into smaller chunks: return type,
@@ -89,6 +96,7 @@ class HeaderParser(object):
self.commands = []
self.desc_unique_helpers = set()
self.define_unique_helpers = []
+ self.helper_enum_vals = {}
self.desc_syscalls = []
self.enum_syscalls = []
@@ -233,7 +241,7 @@ class HeaderParser(object):
self.enum_syscalls = re.findall('(BPF\w+)+', bpf_cmd_str)
def parse_desc_helpers(self):
- self.seek_to('* Start of BPF helper function descriptions:',
+ self.seek_to(helpersDocStart,
'Could not find start of eBPF helper descriptions list')
while True:
try:
@@ -245,30 +253,54 @@ class HeaderParser(object):
break
def parse_define_helpers(self):
- # Parse the number of FN(...) in #define __BPF_FUNC_MAPPER to compare
- # later with the number of unique function names present in description.
+ # Parse FN(...) in #define __BPF_FUNC_MAPPER to compare later with the
+ # number of unique function names present in description and use the
+ # correct enumeration value.
# Note: seek_to(..) discards the first line below the target search text,
# resulting in FN(unspec) being skipped and not added to self.define_unique_helpers.
self.seek_to('#define __BPF_FUNC_MAPPER(FN)',
'Could not find start of eBPF helper definition list')
- # Searches for either one or more FN(\w+) defines or a backslash for newline
- p = re.compile('\s*(FN\(\w+\))+|\\\\')
+ # Searches for one FN(\w+) define or a backslash for newline
+ p = re.compile('\s*FN\((\w+)\)|\\\\')
fn_defines_str = ''
+ i = 1 # 'unspec' is skipped as mentioned above
while True:
capture = p.match(self.line)
if capture:
fn_defines_str += self.line
+ self.helper_enum_vals[capture.expand(r'bpf_\1')] = i
+ i += 1
else:
break
self.line = self.reader.readline()
# Find the number of occurences of FN(\w+)
self.define_unique_helpers = re.findall('FN\(\w+\)', fn_defines_str)
+ def assign_helper_values(self):
+ seen_helpers = set()
+ for helper in self.helpers:
+ proto = helper.proto_break_down()
+ name = proto['name']
+ try:
+ enum_val = self.helper_enum_vals[name]
+ except KeyError:
+ raise Exception("Helper %s is missing from enum bpf_func_id" % name)
+
+ # Enforce current practice of having the descriptions ordered
+ # by enum value.
+ seen_helpers.add(name)
+ desc_val = len(seen_helpers)
+ if desc_val != enum_val:
+ raise Exception("Helper %s comment order (#%d) must be aligned with its position (#%d) in enum bpf_func_id" % (name, desc_val, enum_val))
+
+ helper.enum_val = enum_val
+
def run(self):
self.parse_desc_syscall()
self.parse_enum_syscall()
self.parse_desc_helpers()
self.parse_define_helpers()
+ self.assign_helper_values()
self.reader.close()
###############################################################################
@@ -357,6 +389,31 @@ class PrinterRST(Printer):
print('')
+ def get_kernel_version(self):
+ try:
+ version = subprocess.run(['git', 'describe'], cwd=linuxRoot,
+ capture_output=True, check=True)
+ version = version.stdout.decode().rstrip()
+ except:
+ try:
+ version = subprocess.run(['make', 'kernelversion'], cwd=linuxRoot,
+ capture_output=True, check=True)
+ version = version.stdout.decode().rstrip()
+ except:
+ return 'Linux'
+ return 'Linux {version}'.format(version=version)
+
+ def get_last_doc_update(self, delimiter):
+ try:
+ cmd = ['git', 'log', '-1', '--pretty=format:%cs', '--no-patch',
+ '-L',
+ '/{}/,/\*\//:include/uapi/linux/bpf.h'.format(delimiter)]
+ date = subprocess.run(cmd, cwd=linuxRoot,
+ capture_output=True, check=True)
+ return date.stdout.decode().rstrip()
+ except:
+ return ''
+
class PrinterHelpersRST(PrinterRST):
"""
A printer for dumping collected information about helpers as a ReStructured
@@ -378,6 +435,8 @@ list of eBPF helper functions
-------------------------------------------------------------------------------
:Manual section: 7
+:Version: {version}
+{date_field}{date}
DESCRIPTION
===========
@@ -410,8 +469,13 @@ kernel at the top).
HELPERS
=======
'''
+ kernelVersion = self.get_kernel_version()
+ lastUpdate = self.get_last_doc_update(helpersDocStart)
+
PrinterRST.print_license(self)
- print(header)
+ print(header.format(version=kernelVersion,
+ date_field = ':Date: ' if lastUpdate else '',
+ date=lastUpdate))
def print_footer(self):
footer = '''
@@ -761,7 +825,7 @@ class PrinterHelpers(Printer):
comma = ', '
print(one_arg, end='')
- print(') = (void *) %d;' % len(self.seen_helpers))
+ print(') = (void *) %d;' % helper.enum_val)
print('')
###############################################################################
diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c
index 125798b0bc5d..19924b6ce796 100644
--- a/tools/bpf/bpftool/btf_dumper.c
+++ b/tools/bpf/bpftool/btf_dumper.c
@@ -452,7 +452,7 @@ static int btf_dumper_int(const struct btf_type *t, __u8 bit_offset,
*(char *)data);
break;
case BTF_INT_BOOL:
- jsonw_bool(jw, *(int *)data);
+ jsonw_bool(jw, *(bool *)data);
break;
default:
/* shouldn't happen */
diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c
index 7a20931c3250..ef0dc2f8d5a2 100644
--- a/tools/bpf/bpftool/link.c
+++ b/tools/bpf/bpftool/link.c
@@ -83,6 +83,29 @@ static bool is_iter_map_target(const char *target_name)
strcmp(target_name, "bpf_sk_storage_map") == 0;
}
+static bool is_iter_cgroup_target(const char *target_name)
+{
+ return strcmp(target_name, "cgroup") == 0;
+}
+
+static const char *cgroup_order_string(__u32 order)
+{
+ switch (order) {
+ case BPF_CGROUP_ITER_ORDER_UNSPEC:
+ return "order_unspec";
+ case BPF_CGROUP_ITER_SELF_ONLY:
+ return "self_only";
+ case BPF_CGROUP_ITER_DESCENDANTS_PRE:
+ return "descendants_pre";
+ case BPF_CGROUP_ITER_DESCENDANTS_POST:
+ return "descendants_post";
+ case BPF_CGROUP_ITER_ANCESTORS_UP:
+ return "ancestors_up";
+ default: /* won't happen */
+ return "unknown";
+ }
+}
+
static void show_iter_json(struct bpf_link_info *info, json_writer_t *wtr)
{
const char *target_name = u64_to_ptr(info->iter.target_name);
@@ -91,6 +114,12 @@ static void show_iter_json(struct bpf_link_info *info, json_writer_t *wtr)
if (is_iter_map_target(target_name))
jsonw_uint_field(wtr, "map_id", info->iter.map.map_id);
+
+ if (is_iter_cgroup_target(target_name)) {
+ jsonw_lluint_field(wtr, "cgroup_id", info->iter.cgroup.cgroup_id);
+ jsonw_string_field(wtr, "order",
+ cgroup_order_string(info->iter.cgroup.order));
+ }
}
static int get_prog_info(int prog_id, struct bpf_prog_info *info)
@@ -208,6 +237,12 @@ static void show_iter_plain(struct bpf_link_info *info)
if (is_iter_map_target(target_name))
printf("map_id %u ", info->iter.map.map_id);
+
+ if (is_iter_cgroup_target(target_name)) {
+ printf("cgroup_id %llu ", info->iter.cgroup.cgroup_id);
+ printf("order %s ",
+ cgroup_order_string(info->iter.cgroup.order));
+ }
}
static int show_link_close_plain(int fd, struct bpf_link_info *info)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 1d6085e15fc8..793103b10eab 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -87,10 +87,29 @@ struct bpf_cgroup_storage_key {
__u32 attach_type; /* program attach type (enum bpf_attach_type) */
};
+enum bpf_cgroup_iter_order {
+ BPF_CGROUP_ITER_ORDER_UNSPEC = 0,
+ BPF_CGROUP_ITER_SELF_ONLY, /* process only a single object. */
+ BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */
+ BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */
+ BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */
+};
+
union bpf_iter_link_info {
struct {
__u32 map_fd;
} map;
+ struct {
+ enum bpf_cgroup_iter_order order;
+
+ /* At most one of cgroup_fd and cgroup_id can be non-zero. If
+ * both are zero, the walk starts from the default cgroup v2
+ * root. For walking v1 hierarchy, one should always explicitly
+ * specify cgroup_fd.
+ */
+ __u32 cgroup_fd;
+ __u64 cgroup_id;
+ } cgroup;
};
/* BPF syscall commands, see bpf(2) man-page for more details. */
@@ -4437,7 +4456,7 @@ union bpf_attr {
*
* **-EEXIST** if the option already exists.
*
- * **-EFAULT** on failrue to parse the existing header options.
+ * **-EFAULT** on failure to parse the existing header options.
*
* **-EPERM** if the helper cannot be used under the current
* *skops*\ **->op**.
@@ -4646,7 +4665,7 @@ union bpf_attr {
* a *map* with *task* as the **key**. From this
* perspective, the usage is not much different from
* **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this
- * helper enforces the key must be an task_struct and the map must also
+ * helper enforces the key must be a task_struct and the map must also
* be a **BPF_MAP_TYPE_TASK_STORAGE**.
*
* Underneath, the value is stored locally at *task* instead of
@@ -4704,7 +4723,7 @@ union bpf_attr {
*
* long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size)
* Description
- * Returns the stored IMA hash of the *inode* (if it's avaialable).
+ * Returns the stored IMA hash of the *inode* (if it's available).
* If the hash is larger than *size*, then only *size*
* bytes will be copied to *dst*
* Return
@@ -4728,12 +4747,12 @@ union bpf_attr {
*
* The argument *len_diff* can be used for querying with a planned
* size change. This allows to check MTU prior to changing packet
- * ctx. Providing an *len_diff* adjustment that is larger than the
+ * ctx. Providing a *len_diff* adjustment that is larger than the
* actual packet size (resulting in negative packet size) will in
- * principle not exceed the MTU, why it is not considered a
- * failure. Other BPF-helpers are needed for performing the
- * planned size change, why the responsability for catch a negative
- * packet size belong in those helpers.
+ * principle not exceed the MTU, which is why it is not considered
+ * a failure. Other BPF helpers are needed for performing the
+ * planned size change; therefore the responsibility for catching
+ * a negative packet size belongs in those helpers.
*
* Specifying *ifindex* zero means the MTU check is performed
* against the current net device. This is practical if this isn't
@@ -5085,17 +5104,29 @@ union bpf_attr {
*
* int bpf_get_retval(void)
* Description
- * Get the syscall's return value that will be returned to userspace.
+ * Get the BPF program's return value that will be returned to the upper layers.
*
- * This helper is currently supported by cgroup programs only.
+ * This helper is currently supported by cgroup programs and only by the hooks
+ * where BPF program's return value is returned to the userspace via errno.
* Return
- * The syscall's return value.
+ * The BPF program's return value.
*
* int bpf_set_retval(int retval)
* Description
- * Set the syscall's return value that will be returned to userspace.
+ * Set the BPF program's return value that will be returned to the upper layers.
+ *
+ * This helper is currently supported by cgroup programs and only by the hooks
+ * where BPF program's return value is returned to the userspace via errno.
+ *
+ * Note that there is the following corner case where the program exports an error
+ * via bpf_set_retval but signals success via 'return 1':
+ *
+ * bpf_set_retval(-EPERM);
+ * return 1;
+ *
+ * In this case, the BPF program's return value will use helper's -EPERM. This
+ * still holds true for cgroup/bind{4,6} which supports extra 'return 3' success case.
*
- * This helper is currently supported by cgroup programs only.
* Return
* 0 on success, or a negative error in case of failure.
*
@@ -5628,6 +5659,11 @@ enum {
BPF_F_SEQ_NUMBER = (1ULL << 3),
};
+/* BPF_FUNC_skb_get_tunnel_key flags. */
+enum {
+ BPF_F_TUNINFO_FLAGS = (1ULL << 4),
+};
+
/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and
* BPF_FUNC_perf_event_read_value flags.
*/
@@ -5817,7 +5853,10 @@ struct bpf_tunnel_key {
};
__u8 tunnel_tos;
__u8 tunnel_ttl;
- __u16 tunnel_ext; /* Padding, future use. */
+ union {
+ __u16 tunnel_ext; /* compat */
+ __be16 tunnel_flags;
+ };
__u32 tunnel_label;
union {
__u32 local_ipv4;
@@ -5861,6 +5900,11 @@ enum bpf_ret_code {
* represented by BPF_REDIRECT above).
*/
BPF_LWT_REROUTE = 128,
+ /* BPF_FLOW_DISSECTOR_CONTINUE: used by BPF_PROG_TYPE_FLOW_DISSECTOR
+ * to indicate that no custom dissection was performed, and
+ * fallback to standard dissector is requested.
+ */
+ BPF_FLOW_DISSECTOR_CONTINUE = 129,
};
struct bpf_sock {
@@ -6159,11 +6203,22 @@ struct bpf_link_info {
struct {
__aligned_u64 target_name; /* in/out: target_name buffer ptr */
__u32 target_name_len; /* in/out: target_name buffer len */
+
+ /* If the iter specific field is 32 bits, it can be put
+ * in the first or second union. Otherwise it should be
+ * put in the second union.
+ */
union {
struct {
__u32 map_id;
} map;
};
+ union {
+ struct {
+ __u64 cgroup_id;
+ __u32 order;
+ } cgroup;
+ };
} iter;
struct {
__u32 netns_ino;
diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
index 7349b16b8e2f..867b734839dd 100644
--- a/tools/lib/bpf/bpf_helpers.h
+++ b/tools/lib/bpf/bpf_helpers.h
@@ -131,7 +131,7 @@
/*
* Helper function to perform a tail call with a constant/immediate map slot.
*/
-#if __clang_major__ >= 8 && defined(__bpf__)
+#if (!defined(__clang__) || __clang_major__ >= 8) && defined(__bpf__)
static __always_inline void
bpf_tail_call_static(void *ctx, const void *map, const __u32 slot)
{
@@ -139,8 +139,8 @@ bpf_tail_call_static(void *ctx, const void *map, const __u32 slot)
__bpf_unreachable();
/*
- * Provide a hard guarantee that LLVM won't optimize setting r2 (map
- * pointer) and r3 (constant map index) from _different paths_ ending
+ * Provide a hard guarantee that the compiler won't optimize setting r2
+ * (map pointer) and r3 (constant map index) from _different paths_ ending
* up at the _same_ call insn as otherwise we won't be able to use the
* jmpq/nopl retpoline-free patching by the x86-64 JIT in the kernel
* given they mismatch. See also d2e4c1e6c294 ("bpf: Constant map key
@@ -148,12 +148,19 @@ bpf_tail_call_static(void *ctx, const void *map, const __u32 slot)
*
* Note on clobber list: we need to stay in-line with BPF calling
* convention, so even if we don't end up using r0, r4, r5, we need
- * to mark them as clobber so that LLVM doesn't end up using them
- * before / after the call.
+ * to mark them as clobber so that the compiler doesn't end up using
+ * them before / after the call.
*/
- asm volatile("r1 = %[ctx]\n\t"
+ asm volatile(
+#ifdef __clang__
+ "r1 = %[ctx]\n\t"
"r2 = %[map]\n\t"
"r3 = %[slot]\n\t"
+#else
+ "mov %%r1,%[ctx]\n\t"
+ "mov %%r2,%[map]\n\t"
+ "mov %%r3,%[slot]\n\t"
+#endif
"call 12"
:: [ctx]"r"(ctx), [map]"r"(map), [slot]"i"(slot)
: "r0", "r1", "r2", "r3", "r4", "r5");
diff --git a/tools/lib/bpf/skel_internal.h b/tools/lib/bpf/skel_internal.h
index 00c5f94b43be..1e82ab06c3eb 100644
--- a/tools/lib/bpf/skel_internal.h
+++ b/tools/lib/bpf/skel_internal.h
@@ -251,6 +251,29 @@ static inline int skel_map_update_elem(int fd, const void *key,
return skel_sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, attr_sz);
}
+static inline int skel_map_delete_elem(int fd, const void *key)
+{
+ const size_t attr_sz = offsetofend(union bpf_attr, flags);
+ union bpf_attr attr;
+
+ memset(&attr, 0, attr_sz);
+ attr.map_fd = fd;
+ attr.key = (long)key;
+
+ return skel_sys_bpf(BPF_MAP_DELETE_ELEM, &attr, attr_sz);
+}
+
+static inline int skel_map_get_fd_by_id(__u32 id)
+{
+ const size_t attr_sz = offsetofend(union bpf_attr, flags);
+ union bpf_attr attr;
+
+ memset(&attr, 0, attr_sz);
+ attr.map_id = id;
+
+ return skel_sys_bpf(BPF_MAP_GET_FD_BY_ID, &attr, attr_sz);
+}
+
static inline int skel_raw_tracepoint_open(const char *name, int prog_fd)
{
const size_t attr_sz = offsetofend(union bpf_attr, raw_tracepoint.prog_fd);
diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x
index 5cadfbdadf36..18fbb6eab1e2 100644
--- a/tools/testing/selftests/bpf/DENYLIST.s390x
+++ b/tools/testing/selftests/bpf/DENYLIST.s390x
@@ -66,3 +66,7 @@ select_reuseport # intermittently fails on new s390x set
xdp_synproxy # JIT does not support calling kernel function (kfunc)
unpriv_bpf_disabled # fentry
lru_bug # prog 'printk': failed to auto-attach: -524
+setget_sockopt # attach unexpected error: -524 (trampoline)
+cb_refs # expected error message unexpected error: -524 (trampoline)
+cgroup_hierarchical_stats # JIT does not support calling kernel function (kfunc)
+htab_update # failed to attach: ERROR: strerror_r(-524)=22 (trampoline)
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 8d59ec7f4c2d..c10adecb5a73 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -45,7 +45,7 @@ ifneq ($(BPF_GCC),)
TEST_GEN_PROGS += test_progs-bpf_gcc
endif
-TEST_GEN_FILES = test_lwt_ip_encap.o test_tc_edt.o
+TEST_GEN_FILES = test_lwt_ip_encap.bpf.o test_tc_edt.bpf.o
TEST_FILES = xsk_prereqs.sh $(wildcard progs/btf_dump_test_case_*.c)
# Order correspond to 'make run_tests' order
@@ -323,6 +323,7 @@ $(OUTPUT)/test_l4lb_noinline.o: BPF_CFLAGS += -fno-inline
$(OUTPUT)/test_xdp_noinline.o: BPF_CFLAGS += -fno-inline
$(OUTPUT)/flow_dissector_load.o: flow_dissector_load.h
+$(OUTPUT)/cgroup_getset_retval_hooks.o: cgroup_getset_retval_hooks.h
# Build BPF object using Clang
# $1 - input .c file
@@ -357,17 +358,17 @@ LSKELS := kfunc_call_test.c fentry_test.c fexit_test.c fexit_sleep.c \
LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test_subprog.c
SKEL_BLACKLIST += $$(LSKELS)
-test_static_linked.skel.h-deps := test_static_linked1.o test_static_linked2.o
-linked_funcs.skel.h-deps := linked_funcs1.o linked_funcs2.o
-linked_vars.skel.h-deps := linked_vars1.o linked_vars2.o
-linked_maps.skel.h-deps := linked_maps1.o linked_maps2.o
+test_static_linked.skel.h-deps := test_static_linked1.bpf.o test_static_linked2.bpf.o
+linked_funcs.skel.h-deps := linked_funcs1.bpf.o linked_funcs2.bpf.o
+linked_vars.skel.h-deps := linked_vars1.bpf.o linked_vars2.bpf.o
+linked_maps.skel.h-deps := linked_maps1.bpf.o linked_maps2.bpf.o
# In the subskeleton case, we want the test_subskeleton_lib.subskel.h file
# but that's created as a side-effect of the skel.h generation.
-test_subskeleton.skel.h-deps := test_subskeleton_lib2.o test_subskeleton_lib.o test_subskeleton.o
-test_subskeleton_lib.skel.h-deps := test_subskeleton_lib2.o test_subskeleton_lib.o
-test_usdt.skel.h-deps := test_usdt.o test_usdt_multispec.o
+test_subskeleton.skel.h-deps := test_subskeleton_lib2.bpf.o test_subskeleton_lib.bpf.o test_subskeleton.bpf.o
+test_subskeleton_lib.skel.h-deps := test_subskeleton_lib2.bpf.o test_subskeleton_lib.bpf.o
+test_usdt.skel.h-deps := test_usdt.bpf.o test_usdt_multispec.bpf.o
-LINKED_BPF_SRCS := $(patsubst %.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps)))
+LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps)))
# Set up extra TRUNNER_XXX "temporary" variables in the environment (relies on
# $eval()) and pass control to DEFINE_TEST_RUNNER_RULES.
@@ -385,7 +386,7 @@ TRUNNER_EXTRA_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, \
TRUNNER_EXTRA_HDRS := $$(filter %.h,$(TRUNNER_EXTRA_SOURCES))
TRUNNER_TESTS_HDR := $(TRUNNER_TESTS_DIR)/tests.h
TRUNNER_BPF_SRCS := $$(notdir $$(wildcard $(TRUNNER_BPF_PROGS_DIR)/*.c))
-TRUNNER_BPF_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, $$(TRUNNER_BPF_SRCS))
+TRUNNER_BPF_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.bpf.o, $$(TRUNNER_BPF_SRCS))
TRUNNER_BPF_SKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.skel.h, \
$$(filter-out $(SKEL_BLACKLIST) $(LINKED_BPF_SRCS),\
$$(TRUNNER_BPF_SRCS)))
@@ -415,7 +416,7 @@ endif
# input/output directory combination
ifeq ($($(TRUNNER_BPF_PROGS_DIR)$(if $2,-)$2-bpfobjs),)
$(TRUNNER_BPF_PROGS_DIR)$(if $2,-)$2-bpfobjs := y
-$(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \
+$(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.bpf.o: \
$(TRUNNER_BPF_PROGS_DIR)/%.c \
$(TRUNNER_BPF_PROGS_DIR)/*.h \
$$(INCLUDE_DIR)/vmlinux.h \
@@ -425,25 +426,25 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \
$$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \
$(TRUNNER_BPF_CFLAGS))
-$(TRUNNER_BPF_SKELS): %.skel.h: %.o $(BPFTOOL) | $(TRUNNER_OUTPUT)
+$(TRUNNER_BPF_SKELS): %.skel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT)
$$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@)
$(Q)$$(BPFTOOL) gen object $$(<:.o=.linked1.o) $$<
$(Q)$$(BPFTOOL) gen object $$(<:.o=.linked2.o) $$(<:.o=.linked1.o)
$(Q)$$(BPFTOOL) gen object $$(<:.o=.linked3.o) $$(<:.o=.linked2.o)
$(Q)diff $$(<:.o=.linked2.o) $$(<:.o=.linked3.o)
- $(Q)$$(BPFTOOL) gen skeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.o=)) > $$@
- $(Q)$$(BPFTOOL) gen subskeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.o=)) > $$(@:.skel.h=.subskel.h)
+ $(Q)$$(BPFTOOL) gen skeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.bpf.o=)) > $$@
+ $(Q)$$(BPFTOOL) gen subskeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.bpf.o=)) > $$(@:.skel.h=.subskel.h)
-$(TRUNNER_BPF_LSKELS): %.lskel.h: %.o $(BPFTOOL) | $(TRUNNER_OUTPUT)
+$(TRUNNER_BPF_LSKELS): %.lskel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT)
$$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@)
$(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked1.o) $$<
$(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked2.o) $$(<:.o=.llinked1.o)
$(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked3.o) $$(<:.o=.llinked2.o)
$(Q)diff $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o)
- $(Q)$$(BPFTOOL) gen skeleton -L $$(<:.o=.llinked3.o) name $$(notdir $$(<:.o=_lskel)) > $$@
+ $(Q)$$(BPFTOOL) gen skeleton -L $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@
$(TRUNNER_BPF_SKELS_LINKED): $(TRUNNER_BPF_OBJS) $(BPFTOOL) | $(TRUNNER_OUTPUT)
- $$(call msg,LINK-BPF,$(TRUNNER_BINARY),$$(@:.skel.h=.o))
+ $$(call msg,LINK-BPF,$(TRUNNER_BINARY),$$(@:.skel.h=.bpf.o))
$(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked1.o) $$(addprefix $(TRUNNER_OUTPUT)/,$$($$(@F)-deps))
$(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked1.o)
$(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked3.o) $$(@:.skel.h=.linked2.o)
@@ -499,7 +500,7 @@ $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \
| $(TRUNNER_BINARY)-extras
$$(call msg,BINARY,,$$@)
$(Q)$$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) -o $$@
- $(Q)$(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.o $$@
+ $(Q)$(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@
$(Q)ln -sf $(if $2,..,.)/tools/build/bpftool/bootstrap/bpftool $(if $2,$2/)bpftool
endef
diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst
index eb1b7541f39d..d3c6b3da0bb1 100644
--- a/tools/testing/selftests/bpf/README.rst
+++ b/tools/testing/selftests/bpf/README.rst
@@ -126,11 +126,11 @@ available in 10.0.1. The patch is available in llvm 11.0.0 trunk.
__ https://reviews.llvm.org/D78466
-bpf_verif_scale/loop6.o test failure with Clang 12
-==================================================
+bpf_verif_scale/loop6.bpf.o test failure with Clang 12
+======================================================
With Clang 12, the following bpf_verif_scale test failed:
- * ``bpf_verif_scale/loop6.o``
+ * ``bpf_verif_scale/loop6.bpf.o``
The verifier output looks like
@@ -245,7 +245,7 @@ See `kernel llvm reloc`_ for more explanation and some examples.
Using clang 13 to compile old libbpf which has static linker support,
there will be a compilation failure::
- libbpf: ELF relo #0 in section #6 has unexpected type 2 in .../bpf_tcp_nogpl.o
+ libbpf: ELF relo #0 in section #6 has unexpected type 2 in .../bpf_tcp_nogpl.bpf.o
Here, ``type 2`` refers to new relocation type ``R_BPF_64_ABS64``.
To fix this issue, user newer libbpf.
diff --git a/tools/testing/selftests/bpf/cgroup_getset_retval_hooks.h b/tools/testing/selftests/bpf/cgroup_getset_retval_hooks.h
new file mode 100644
index 000000000000..a525d3544fd7
--- /dev/null
+++ b/tools/testing/selftests/bpf/cgroup_getset_retval_hooks.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+BPF_RETVAL_HOOK(ingress, "cgroup_skb/ingress", __sk_buff, -EINVAL)
+BPF_RETVAL_HOOK(egress, "cgroup_skb/egress", __sk_buff, -EINVAL)
+BPF_RETVAL_HOOK(sock_create, "cgroup/sock_create", bpf_sock, 0)
+BPF_RETVAL_HOOK(sock_ops, "sockops", bpf_sock_ops, -EINVAL)
+BPF_RETVAL_HOOK(dev, "cgroup/dev", bpf_cgroup_dev_ctx, 0)
+BPF_RETVAL_HOOK(bind4, "cgroup/bind4", bpf_sock_addr, 0)
+BPF_RETVAL_HOOK(bind6, "cgroup/bind6", bpf_sock_addr, 0)
+BPF_RETVAL_HOOK(connect4, "cgroup/connect4", bpf_sock_addr, 0)
+BPF_RETVAL_HOOK(connect6, "cgroup/connect6", bpf_sock_addr, 0)
+BPF_RETVAL_HOOK(post_bind4, "cgroup/post_bind4", bpf_sock_addr, 0)
+BPF_RETVAL_HOOK(post_bind6, "cgroup/post_bind6", bpf_sock_addr, 0)
+BPF_RETVAL_HOOK(sendmsg4, "cgroup/sendmsg4", bpf_sock_addr, 0)
+BPF_RETVAL_HOOK(sendmsg6, "cgroup/sendmsg6", bpf_sock_addr, 0)
+BPF_RETVAL_HOOK(sysctl, "cgroup/sysctl", bpf_sysctl, 0)
+BPF_RETVAL_HOOK(recvmsg4, "cgroup/recvmsg4", bpf_sock_addr, -EINVAL)
+BPF_RETVAL_HOOK(recvmsg6, "cgroup/recvmsg6", bpf_sock_addr, -EINVAL)
+BPF_RETVAL_HOOK(getsockopt, "cgroup/getsockopt", bpf_sockopt, 0)
+BPF_RETVAL_HOOK(setsockopt, "cgroup/setsockopt", bpf_sockopt, 0)
+BPF_RETVAL_HOOK(getpeername4, "cgroup/getpeername4", bpf_sock_addr, -EINVAL)
+BPF_RETVAL_HOOK(getpeername6, "cgroup/getpeername6", bpf_sock_addr, -EINVAL)
+BPF_RETVAL_HOOK(getsockname4, "cgroup/getsockname4", bpf_sock_addr, -EINVAL)
+BPF_RETVAL_HOOK(getsockname6, "cgroup/getsockname6", bpf_sock_addr, -EINVAL)
+BPF_RETVAL_HOOK(sock_release, "cgroup/sock_release", bpf_sock, 0)
diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c
index 9d59c3990ca8..e914cc45b766 100644
--- a/tools/testing/selftests/bpf/cgroup_helpers.c
+++ b/tools/testing/selftests/bpf/cgroup_helpers.c
@@ -33,49 +33,52 @@
#define CGROUP_MOUNT_DFLT "/sys/fs/cgroup"
#define NETCLS_MOUNT_PATH CGROUP_MOUNT_DFLT "/net_cls"
#define CGROUP_WORK_DIR "/cgroup-test-work-dir"
-#define format_cgroup_path(buf, path) \
+
+#define format_cgroup_path_pid(buf, path, pid) \
snprintf(buf, sizeof(buf), "%s%s%d%s", CGROUP_MOUNT_PATH, \
- CGROUP_WORK_DIR, getpid(), path)
+ CGROUP_WORK_DIR, pid, path)
+
+#define format_cgroup_path(buf, path) \
+ format_cgroup_path_pid(buf, path, getpid())
+
+#define format_parent_cgroup_path(buf, path) \
+ format_cgroup_path_pid(buf, path, getppid())
#define format_classid_path(buf) \
snprintf(buf, sizeof(buf), "%s%s", NETCLS_MOUNT_PATH, \
CGROUP_WORK_DIR)
-/**
- * enable_all_controllers() - Enable all available cgroup v2 controllers
- *
- * Enable all available cgroup v2 controllers in order to increase
- * the code coverage.
- *
- * If successful, 0 is returned.
- */
-static int enable_all_controllers(char *cgroup_path)
+static int __enable_controllers(const char *cgroup_path, const char *controllers)
{
char path[PATH_MAX + 1];
- char buf[PATH_MAX];
+ char enable[PATH_MAX + 1];
char *c, *c2;
int fd, cfd;
ssize_t len;
- snprintf(path, sizeof(path), "%s/cgroup.controllers", cgroup_path);
- fd = open(path, O_RDONLY);
- if (fd < 0) {
- log_err("Opening cgroup.controllers: %s", path);
- return 1;
- }
-
- len = read(fd, buf, sizeof(buf) - 1);
- if (len < 0) {
+ /* If not controllers are passed, enable all available controllers */
+ if (!controllers) {
+ snprintf(path, sizeof(path), "%s/cgroup.controllers",
+ cgroup_path);
+ fd = open(path, O_RDONLY);
+ if (fd < 0) {
+ log_err("Opening cgroup.controllers: %s", path);
+ return 1;
+ }
+ len = read(fd, enable, sizeof(enable) - 1);
+ if (len < 0) {
+ close(fd);
+ log_err("Reading cgroup.controllers: %s", path);
+ return 1;
+ } else if (len == 0) { /* No controllers to enable */
+ close(fd);
+ return 0;
+ }
+ enable[len] = 0;
close(fd);
- log_err("Reading cgroup.controllers: %s", path);
- return 1;
+ } else {
+ strncpy(enable, controllers, sizeof(enable));
}
- buf[len] = 0;
- close(fd);
-
- /* No controllers available? We're probably on cgroup v1. */
- if (len == 0)
- return 0;
snprintf(path, sizeof(path), "%s/cgroup.subtree_control", cgroup_path);
cfd = open(path, O_RDWR);
@@ -84,7 +87,7 @@ static int enable_all_controllers(char *cgroup_path)
return 1;
}
- for (c = strtok_r(buf, " ", &c2); c; c = strtok_r(NULL, " ", &c2)) {
+ for (c = strtok_r(enable, " ", &c2); c; c = strtok_r(NULL, " ", &c2)) {
if (dprintf(cfd, "+%s\n", c) <= 0) {
log_err("Enabling controller %s: %s", c, path);
close(cfd);
@@ -96,6 +99,87 @@ static int enable_all_controllers(char *cgroup_path)
}
/**
+ * enable_controllers() - Enable cgroup v2 controllers
+ * @relative_path: The cgroup path, relative to the workdir
+ * @controllers: List of controllers to enable in cgroup.controllers format
+ *
+ *
+ * Enable given cgroup v2 controllers, if @controllers is NULL, enable all
+ * available controllers.
+ *
+ * If successful, 0 is returned.
+ */
+int enable_controllers(const char *relative_path, const char *controllers)
+{
+ char cgroup_path[PATH_MAX + 1];
+
+ format_cgroup_path(cgroup_path, relative_path);
+ return __enable_controllers(cgroup_path, controllers);
+}
+
+static int __write_cgroup_file(const char *cgroup_path, const char *file,
+ const char *buf)
+{
+ char file_path[PATH_MAX + 1];
+ int fd;
+
+ snprintf(file_path, sizeof(file_path), "%s/%s", cgroup_path, file);
+ fd = open(file_path, O_RDWR);
+ if (fd < 0) {
+ log_err("Opening %s", file_path);
+ return 1;
+ }
+
+ if (dprintf(fd, "%s", buf) <= 0) {
+ log_err("Writing to %s", file_path);
+ close(fd);
+ return 1;
+ }
+ close(fd);
+ return 0;
+}
+
+/**
+ * write_cgroup_file() - Write to a cgroup file
+ * @relative_path: The cgroup path, relative to the workdir
+ * @file: The name of the file in cgroupfs to write to
+ * @buf: Buffer to write to the file
+ *
+ * Write to a file in the given cgroup's directory.
+ *
+ * If successful, 0 is returned.
+ */
+int write_cgroup_file(const char *relative_path, const char *file,
+ const char *buf)
+{
+ char cgroup_path[PATH_MAX - 24];
+
+ format_cgroup_path(cgroup_path, relative_path);
+ return __write_cgroup_file(cgroup_path, file, buf);
+}
+
+/**
+ * write_cgroup_file_parent() - Write to a cgroup file in the parent process
+ * workdir
+ * @relative_path: The cgroup path, relative to the parent process workdir
+ * @file: The name of the file in cgroupfs to write to
+ * @buf: Buffer to write to the file
+ *
+ * Write to a file in the given cgroup's directory under the parent process
+ * workdir.
+ *
+ * If successful, 0 is returned.
+ */
+int write_cgroup_file_parent(const char *relative_path, const char *file,
+ const char *buf)
+{
+ char cgroup_path[PATH_MAX - 24];
+
+ format_parent_cgroup_path(cgroup_path, relative_path);
+ return __write_cgroup_file(cgroup_path, file, buf);
+}
+
+/**
* setup_cgroup_environment() - Setup the cgroup environment
*
* After calling this function, cleanup_cgroup_environment should be called
@@ -133,7 +217,9 @@ int setup_cgroup_environment(void)
return 1;
}
- if (enable_all_controllers(cgroup_workdir))
+ /* Enable all available controllers to increase test coverage */
+ if (__enable_controllers(CGROUP_MOUNT_PATH, NULL) ||
+ __enable_controllers(cgroup_workdir, NULL))
return 1;
return 0;
@@ -173,7 +259,7 @@ static int join_cgroup_from_top(const char *cgroup_path)
/**
* join_cgroup() - Join a cgroup
- * @path: The cgroup path, relative to the workdir, to join
+ * @relative_path: The cgroup path, relative to the workdir, to join
*
* This function expects a cgroup to already be created, relative to the cgroup
* work dir, and it joins it. For example, passing "/my-cgroup" as the path
@@ -182,11 +268,27 @@ static int join_cgroup_from_top(const char *cgroup_path)
*
* On success, it returns 0, otherwise on failure it returns 1.
*/
-int join_cgroup(const char *path)
+int join_cgroup(const char *relative_path)
+{
+ char cgroup_path[PATH_MAX + 1];
+
+ format_cgroup_path(cgroup_path, relative_path);
+ return join_cgroup_from_top(cgroup_path);
+}
+
+/**
+ * join_parent_cgroup() - Join a cgroup in the parent process workdir
+ * @relative_path: The cgroup path, relative to parent process workdir, to join
+ *
+ * See join_cgroup().
+ *
+ * On success, it returns 0, otherwise on failure it returns 1.
+ */
+int join_parent_cgroup(const char *relative_path)
{
char cgroup_path[PATH_MAX + 1];
- format_cgroup_path(cgroup_path, path);
+ format_parent_cgroup_path(cgroup_path, relative_path);
return join_cgroup_from_top(cgroup_path);
}
@@ -213,8 +315,26 @@ void cleanup_cgroup_environment(void)
}
/**
+ * get_root_cgroup() - Get the FD of the root cgroup
+ *
+ * On success, it returns the file descriptor. On failure, it returns -1.
+ * If there is a failure, it prints the error to stderr.
+ */
+int get_root_cgroup(void)
+{
+ int fd;
+
+ fd = open(CGROUP_MOUNT_PATH, O_RDONLY);
+ if (fd < 0) {
+ log_err("Opening root cgroup");
+ return -1;
+ }
+ return fd;
+}
+
+/**
* create_and_get_cgroup() - Create a cgroup, relative to workdir, and get the FD
- * @path: The cgroup path, relative to the workdir, to join
+ * @relative_path: The cgroup path, relative to the workdir, to join
*
* This function creates a cgroup under the top level workdir and returns the
* file descriptor. It is idempotent.
@@ -222,14 +342,14 @@ void cleanup_cgroup_environment(void)
* On success, it returns the file descriptor. On failure it returns -1.
* If there is a failure, it prints the error to stderr.
*/
-int create_and_get_cgroup(const char *path)
+int create_and_get_cgroup(const char *relative_path)
{
char cgroup_path[PATH_MAX + 1];
int fd;
- format_cgroup_path(cgroup_path, path);
+ format_cgroup_path(cgroup_path, relative_path);
if (mkdir(cgroup_path, 0777) && errno != EEXIST) {
- log_err("mkdiring cgroup %s .. %s", path, cgroup_path);
+ log_err("mkdiring cgroup %s .. %s", relative_path, cgroup_path);
return -1;
}
@@ -244,13 +364,13 @@ int create_and_get_cgroup(const char *path)
/**
* get_cgroup_id() - Get cgroup id for a particular cgroup path
- * @path: The cgroup path, relative to the workdir, to join
+ * @relative_path: The cgroup path, relative to the workdir, to join
*
* On success, it returns the cgroup id. On failure it returns 0,
* which is an invalid cgroup id.
* If there is a failure, it prints the error to stderr.
*/
-unsigned long long get_cgroup_id(const char *path)
+unsigned long long get_cgroup_id(const char *relative_path)
{
int dirfd, err, flags, mount_id, fhsize;
union {
@@ -261,7 +381,7 @@ unsigned long long get_cgroup_id(const char *path)
struct file_handle *fhp, *fhp2;
unsigned long long ret = 0;
- format_cgroup_path(cgroup_workdir, path);
+ format_cgroup_path(cgroup_workdir, relative_path);
dirfd = AT_FDCWD;
flags = 0;
diff --git a/tools/testing/selftests/bpf/cgroup_helpers.h b/tools/testing/selftests/bpf/cgroup_helpers.h
index fcc9cb91b211..3358734356ab 100644
--- a/tools/testing/selftests/bpf/cgroup_helpers.h
+++ b/tools/testing/selftests/bpf/cgroup_helpers.h
@@ -10,11 +10,18 @@
__FILE__, __LINE__, clean_errno(), ##__VA_ARGS__)
/* cgroupv2 related */
-int cgroup_setup_and_join(const char *path);
-int create_and_get_cgroup(const char *path);
-unsigned long long get_cgroup_id(const char *path);
-
-int join_cgroup(const char *path);
+int enable_controllers(const char *relative_path, const char *controllers);
+int write_cgroup_file(const char *relative_path, const char *file,
+ const char *buf);
+int write_cgroup_file_parent(const char *relative_path, const char *file,
+ const char *buf);
+int cgroup_setup_and_join(const char *relative_path);
+int get_root_cgroup(void);
+int create_and_get_cgroup(const char *relative_path);
+unsigned long long get_cgroup_id(const char *relative_path);
+
+int join_cgroup(const char *relative_path);
+int join_parent_cgroup(const char *relative_path);
int setup_cgroup_environment(void);
void cleanup_cgroup_environment(void);
@@ -26,4 +33,4 @@ int join_classid(void);
int setup_classid_environment(void);
void cleanup_classid_environment(void);
-#endif /* __CGROUP_HELPERS_H */ \ No newline at end of file
+#endif /* __CGROUP_HELPERS_H */
diff --git a/tools/testing/selftests/bpf/get_cgroup_id_user.c b/tools/testing/selftests/bpf/get_cgroup_id_user.c
index e021cc67dc02..156743cf5870 100644
--- a/tools/testing/selftests/bpf/get_cgroup_id_user.c
+++ b/tools/testing/selftests/bpf/get_cgroup_id_user.c
@@ -48,7 +48,7 @@ static int bpf_find_map(const char *test, struct bpf_object *obj,
int main(int argc, char **argv)
{
const char *probe_name = "syscalls/sys_enter_nanosleep";
- const char *file = "get_cgroup_id_kern.o";
+ const char *file = "get_cgroup_id_kern.bpf.o";
int err, bytes, efd, prog_fd, pmu_fd;
int cgroup_fd, cgidmap_fd, pidmap_fd;
struct perf_event_attr attr = {};
diff --git a/tools/testing/selftests/bpf/map_tests/task_storage_map.c b/tools/testing/selftests/bpf/map_tests/task_storage_map.c
new file mode 100644
index 000000000000..1adc9c292eb2
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/task_storage_map.c
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2022. Huawei Technologies Co., Ltd */
+#define _GNU_SOURCE
+#include <sched.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <errno.h>
+#include <string.h>
+#include <pthread.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "test_maps.h"
+#include "task_local_storage_helpers.h"
+#include "read_bpf_task_storage_busy.skel.h"
+
+struct lookup_ctx {
+ bool start;
+ bool stop;
+ int pid_fd;
+ int map_fd;
+ int loop;
+};
+
+static void *lookup_fn(void *arg)
+{
+ struct lookup_ctx *ctx = arg;
+ long value;
+ int i = 0;
+
+ while (!ctx->start)
+ usleep(1);
+
+ while (!ctx->stop && i++ < ctx->loop)
+ bpf_map_lookup_elem(ctx->map_fd, &ctx->pid_fd, &value);
+ return NULL;
+}
+
+static void abort_lookup(struct lookup_ctx *ctx, pthread_t *tids, unsigned int nr)
+{
+ unsigned int i;
+
+ ctx->stop = true;
+ ctx->start = true;
+ for (i = 0; i < nr; i++)
+ pthread_join(tids[i], NULL);
+}
+
+void test_task_storage_map_stress_lookup(void)
+{
+#define MAX_NR_THREAD 4096
+ unsigned int i, nr = 256, loop = 8192, cpu = 0;
+ struct read_bpf_task_storage_busy *skel;
+ pthread_t tids[MAX_NR_THREAD];
+ struct lookup_ctx ctx;
+ cpu_set_t old, new;
+ const char *cfg;
+ int err;
+
+ cfg = getenv("TASK_STORAGE_MAP_NR_THREAD");
+ if (cfg) {
+ nr = atoi(cfg);
+ if (nr > MAX_NR_THREAD)
+ nr = MAX_NR_THREAD;
+ }
+ cfg = getenv("TASK_STORAGE_MAP_NR_LOOP");
+ if (cfg)
+ loop = atoi(cfg);
+ cfg = getenv("TASK_STORAGE_MAP_PIN_CPU");
+ if (cfg)
+ cpu = atoi(cfg);
+
+ skel = read_bpf_task_storage_busy__open_and_load();
+ err = libbpf_get_error(skel);
+ CHECK(err, "open_and_load", "error %d\n", err);
+
+ /* Only for a fully preemptible kernel */
+ if (!skel->kconfig->CONFIG_PREEMPT)
+ return;
+
+ /* Save the old affinity setting */
+ sched_getaffinity(getpid(), sizeof(old), &old);
+
+ /* Pinned on a specific CPU */
+ CPU_ZERO(&new);
+ CPU_SET(cpu, &new);
+ sched_setaffinity(getpid(), sizeof(new), &new);
+
+ ctx.start = false;
+ ctx.stop = false;
+ ctx.pid_fd = sys_pidfd_open(getpid(), 0);
+ ctx.map_fd = bpf_map__fd(skel->maps.task);
+ ctx.loop = loop;
+ for (i = 0; i < nr; i++) {
+ err = pthread_create(&tids[i], NULL, lookup_fn, &ctx);
+ if (err) {
+ abort_lookup(&ctx, tids, i);
+ CHECK(err, "pthread_create", "error %d\n", err);
+ goto out;
+ }
+ }
+
+ ctx.start = true;
+ for (i = 0; i < nr; i++)
+ pthread_join(tids[i], NULL);
+
+ skel->bss->pid = getpid();
+ err = read_bpf_task_storage_busy__attach(skel);
+ CHECK(err, "attach", "error %d\n", err);
+
+ /* Trigger program */
+ syscall(SYS_gettid);
+ skel->bss->pid = 0;
+
+ CHECK(skel->bss->busy != 0, "bad bpf_task_storage_busy", "got %d\n", skel->bss->busy);
+out:
+ read_bpf_task_storage_busy__destroy(skel);
+ /* Restore affinity setting */
+ sched_setaffinity(getpid(), sizeof(old), &old);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c
index dbe56fa8582d..e1c1e521cca2 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c
@@ -7,7 +7,7 @@ void serial_test_bpf_obj_id(void)
{
const __u64 array_magic_value = 0xfaceb00c;
const __u32 array_key = 0;
- const char *file = "./test_obj_id.o";
+ const char *file = "./test_obj_id.bpf.o";
const char *expected_prog_name = "test_obj_id";
const char *expected_map_name = "test_map_id";
const __u64 nsec_per_sec = 1000000000;
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
index ff6cce9fef06..5ca252823294 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
@@ -75,45 +75,45 @@ static void scale_test(const char *file,
void test_verif_scale1()
{
- scale_test("test_verif_scale1.o", BPF_PROG_TYPE_SCHED_CLS, false);
+ scale_test("test_verif_scale1.bpf.o", BPF_PROG_TYPE_SCHED_CLS, false);
}
void test_verif_scale2()
{
- scale_test("test_verif_scale2.o", BPF_PROG_TYPE_SCHED_CLS, false);
+ scale_test("test_verif_scale2.bpf.o", BPF_PROG_TYPE_SCHED_CLS, false);
}
void test_verif_scale3()
{
- scale_test("test_verif_scale3.o", BPF_PROG_TYPE_SCHED_CLS, false);
+ scale_test("test_verif_scale3.bpf.o", BPF_PROG_TYPE_SCHED_CLS, false);
}
void test_verif_scale_pyperf_global()
{
- scale_test("pyperf_global.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
+ scale_test("pyperf_global.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
}
void test_verif_scale_pyperf_subprogs()
{
- scale_test("pyperf_subprogs.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
+ scale_test("pyperf_subprogs.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
}
void test_verif_scale_pyperf50()
{
/* full unroll by llvm */
- scale_test("pyperf50.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
+ scale_test("pyperf50.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
}
void test_verif_scale_pyperf100()
{
/* full unroll by llvm */
- scale_test("pyperf100.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
+ scale_test("pyperf100.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
}
void test_verif_scale_pyperf180()
{
/* full unroll by llvm */
- scale_test("pyperf180.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
+ scale_test("pyperf180.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
}
void test_verif_scale_pyperf600()
@@ -124,13 +124,13 @@ void test_verif_scale_pyperf600()
* 16k insns in loop body.
* Total of 5 such loops. Total program size ~82k insns.
*/
- scale_test("pyperf600.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
+ scale_test("pyperf600.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
}
void test_verif_scale_pyperf600_bpf_loop(void)
{
/* use the bpf_loop helper*/
- scale_test("pyperf600_bpf_loop.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
+ scale_test("pyperf600_bpf_loop.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
}
void test_verif_scale_pyperf600_nounroll()
@@ -141,37 +141,37 @@ void test_verif_scale_pyperf600_nounroll()
* ~110 insns in loop body.
* Total of 5 such loops. Total program size ~1500 insns.
*/
- scale_test("pyperf600_nounroll.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
+ scale_test("pyperf600_nounroll.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
}
void test_verif_scale_loop1()
{
- scale_test("loop1.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
+ scale_test("loop1.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
}
void test_verif_scale_loop2()
{
- scale_test("loop2.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
+ scale_test("loop2.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
}
void test_verif_scale_loop3_fail()
{
- scale_test("loop3.o", BPF_PROG_TYPE_RAW_TRACEPOINT, true /* fails */);
+ scale_test("loop3.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, true /* fails */);
}
void test_verif_scale_loop4()
{
- scale_test("loop4.o", BPF_PROG_TYPE_SCHED_CLS, false);
+ scale_test("loop4.bpf.o", BPF_PROG_TYPE_SCHED_CLS, false);
}
void test_verif_scale_loop5()
{
- scale_test("loop5.o", BPF_PROG_TYPE_SCHED_CLS, false);
+ scale_test("loop5.bpf.o", BPF_PROG_TYPE_SCHED_CLS, false);
}
void test_verif_scale_loop6()
{
- scale_test("loop6.o", BPF_PROG_TYPE_KPROBE, false);
+ scale_test("loop6.bpf.o", BPF_PROG_TYPE_KPROBE, false);
}
void test_verif_scale_strobemeta()
@@ -180,54 +180,54 @@ void test_verif_scale_strobemeta()
* Total program size 20.8k insn.
* ~350k processed_insns
*/
- scale_test("strobemeta.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
+ scale_test("strobemeta.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
}
void test_verif_scale_strobemeta_bpf_loop(void)
{
/* use the bpf_loop helper*/
- scale_test("strobemeta_bpf_loop.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
+ scale_test("strobemeta_bpf_loop.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
}
void test_verif_scale_strobemeta_nounroll1()
{
/* no unroll, tiny loops */
- scale_test("strobemeta_nounroll1.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
+ scale_test("strobemeta_nounroll1.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
}
void test_verif_scale_strobemeta_nounroll2()
{
/* no unroll, tiny loops */
- scale_test("strobemeta_nounroll2.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
+ scale_test("strobemeta_nounroll2.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
}
void test_verif_scale_strobemeta_subprogs()
{
/* non-inlined subprogs */
- scale_test("strobemeta_subprogs.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
+ scale_test("strobemeta_subprogs.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
}
void test_verif_scale_sysctl_loop1()
{
- scale_test("test_sysctl_loop1.o", BPF_PROG_TYPE_CGROUP_SYSCTL, false);
+ scale_test("test_sysctl_loop1.bpf.o", BPF_PROG_TYPE_CGROUP_SYSCTL, false);
}
void test_verif_scale_sysctl_loop2()
{
- scale_test("test_sysctl_loop2.o", BPF_PROG_TYPE_CGROUP_SYSCTL, false);
+ scale_test("test_sysctl_loop2.bpf.o", BPF_PROG_TYPE_CGROUP_SYSCTL, false);
}
void test_verif_scale_xdp_loop()
{
- scale_test("test_xdp_loop.o", BPF_PROG_TYPE_XDP, false);
+ scale_test("test_xdp_loop.bpf.o", BPF_PROG_TYPE_XDP, false);
}
void test_verif_scale_seg6_loop()
{
- scale_test("test_seg6_loop.o", BPF_PROG_TYPE_LWT_SEG6LOCAL, false);
+ scale_test("test_seg6_loop.bpf.o", BPF_PROG_TYPE_LWT_SEG6LOCAL, false);
}
void test_verif_twfw()
{
- scale_test("twfw.o", BPF_PROG_TYPE_CGROUP_SKB, false);
+ scale_test("twfw.bpf.o", BPF_PROG_TYPE_CGROUP_SKB, false);
}
diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c
index ef6528b8084c..127b8caa3dc1 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf.c
@@ -4651,8 +4651,8 @@ struct btf_file_test {
};
static struct btf_file_test file_tests[] = {
- { .file = "test_btf_newkv.o", },
- { .file = "test_btf_nokv.o", .btf_kv_notfound = true, },
+ { .file = "test_btf_newkv.bpf.o", },
+ { .file = "test_btf_nokv.bpf.o", .btf_kv_notfound = true, },
};
static void do_test_file(unsigned int test_num)
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
index 5fce7008d1ff..b1ca954ed1e5 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
@@ -52,7 +52,7 @@ static int test_btf_dump_case(int n, struct btf_dump_test_case *t)
int err = 0, fd = -1;
FILE *f = NULL;
- snprintf(test_file, sizeof(test_file), "%s.o", t->file);
+ snprintf(test_file, sizeof(test_file), "%s.bpf.o", t->file);
btf = btf__parse_elf(test_file, NULL);
if (!ASSERT_OK_PTR(btf, "btf_parse_elf")) {
@@ -764,8 +764,8 @@ static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d,
/* union with nested struct */
TEST_BTF_DUMP_DATA(btf, d, "union", str, union bpf_iter_link_info, BTF_F_COMPACT,
- "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},}",
- { .map = { .map_fd = 1 }});
+ "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},.cgroup = (struct){.order = (enum bpf_cgroup_iter_order)BPF_CGROUP_ITER_SELF_ONLY,.cgroup_fd = (__u32)1,},}",
+ { .cgroup = { .order = 1, .cgroup_fd = 1, }});
/* struct skb with nested structs/unions; because type output is so
* complex, we don't do a string comparison, just verify we return
@@ -841,8 +841,8 @@ static void test_btf_dump_datasec_data(char *str)
char license[4] = "GPL";
struct btf_dump *d;
- btf = btf__parse("xdping_kern.o", NULL);
- if (!ASSERT_OK_PTR(btf, "xdping_kern.o BTF not found"))
+ btf = btf__parse("xdping_kern.bpf.o", NULL);
+ if (!ASSERT_OK_PTR(btf, "xdping_kern.bpf.o BTF not found"))
return;
d = btf_dump__new(btf, btf_dump_snprintf, str, NULL);
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_endian.c b/tools/testing/selftests/bpf/prog_tests/btf_endian.c
index 8afbf3d0b89a..5b9f84dbeb43 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_endian.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_endian.c
@@ -23,7 +23,7 @@ void test_btf_endian() {
int var_id;
/* Load BTF in native endianness */
- btf = btf__parse_elf("btf_dump_test_case_syntax.o", NULL);
+ btf = btf__parse_elf("btf_dump_test_case_syntax.bpf.o", NULL);
if (!ASSERT_OK_PTR(btf, "parse_native_btf"))
goto err_out;
diff --git a/tools/testing/selftests/bpf/prog_tests/cb_refs.c b/tools/testing/selftests/bpf/prog_tests/cb_refs.c
new file mode 100644
index 000000000000..3bff680de16c
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/cb_refs.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bpf/libbpf.h"
+#include <test_progs.h>
+#include <network_helpers.h>
+
+#include "cb_refs.skel.h"
+
+static char log_buf[1024 * 1024];
+
+struct {
+ const char *prog_name;
+ const char *err_msg;
+} cb_refs_tests[] = {
+ { "underflow_prog", "reference has not been acquired before" },
+ { "leak_prog", "Unreleased reference" },
+ { "nested_cb", "Unreleased reference id=4 alloc_insn=2" }, /* alloc_insn=2{4,5} */
+ { "non_cb_transfer_ref", "Unreleased reference id=4 alloc_insn=1" }, /* alloc_insn=1{1,2} */
+};
+
+void test_cb_refs(void)
+{
+ LIBBPF_OPTS(bpf_object_open_opts, opts, .kernel_log_buf = log_buf,
+ .kernel_log_size = sizeof(log_buf),
+ .kernel_log_level = 1);
+ struct bpf_program *prog;
+ struct cb_refs *skel;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(cb_refs_tests); i++) {
+ LIBBPF_OPTS(bpf_test_run_opts, run_opts,
+ .data_in = &pkt_v4,
+ .data_size_in = sizeof(pkt_v4),
+ .repeat = 1,
+ );
+ skel = cb_refs__open_opts(&opts);
+ if (!ASSERT_OK_PTR(skel, "cb_refs__open_and_load"))
+ return;
+ prog = bpf_object__find_program_by_name(skel->obj, cb_refs_tests[i].prog_name);
+ bpf_program__set_autoload(prog, true);
+ if (!ASSERT_ERR(cb_refs__load(skel), "cb_refs__load"))
+ bpf_prog_test_run_opts(bpf_program__fd(prog), &run_opts);
+ if (!ASSERT_OK_PTR(strstr(log_buf, cb_refs_tests[i].err_msg), "expected error message")) {
+ fprintf(stderr, "Expected: %s\n", cb_refs_tests[i].err_msg);
+ fprintf(stderr, "Verifier: %s\n", log_buf);
+ }
+ cb_refs__destroy(skel);
+ }
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_getset_retval.c b/tools/testing/selftests/bpf/prog_tests/cgroup_getset_retval.c
index 0b47c3c000c7..4d2fa99273d8 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgroup_getset_retval.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_getset_retval.c
@@ -10,6 +10,7 @@
#include "cgroup_getset_retval_setsockopt.skel.h"
#include "cgroup_getset_retval_getsockopt.skel.h"
+#include "cgroup_getset_retval_hooks.skel.h"
#define SOL_CUSTOM 0xdeadbeef
@@ -433,6 +434,50 @@ close_bpf_object:
cgroup_getset_retval_getsockopt__destroy(obj);
}
+struct exposed_hook {
+ const char *name;
+ int expected_err;
+} exposed_hooks[] = {
+
+#define BPF_RETVAL_HOOK(NAME, SECTION, CTX, EXPECTED_ERR) \
+ { \
+ .name = #NAME, \
+ .expected_err = EXPECTED_ERR, \
+ },
+
+#include "cgroup_getset_retval_hooks.h"
+
+#undef BPF_RETVAL_HOOK
+};
+
+static void test_exposed_hooks(int cgroup_fd, int sock_fd)
+{
+ struct cgroup_getset_retval_hooks *skel;
+ struct bpf_program *prog;
+ int err;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(exposed_hooks); i++) {
+ skel = cgroup_getset_retval_hooks__open();
+ if (!ASSERT_OK_PTR(skel, "cgroup_getset_retval_hooks__open"))
+ continue;
+
+ prog = bpf_object__find_program_by_name(skel->obj, exposed_hooks[i].name);
+ if (!ASSERT_NEQ(prog, NULL, "bpf_object__find_program_by_name"))
+ goto close_skel;
+
+ err = bpf_program__set_autoload(prog, true);
+ if (!ASSERT_OK(err, "bpf_program__set_autoload"))
+ goto close_skel;
+
+ err = cgroup_getset_retval_hooks__load(skel);
+ ASSERT_EQ(err, exposed_hooks[i].expected_err, "expected_err");
+
+close_skel:
+ cgroup_getset_retval_hooks__destroy(skel);
+ }
+}
+
void test_cgroup_getset_retval(void)
{
int cgroup_fd = -1;
@@ -476,6 +521,9 @@ void test_cgroup_getset_retval(void)
if (test__start_subtest("getsockopt-retval_sync"))
test_getsockopt_retval_sync(cgroup_fd, sock_fd);
+ if (test__start_subtest("exposed_hooks"))
+ test_exposed_hooks(cgroup_fd, sock_fd);
+
close_fd:
close(cgroup_fd);
}
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c b/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c
new file mode 100644
index 000000000000..bed1661596f7
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c
@@ -0,0 +1,357 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Functions to manage eBPF programs attached to cgroup subsystems
+ *
+ * Copyright 2022 Google LLC.
+ */
+#include <asm-generic/errno.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <test_progs.h>
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+
+#include "cgroup_helpers.h"
+#include "cgroup_hierarchical_stats.skel.h"
+
+#define PAGE_SIZE 4096
+#define MB(x) (x << 20)
+
+#define BPFFS_ROOT "/sys/fs/bpf/"
+#define BPFFS_VMSCAN BPFFS_ROOT"vmscan/"
+
+#define CG_ROOT_NAME "root"
+#define CG_ROOT_ID 1
+
+#define CGROUP_PATH(p, n) {.path = p"/"n, .name = n}
+
+static struct {
+ const char *path, *name;
+ unsigned long long id;
+ int fd;
+} cgroups[] = {
+ CGROUP_PATH("/", "test"),
+ CGROUP_PATH("/test", "child1"),
+ CGROUP_PATH("/test", "child2"),
+ CGROUP_PATH("/test/child1", "child1_1"),
+ CGROUP_PATH("/test/child1", "child1_2"),
+ CGROUP_PATH("/test/child2", "child2_1"),
+ CGROUP_PATH("/test/child2", "child2_2"),
+};
+
+#define N_CGROUPS ARRAY_SIZE(cgroups)
+#define N_NON_LEAF_CGROUPS 3
+
+static int root_cgroup_fd;
+static bool mounted_bpffs;
+
+/* reads file at 'path' to 'buf', returns 0 on success. */
+static int read_from_file(const char *path, char *buf, size_t size)
+{
+ int fd, len;
+
+ fd = open(path, O_RDONLY);
+ if (fd < 0)
+ return fd;
+
+ len = read(fd, buf, size);
+ close(fd);
+ if (len < 0)
+ return len;
+
+ buf[len] = 0;
+ return 0;
+}
+
+/* mounts bpffs and mkdir for reading stats, returns 0 on success. */
+static int setup_bpffs(void)
+{
+ int err;
+
+ /* Mount bpffs */
+ err = mount("bpf", BPFFS_ROOT, "bpf", 0, NULL);
+ mounted_bpffs = !err;
+ if (ASSERT_FALSE(err && errno != EBUSY, "mount"))
+ return err;
+
+ /* Create a directory to contain stat files in bpffs */
+ err = mkdir(BPFFS_VMSCAN, 0755);
+ if (!ASSERT_OK(err, "mkdir"))
+ return err;
+
+ return 0;
+}
+
+static void cleanup_bpffs(void)
+{
+ /* Remove created directory in bpffs */
+ ASSERT_OK(rmdir(BPFFS_VMSCAN), "rmdir "BPFFS_VMSCAN);
+
+ /* Unmount bpffs, if it wasn't already mounted when we started */
+ if (mounted_bpffs)
+ return;
+
+ ASSERT_OK(umount(BPFFS_ROOT), "unmount bpffs");
+}
+
+/* sets up cgroups, returns 0 on success. */
+static int setup_cgroups(void)
+{
+ int i, fd, err;
+
+ err = setup_cgroup_environment();
+ if (!ASSERT_OK(err, "setup_cgroup_environment"))
+ return err;
+
+ root_cgroup_fd = get_root_cgroup();
+ if (!ASSERT_GE(root_cgroup_fd, 0, "get_root_cgroup"))
+ return root_cgroup_fd;
+
+ for (i = 0; i < N_CGROUPS; i++) {
+ fd = create_and_get_cgroup(cgroups[i].path);
+ if (!ASSERT_GE(fd, 0, "create_and_get_cgroup"))
+ return fd;
+
+ cgroups[i].fd = fd;
+ cgroups[i].id = get_cgroup_id(cgroups[i].path);
+
+ /*
+ * Enable memcg controller for the entire hierarchy.
+ * Note that stats are collected for all cgroups in a hierarchy
+ * with memcg enabled anyway, but are only exposed for cgroups
+ * that have memcg enabled.
+ */
+ if (i < N_NON_LEAF_CGROUPS) {
+ err = enable_controllers(cgroups[i].path, "memory");
+ if (!ASSERT_OK(err, "enable_controllers"))
+ return err;
+ }
+ }
+ return 0;
+}
+
+static void cleanup_cgroups(void)
+{
+ close(root_cgroup_fd);
+ for (int i = 0; i < N_CGROUPS; i++)
+ close(cgroups[i].fd);
+ cleanup_cgroup_environment();
+}
+
+/* Sets up cgroup hiearchary, returns 0 on success. */
+static int setup_hierarchy(void)
+{
+ return setup_bpffs() || setup_cgroups();
+}
+
+static void destroy_hierarchy(void)
+{
+ cleanup_cgroups();
+ cleanup_bpffs();
+}
+
+static int reclaimer(const char *cgroup_path, size_t size)
+{
+ static char size_buf[128];
+ char *buf, *ptr;
+ int err;
+
+ /* Join cgroup in the parent process workdir */
+ if (join_parent_cgroup(cgroup_path))
+ return EACCES;
+
+ /* Allocate memory */
+ buf = malloc(size);
+ if (!buf)
+ return ENOMEM;
+
+ /* Write to memory to make sure it's actually allocated */
+ for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
+ *ptr = 1;
+
+ /* Try to reclaim memory */
+ snprintf(size_buf, 128, "%lu", size);
+ err = write_cgroup_file_parent(cgroup_path, "memory.reclaim", size_buf);
+
+ free(buf);
+ /* memory.reclaim returns EAGAIN if the amount is not fully reclaimed */
+ if (err && errno != EAGAIN)
+ return errno;
+
+ return 0;
+}
+
+static int induce_vmscan(void)
+{
+ int i, status;
+
+ /*
+ * In every leaf cgroup, run a child process that allocates some memory
+ * and attempts to reclaim some of it.
+ */
+ for (i = N_NON_LEAF_CGROUPS; i < N_CGROUPS; i++) {
+ pid_t pid;
+
+ /* Create reclaimer child */
+ pid = fork();
+ if (pid == 0) {
+ status = reclaimer(cgroups[i].path, MB(5));
+ exit(status);
+ }
+
+ /* Cleanup reclaimer child */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status), "reclaimer exited");
+ ASSERT_EQ(WEXITSTATUS(status), 0, "reclaim exit code");
+ }
+ return 0;
+}
+
+static unsigned long long
+get_cgroup_vmscan_delay(unsigned long long cgroup_id, const char *file_name)
+{
+ unsigned long long vmscan = 0, id = 0;
+ static char buf[128], path[128];
+
+ /* For every cgroup, read the file generated by cgroup_iter */
+ snprintf(path, 128, "%s%s", BPFFS_VMSCAN, file_name);
+ if (!ASSERT_OK(read_from_file(path, buf, 128), "read cgroup_iter"))
+ return 0;
+
+ /* Check the output file formatting */
+ ASSERT_EQ(sscanf(buf, "cg_id: %llu, total_vmscan_delay: %llu\n",
+ &id, &vmscan), 2, "output format");
+
+ /* Check that the cgroup_id is displayed correctly */
+ ASSERT_EQ(id, cgroup_id, "cgroup_id");
+ /* Check that the vmscan reading is non-zero */
+ ASSERT_GT(vmscan, 0, "vmscan_reading");
+ return vmscan;
+}
+
+static void check_vmscan_stats(void)
+{
+ unsigned long long vmscan_readings[N_CGROUPS], vmscan_root;
+ int i;
+
+ for (i = 0; i < N_CGROUPS; i++) {
+ vmscan_readings[i] = get_cgroup_vmscan_delay(cgroups[i].id,
+ cgroups[i].name);
+ }
+
+ /* Read stats for root too */
+ vmscan_root = get_cgroup_vmscan_delay(CG_ROOT_ID, CG_ROOT_NAME);
+
+ /* Check that child1 == child1_1 + child1_2 */
+ ASSERT_EQ(vmscan_readings[1], vmscan_readings[3] + vmscan_readings[4],
+ "child1_vmscan");
+ /* Check that child2 == child2_1 + child2_2 */
+ ASSERT_EQ(vmscan_readings[2], vmscan_readings[5] + vmscan_readings[6],
+ "child2_vmscan");
+ /* Check that test == child1 + child2 */
+ ASSERT_EQ(vmscan_readings[0], vmscan_readings[1] + vmscan_readings[2],
+ "test_vmscan");
+ /* Check that root >= test */
+ ASSERT_GE(vmscan_root, vmscan_readings[1], "root_vmscan");
+}
+
+/* Creates iter link and pins in bpffs, returns 0 on success, -errno on failure.
+ */
+static int setup_cgroup_iter(struct cgroup_hierarchical_stats *obj,
+ int cgroup_fd, const char *file_name)
+{
+ DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+ union bpf_iter_link_info linfo = {};
+ struct bpf_link *link;
+ static char path[128];
+ int err;
+
+ /*
+ * Create an iter link, parameterized by cgroup_fd. We only want to
+ * traverse one cgroup, so set the traversal order to "self".
+ */
+ linfo.cgroup.cgroup_fd = cgroup_fd;
+ linfo.cgroup.order = BPF_CGROUP_ITER_SELF_ONLY;
+ opts.link_info = &linfo;
+ opts.link_info_len = sizeof(linfo);
+ link = bpf_program__attach_iter(obj->progs.dump_vmscan, &opts);
+ if (!ASSERT_OK_PTR(link, "attach_iter"))
+ return -EFAULT;
+
+ /* Pin the link to a bpffs file */
+ snprintf(path, 128, "%s%s", BPFFS_VMSCAN, file_name);
+ err = bpf_link__pin(link, path);
+ ASSERT_OK(err, "pin cgroup_iter");
+
+ /* Remove the link, leaving only the ref held by the pinned file */
+ bpf_link__destroy(link);
+ return err;
+}
+
+/* Sets up programs for collecting stats, returns 0 on success. */
+static int setup_progs(struct cgroup_hierarchical_stats **skel)
+{
+ int i, err;
+
+ *skel = cgroup_hierarchical_stats__open_and_load();
+ if (!ASSERT_OK_PTR(*skel, "open_and_load"))
+ return 1;
+
+ /* Attach cgroup_iter program that will dump the stats to cgroups */
+ for (i = 0; i < N_CGROUPS; i++) {
+ err = setup_cgroup_iter(*skel, cgroups[i].fd, cgroups[i].name);
+ if (!ASSERT_OK(err, "setup_cgroup_iter"))
+ return err;
+ }
+
+ /* Also dump stats for root */
+ err = setup_cgroup_iter(*skel, root_cgroup_fd, CG_ROOT_NAME);
+ if (!ASSERT_OK(err, "setup_cgroup_iter"))
+ return err;
+
+ bpf_program__set_autoattach((*skel)->progs.dump_vmscan, false);
+ err = cgroup_hierarchical_stats__attach(*skel);
+ if (!ASSERT_OK(err, "attach"))
+ return err;
+
+ return 0;
+}
+
+static void destroy_progs(struct cgroup_hierarchical_stats *skel)
+{
+ static char path[128];
+ int i;
+
+ for (i = 0; i < N_CGROUPS; i++) {
+ /* Delete files in bpffs that cgroup_iters are pinned in */
+ snprintf(path, 128, "%s%s", BPFFS_VMSCAN,
+ cgroups[i].name);
+ ASSERT_OK(remove(path), "remove cgroup_iter pin");
+ }
+
+ /* Delete root file in bpffs */
+ snprintf(path, 128, "%s%s", BPFFS_VMSCAN, CG_ROOT_NAME);
+ ASSERT_OK(remove(path), "remove cgroup_iter root pin");
+ cgroup_hierarchical_stats__destroy(skel);
+}
+
+void test_cgroup_hierarchical_stats(void)
+{
+ struct cgroup_hierarchical_stats *skel = NULL;
+
+ if (setup_hierarchy())
+ goto hierarchy_cleanup;
+ if (setup_progs(&skel))
+ goto cleanup;
+ if (induce_vmscan())
+ goto cleanup;
+ check_vmscan_stats();
+cleanup:
+ destroy_progs(skel);
+hierarchy_cleanup:
+ destroy_hierarchy();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c
new file mode 100644
index 000000000000..c4a2adb38da1
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c
@@ -0,0 +1,224 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Google */
+
+#include <test_progs.h>
+#include <bpf/libbpf.h>
+#include <bpf/btf.h>
+#include "cgroup_iter.skel.h"
+#include "cgroup_helpers.h"
+
+#define ROOT 0
+#define PARENT 1
+#define CHILD1 2
+#define CHILD2 3
+#define NUM_CGROUPS 4
+
+#define PROLOGUE "prologue\n"
+#define EPILOGUE "epilogue\n"
+
+static const char *cg_path[] = {
+ "/", "/parent", "/parent/child1", "/parent/child2"
+};
+
+static int cg_fd[] = {-1, -1, -1, -1};
+static unsigned long long cg_id[] = {0, 0, 0, 0};
+static char expected_output[64];
+
+static int setup_cgroups(void)
+{
+ int fd, i = 0;
+
+ for (i = 0; i < NUM_CGROUPS; i++) {
+ fd = create_and_get_cgroup(cg_path[i]);
+ if (fd < 0)
+ return fd;
+
+ cg_fd[i] = fd;
+ cg_id[i] = get_cgroup_id(cg_path[i]);
+ }
+ return 0;
+}
+
+static void cleanup_cgroups(void)
+{
+ int i;
+
+ for (i = 0; i < NUM_CGROUPS; i++)
+ close(cg_fd[i]);
+}
+
+static void read_from_cgroup_iter(struct bpf_program *prog, int cgroup_fd,
+ int order, const char *testname)
+{
+ DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+ union bpf_iter_link_info linfo;
+ struct bpf_link *link;
+ int len, iter_fd;
+ static char buf[128];
+ size_t left;
+ char *p;
+
+ memset(&linfo, 0, sizeof(linfo));
+ linfo.cgroup.cgroup_fd = cgroup_fd;
+ linfo.cgroup.order = order;
+ opts.link_info = &linfo;
+ opts.link_info_len = sizeof(linfo);
+
+ link = bpf_program__attach_iter(prog, &opts);
+ if (!ASSERT_OK_PTR(link, "attach_iter"))
+ return;
+
+ iter_fd = bpf_iter_create(bpf_link__fd(link));
+ if (iter_fd < 0)
+ goto free_link;
+
+ memset(buf, 0, sizeof(buf));
+ left = ARRAY_SIZE(buf);
+ p = buf;
+ while ((len = read(iter_fd, p, left)) > 0) {
+ p += len;
+ left -= len;
+ }
+
+ ASSERT_STREQ(buf, expected_output, testname);
+
+ /* read() after iter finishes should be ok. */
+ if (len == 0)
+ ASSERT_OK(read(iter_fd, buf, sizeof(buf)), "second_read");
+
+ close(iter_fd);
+free_link:
+ bpf_link__destroy(link);
+}
+
+/* Invalid cgroup. */
+static void test_invalid_cgroup(struct cgroup_iter *skel)
+{
+ DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+ union bpf_iter_link_info linfo;
+ struct bpf_link *link;
+
+ memset(&linfo, 0, sizeof(linfo));
+ linfo.cgroup.cgroup_fd = (__u32)-1;
+ opts.link_info = &linfo;
+ opts.link_info_len = sizeof(linfo);
+
+ link = bpf_program__attach_iter(skel->progs.cgroup_id_printer, &opts);
+ ASSERT_ERR_PTR(link, "attach_iter");
+ bpf_link__destroy(link);
+}
+
+/* Specifying both cgroup_fd and cgroup_id is invalid. */
+static void test_invalid_cgroup_spec(struct cgroup_iter *skel)
+{
+ DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+ union bpf_iter_link_info linfo;
+ struct bpf_link *link;
+
+ memset(&linfo, 0, sizeof(linfo));
+ linfo.cgroup.cgroup_fd = (__u32)cg_fd[PARENT];
+ linfo.cgroup.cgroup_id = (__u64)cg_id[PARENT];
+ opts.link_info = &linfo;
+ opts.link_info_len = sizeof(linfo);
+
+ link = bpf_program__attach_iter(skel->progs.cgroup_id_printer, &opts);
+ ASSERT_ERR_PTR(link, "attach_iter");
+ bpf_link__destroy(link);
+}
+
+/* Preorder walk prints parent and child in order. */
+static void test_walk_preorder(struct cgroup_iter *skel)
+{
+ snprintf(expected_output, sizeof(expected_output),
+ PROLOGUE "%8llu\n%8llu\n%8llu\n" EPILOGUE,
+ cg_id[PARENT], cg_id[CHILD1], cg_id[CHILD2]);
+
+ read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT],
+ BPF_CGROUP_ITER_DESCENDANTS_PRE, "preorder");
+}
+
+/* Postorder walk prints child and parent in order. */
+static void test_walk_postorder(struct cgroup_iter *skel)
+{
+ snprintf(expected_output, sizeof(expected_output),
+ PROLOGUE "%8llu\n%8llu\n%8llu\n" EPILOGUE,
+ cg_id[CHILD1], cg_id[CHILD2], cg_id[PARENT]);
+
+ read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT],
+ BPF_CGROUP_ITER_DESCENDANTS_POST, "postorder");
+}
+
+/* Walking parents prints parent and then root. */
+static void test_walk_ancestors_up(struct cgroup_iter *skel)
+{
+ /* terminate the walk when ROOT is met. */
+ skel->bss->terminal_cgroup = cg_id[ROOT];
+
+ snprintf(expected_output, sizeof(expected_output),
+ PROLOGUE "%8llu\n%8llu\n" EPILOGUE,
+ cg_id[PARENT], cg_id[ROOT]);
+
+ read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT],
+ BPF_CGROUP_ITER_ANCESTORS_UP, "ancestors_up");
+
+ skel->bss->terminal_cgroup = 0;
+}
+
+/* Early termination prints parent only. */
+static void test_early_termination(struct cgroup_iter *skel)
+{
+ /* terminate the walk after the first element is processed. */
+ skel->bss->terminate_early = 1;
+
+ snprintf(expected_output, sizeof(expected_output),
+ PROLOGUE "%8llu\n" EPILOGUE, cg_id[PARENT]);
+
+ read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT],
+ BPF_CGROUP_ITER_DESCENDANTS_PRE, "early_termination");
+
+ skel->bss->terminate_early = 0;
+}
+
+/* Waling self prints self only. */
+static void test_walk_self_only(struct cgroup_iter *skel)
+{
+ snprintf(expected_output, sizeof(expected_output),
+ PROLOGUE "%8llu\n" EPILOGUE, cg_id[PARENT]);
+
+ read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT],
+ BPF_CGROUP_ITER_SELF_ONLY, "self_only");
+}
+
+void test_cgroup_iter(void)
+{
+ struct cgroup_iter *skel = NULL;
+
+ if (setup_cgroup_environment())
+ return;
+
+ if (setup_cgroups())
+ goto out;
+
+ skel = cgroup_iter__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "cgroup_iter__open_and_load"))
+ goto out;
+
+ if (test__start_subtest("cgroup_iter__invalid_cgroup"))
+ test_invalid_cgroup(skel);
+ if (test__start_subtest("cgroup_iter__invalid_cgroup_spec"))
+ test_invalid_cgroup_spec(skel);
+ if (test__start_subtest("cgroup_iter__preorder"))
+ test_walk_preorder(skel);
+ if (test__start_subtest("cgroup_iter__postorder"))
+ test_walk_postorder(skel);
+ if (test__start_subtest("cgroup_iter__ancestors_up_walk"))
+ test_walk_ancestors_up(skel);
+ if (test__start_subtest("cgroup_iter__early_termination"))
+ test_early_termination(skel);
+ if (test__start_subtest("cgroup_iter__self_only"))
+ test_walk_self_only(skel);
+out:
+ cgroup_iter__destroy(skel);
+ cleanup_cgroups();
+ cleanup_cgroup_environment();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c
index 9c4325f4aef2..24d553109f8d 100644
--- a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c
+++ b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c
@@ -53,7 +53,7 @@ static int run_test(int cgroup_fd, int server_fd, int family, int type)
__u16 expected_peer_port = 60000;
struct bpf_program *prog;
struct bpf_object *obj;
- const char *obj_file = v4 ? "connect_force_port4.o" : "connect_force_port6.o";
+ const char *obj_file = v4 ? "connect_force_port4.bpf.o" : "connect_force_port6.bpf.o";
int fd, err;
__u32 duration = 0;
diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c
index c8655ba9a88f..47f42e680105 100644
--- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c
+++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c
@@ -13,7 +13,7 @@ static int duration = 0;
#define MODULES_CASE(name, pg_name, tp_name) { \
.case_name = name, \
- .bpf_obj_file = "test_core_reloc_module.o", \
+ .bpf_obj_file = "test_core_reloc_module.bpf.o", \
.btf_src_file = NULL, /* find in kernel module BTFs */ \
.input = "", \
.input_len = 0, \
@@ -43,8 +43,8 @@ static int duration = 0;
#define FLAVORS_CASE_COMMON(name) \
.case_name = #name, \
- .bpf_obj_file = "test_core_reloc_flavors.o", \
- .btf_src_file = "btf__core_reloc_" #name ".o", \
+ .bpf_obj_file = "test_core_reloc_flavors.bpf.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \
.raw_tp_name = "sys_enter", \
.prog_name = "test_core_flavors" \
@@ -68,8 +68,8 @@ static int duration = 0;
#define NESTING_CASE_COMMON(name) \
.case_name = #name, \
- .bpf_obj_file = "test_core_reloc_nesting.o", \
- .btf_src_file = "btf__core_reloc_" #name ".o", \
+ .bpf_obj_file = "test_core_reloc_nesting.bpf.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \
.raw_tp_name = "sys_enter", \
.prog_name = "test_core_nesting" \
@@ -96,8 +96,8 @@ static int duration = 0;
#define ARRAYS_CASE_COMMON(name) \
.case_name = #name, \
- .bpf_obj_file = "test_core_reloc_arrays.o", \
- .btf_src_file = "btf__core_reloc_" #name ".o", \
+ .bpf_obj_file = "test_core_reloc_arrays.bpf.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \
.raw_tp_name = "sys_enter", \
.prog_name = "test_core_arrays" \
@@ -130,8 +130,8 @@ static int duration = 0;
#define PRIMITIVES_CASE_COMMON(name) \
.case_name = #name, \
- .bpf_obj_file = "test_core_reloc_primitives.o", \
- .btf_src_file = "btf__core_reloc_" #name ".o", \
+ .bpf_obj_file = "test_core_reloc_primitives.bpf.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \
.raw_tp_name = "sys_enter", \
.prog_name = "test_core_primitives" \
@@ -150,8 +150,8 @@ static int duration = 0;
#define MODS_CASE(name) { \
.case_name = #name, \
- .bpf_obj_file = "test_core_reloc_mods.o", \
- .btf_src_file = "btf__core_reloc_" #name ".o", \
+ .bpf_obj_file = "test_core_reloc_mods.bpf.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \
.input = STRUCT_TO_CHAR_PTR(core_reloc_##name) { \
.a = 1, \
.b = 2, \
@@ -174,8 +174,8 @@ static int duration = 0;
#define PTR_AS_ARR_CASE(name) { \
.case_name = #name, \
- .bpf_obj_file = "test_core_reloc_ptr_as_arr.o", \
- .btf_src_file = "btf__core_reloc_" #name ".o", \
+ .bpf_obj_file = "test_core_reloc_ptr_as_arr.bpf.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \
.input = (const char *)&(struct core_reloc_##name []){ \
{ .a = 1 }, \
{ .a = 2 }, \
@@ -203,8 +203,8 @@ static int duration = 0;
#define INTS_CASE_COMMON(name) \
.case_name = #name, \
- .bpf_obj_file = "test_core_reloc_ints.o", \
- .btf_src_file = "btf__core_reloc_" #name ".o", \
+ .bpf_obj_file = "test_core_reloc_ints.bpf.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \
.raw_tp_name = "sys_enter", \
.prog_name = "test_core_ints"
@@ -223,18 +223,18 @@ static int duration = 0;
#define FIELD_EXISTS_CASE_COMMON(name) \
.case_name = #name, \
- .bpf_obj_file = "test_core_reloc_existence.o", \
- .btf_src_file = "btf__core_reloc_" #name ".o", \
+ .bpf_obj_file = "test_core_reloc_existence.bpf.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \
.raw_tp_name = "sys_enter", \
.prog_name = "test_core_existence"
#define BITFIELDS_CASE_COMMON(objfile, test_name_prefix, name) \
.case_name = test_name_prefix#name, \
.bpf_obj_file = objfile, \
- .btf_src_file = "btf__core_reloc_" #name ".o"
+ .btf_src_file = "btf__core_reloc_" #name ".bpf.o"
#define BITFIELDS_CASE(name, ...) { \
- BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_probed.o", \
+ BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_probed.bpf.o", \
"probed:", name), \
.input = STRUCT_TO_CHAR_PTR(core_reloc_##name) __VA_ARGS__, \
.input_len = sizeof(struct core_reloc_##name), \
@@ -244,7 +244,7 @@ static int duration = 0;
.raw_tp_name = "sys_enter", \
.prog_name = "test_core_bitfields", \
}, { \
- BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_direct.o", \
+ BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_direct.bpf.o", \
"direct:", name), \
.input = STRUCT_TO_CHAR_PTR(core_reloc_##name) __VA_ARGS__, \
.input_len = sizeof(struct core_reloc_##name), \
@@ -256,14 +256,14 @@ static int duration = 0;
#define BITFIELDS_ERR_CASE(name) { \
- BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_probed.o", \
+ BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_probed.bpf.o", \
"probed:", name), \
.fails = true, \
- .run_btfgen_fails = true, \
+ .run_btfgen_fails = true, \
.raw_tp_name = "sys_enter", \
.prog_name = "test_core_bitfields", \
}, { \
- BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_direct.o", \
+ BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_direct.bpf.o", \
"direct:", name), \
.fails = true, \
.run_btfgen_fails = true, \
@@ -272,8 +272,8 @@ static int duration = 0;
#define SIZE_CASE_COMMON(name) \
.case_name = #name, \
- .bpf_obj_file = "test_core_reloc_size.o", \
- .btf_src_file = "btf__core_reloc_" #name ".o", \
+ .bpf_obj_file = "test_core_reloc_size.bpf.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \
.raw_tp_name = "sys_enter", \
.prog_name = "test_core_size"
@@ -307,13 +307,13 @@ static int duration = 0;
#define SIZE_ERR_CASE(name) { \
SIZE_CASE_COMMON(name), \
.fails = true, \
- .run_btfgen_fails = true, \
+ .run_btfgen_fails = true, \
}
#define TYPE_BASED_CASE_COMMON(name) \
.case_name = #name, \
- .bpf_obj_file = "test_core_reloc_type_based.o", \
- .btf_src_file = "btf__core_reloc_" #name ".o", \
+ .bpf_obj_file = "test_core_reloc_type_based.bpf.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \
.raw_tp_name = "sys_enter", \
.prog_name = "test_core_type_based"
@@ -331,8 +331,8 @@ static int duration = 0;
#define TYPE_ID_CASE_COMMON(name) \
.case_name = #name, \
- .bpf_obj_file = "test_core_reloc_type_id.o", \
- .btf_src_file = "btf__core_reloc_" #name ".o", \
+ .bpf_obj_file = "test_core_reloc_type_id.bpf.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \
.raw_tp_name = "sys_enter", \
.prog_name = "test_core_type_id"
@@ -350,8 +350,8 @@ static int duration = 0;
#define ENUMVAL_CASE_COMMON(name) \
.case_name = #name, \
- .bpf_obj_file = "test_core_reloc_enumval.o", \
- .btf_src_file = "btf__core_reloc_" #name ".o", \
+ .bpf_obj_file = "test_core_reloc_enumval.bpf.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \
.raw_tp_name = "sys_enter", \
.prog_name = "test_core_enumval"
@@ -369,8 +369,8 @@ static int duration = 0;
#define ENUM64VAL_CASE_COMMON(name) \
.case_name = #name, \
- .bpf_obj_file = "test_core_reloc_enum64val.o", \
- .btf_src_file = "btf__core_reloc_" #name ".o", \
+ .bpf_obj_file = "test_core_reloc_enum64val.bpf.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".bpf.o", \
.raw_tp_name = "sys_enter", \
.prog_name = "test_core_enum64val"
@@ -547,7 +547,7 @@ static const struct core_reloc_test_case test_cases[] = {
/* validate we can find kernel image and use its BTF for relocs */
{
.case_name = "kernel",
- .bpf_obj_file = "test_core_reloc_kernel.o",
+ .bpf_obj_file = "test_core_reloc_kernel.bpf.o",
.btf_src_file = NULL, /* load from /lib/modules/$(uname -r) */
.input = "",
.input_len = 0,
@@ -629,8 +629,8 @@ static const struct core_reloc_test_case test_cases[] = {
/* validate edge cases of capturing relocations */
{
.case_name = "misc",
- .bpf_obj_file = "test_core_reloc_misc.o",
- .btf_src_file = "btf__core_reloc_misc.o",
+ .bpf_obj_file = "test_core_reloc_misc.bpf.o",
+ .btf_src_file = "btf__core_reloc_misc.bpf.o",
.input = (const char *)&(struct core_reloc_misc_extensible[]){
{ .a = 1 },
{ .a = 2 }, /* not read */
diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c
index da860b07abb5..d1e32e792536 100644
--- a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c
+++ b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c
@@ -174,8 +174,8 @@ static void test_target_no_callees(void)
const char *prog_name[] = {
"fexit/test_pkt_md_access",
};
- test_fexit_bpf2bpf_common("./fexit_bpf2bpf_simple.o",
- "./test_pkt_md_access.o",
+ test_fexit_bpf2bpf_common("./fexit_bpf2bpf_simple.bpf.o",
+ "./test_pkt_md_access.bpf.o",
ARRAY_SIZE(prog_name),
prog_name, true, NULL);
}
@@ -188,8 +188,8 @@ static void test_target_yes_callees(void)
"fexit/test_pkt_access_subprog2",
"fexit/test_pkt_access_subprog3",
};
- test_fexit_bpf2bpf_common("./fexit_bpf2bpf.o",
- "./test_pkt_access.o",
+ test_fexit_bpf2bpf_common("./fexit_bpf2bpf.bpf.o",
+ "./test_pkt_access.bpf.o",
ARRAY_SIZE(prog_name),
prog_name, true, NULL);
}
@@ -206,8 +206,8 @@ static void test_func_replace(void)
"freplace/get_constant",
"freplace/test_pkt_write_access_subprog",
};
- test_fexit_bpf2bpf_common("./fexit_bpf2bpf.o",
- "./test_pkt_access.o",
+ test_fexit_bpf2bpf_common("./fexit_bpf2bpf.bpf.o",
+ "./test_pkt_access.bpf.o",
ARRAY_SIZE(prog_name),
prog_name, true, NULL);
}
@@ -217,8 +217,8 @@ static void test_func_replace_verify(void)
const char *prog_name[] = {
"freplace/do_bind",
};
- test_fexit_bpf2bpf_common("./freplace_connect4.o",
- "./connect4_prog.o",
+ test_fexit_bpf2bpf_common("./freplace_connect4.bpf.o",
+ "./connect4_prog.bpf.o",
ARRAY_SIZE(prog_name),
prog_name, false, NULL);
}
@@ -227,7 +227,7 @@ static int test_second_attach(struct bpf_object *obj)
{
const char *prog_name = "security_new_get_constant";
const char *tgt_name = "get_constant";
- const char *tgt_obj_file = "./test_pkt_access.o";
+ const char *tgt_obj_file = "./test_pkt_access.bpf.o";
struct bpf_program *prog = NULL;
struct bpf_object *tgt_obj;
struct bpf_link *link;
@@ -272,8 +272,8 @@ static void test_func_replace_multi(void)
const char *prog_name[] = {
"freplace/get_constant",
};
- test_fexit_bpf2bpf_common("./freplace_get_constant.o",
- "./test_pkt_access.o",
+ test_fexit_bpf2bpf_common("./freplace_get_constant.bpf.o",
+ "./test_pkt_access.bpf.o",
ARRAY_SIZE(prog_name),
prog_name, true, test_second_attach);
}
@@ -281,10 +281,10 @@ static void test_func_replace_multi(void)
static void test_fmod_ret_freplace(void)
{
struct bpf_object *freplace_obj = NULL, *pkt_obj, *fmod_obj = NULL;
- const char *freplace_name = "./freplace_get_constant.o";
- const char *fmod_ret_name = "./fmod_ret_freplace.o";
+ const char *freplace_name = "./freplace_get_constant.bpf.o";
+ const char *fmod_ret_name = "./fmod_ret_freplace.bpf.o";
DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts);
- const char *tgt_name = "./test_pkt_access.o";
+ const char *tgt_name = "./test_pkt_access.bpf.o";
struct bpf_link *freplace_link = NULL;
struct bpf_program *prog;
__u32 duration = 0;
@@ -339,8 +339,8 @@ static void test_func_sockmap_update(void)
const char *prog_name[] = {
"freplace/cls_redirect",
};
- test_fexit_bpf2bpf_common("./freplace_cls_redirect.o",
- "./test_cls_redirect.o",
+ test_fexit_bpf2bpf_common("./freplace_cls_redirect.bpf.o",
+ "./test_cls_redirect.bpf.o",
ARRAY_SIZE(prog_name),
prog_name, false, NULL);
}
@@ -385,15 +385,15 @@ close_prog:
static void test_func_replace_return_code(void)
{
/* test invalid return code in the replaced program */
- test_obj_load_failure_common("./freplace_connect_v4_prog.o",
- "./connect4_prog.o");
+ test_obj_load_failure_common("./freplace_connect_v4_prog.bpf.o",
+ "./connect4_prog.bpf.o");
}
static void test_func_map_prog_compatibility(void)
{
/* test with spin lock map value in the replaced program */
- test_obj_load_failure_common("./freplace_attach_probe.o",
- "./test_attach_probe.o");
+ test_obj_load_failure_common("./freplace_attach_probe.bpf.o",
+ "./test_attach_probe.bpf.o");
}
static void test_func_replace_global_func(void)
@@ -402,8 +402,8 @@ static void test_func_replace_global_func(void)
"freplace/test_pkt_access",
};
- test_fexit_bpf2bpf_common("./freplace_global_func.o",
- "./test_pkt_access.o",
+ test_fexit_bpf2bpf_common("./freplace_global_func.bpf.o",
+ "./test_pkt_access.bpf.o",
ARRAY_SIZE(prog_name),
prog_name, false, NULL);
}
diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
index 0c1661ea996e..7acca37a3d2b 100644
--- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
+++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
@@ -8,6 +8,8 @@
#include "bpf_flow.skel.h"
+#define FLOW_CONTINUE_SADDR 0x7f00007f /* 127.0.0.127 */
+
#ifndef IP_MF
#define IP_MF 0x2000
#endif
@@ -100,6 +102,7 @@ struct test {
} pkt;
struct bpf_flow_keys keys;
__u32 flags;
+ __u32 retval;
};
#define VLAN_HLEN 4
@@ -126,6 +129,7 @@ struct test tests[] = {
.sport = 80,
.dport = 8080,
},
+ .retval = BPF_OK,
},
{
.name = "ipv6",
@@ -146,6 +150,7 @@ struct test tests[] = {
.sport = 80,
.dport = 8080,
},
+ .retval = BPF_OK,
},
{
.name = "802.1q-ipv4",
@@ -168,6 +173,7 @@ struct test tests[] = {
.sport = 80,
.dport = 8080,
},
+ .retval = BPF_OK,
},
{
.name = "802.1ad-ipv6",
@@ -191,6 +197,7 @@ struct test tests[] = {
.sport = 80,
.dport = 8080,
},
+ .retval = BPF_OK,
},
{
.name = "ipv4-frag",
@@ -217,6 +224,7 @@ struct test tests[] = {
.dport = 8080,
},
.flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG,
+ .retval = BPF_OK,
},
{
.name = "ipv4-no-frag",
@@ -239,6 +247,7 @@ struct test tests[] = {
.is_frag = true,
.is_first_frag = true,
},
+ .retval = BPF_OK,
},
{
.name = "ipv6-frag",
@@ -265,6 +274,7 @@ struct test tests[] = {
.dport = 8080,
},
.flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG,
+ .retval = BPF_OK,
},
{
.name = "ipv6-no-frag",
@@ -287,6 +297,7 @@ struct test tests[] = {
.is_frag = true,
.is_first_frag = true,
},
+ .retval = BPF_OK,
},
{
.name = "ipv6-flow-label",
@@ -309,6 +320,7 @@ struct test tests[] = {
.dport = 8080,
.flow_label = __bpf_constant_htonl(0xbeeef),
},
+ .retval = BPF_OK,
},
{
.name = "ipv6-no-flow-label",
@@ -331,6 +343,7 @@ struct test tests[] = {
.flow_label = __bpf_constant_htonl(0xbeeef),
},
.flags = BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL,
+ .retval = BPF_OK,
},
{
.name = "ipip-encap",
@@ -359,6 +372,7 @@ struct test tests[] = {
.sport = 80,
.dport = 8080,
},
+ .retval = BPF_OK,
},
{
.name = "ipip-no-encap",
@@ -386,6 +400,26 @@ struct test tests[] = {
.is_encap = true,
},
.flags = BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP,
+ .retval = BPF_OK,
+ },
+ {
+ .name = "ipip-encap-dissector-continue",
+ .pkt.ipip = {
+ .eth.h_proto = __bpf_constant_htons(ETH_P_IP),
+ .iph.ihl = 5,
+ .iph.protocol = IPPROTO_IPIP,
+ .iph.tot_len = __bpf_constant_htons(MAGIC_BYTES),
+ .iph.saddr = __bpf_constant_htonl(FLOW_CONTINUE_SADDR),
+ .iph_inner.ihl = 5,
+ .iph_inner.protocol = IPPROTO_TCP,
+ .iph_inner.tot_len =
+ __bpf_constant_htons(MAGIC_BYTES) -
+ sizeof(struct iphdr),
+ .tcp.doff = 5,
+ .tcp.source = 99,
+ .tcp.dest = 9090,
+ },
+ .retval = BPF_FLOW_DISSECTOR_CONTINUE,
},
};
@@ -503,6 +537,10 @@ static void run_tests_skb_less(int tap_fd, struct bpf_map *keys)
err = tx_tap(tap_fd, &tests[i].pkt, sizeof(tests[i].pkt));
CHECK(err < 0, "tx_tap", "err %d errno %d\n", err, errno);
+ /* check the stored flow_keys only if BPF_OK expected */
+ if (tests[i].retval != BPF_OK)
+ continue;
+
err = bpf_map_lookup_elem(keys_fd, &key, &flow_keys);
ASSERT_OK(err, "bpf_map_lookup_elem");
@@ -588,7 +626,11 @@ void test_flow_dissector(void)
err = bpf_prog_test_run_opts(prog_fd, &topts);
ASSERT_OK(err, "test_run");
- ASSERT_EQ(topts.retval, 1, "test_run retval");
+ ASSERT_EQ(topts.retval, tests[i].retval, "test_run retval");
+
+ /* check the resulting flow_keys only if BPF_OK returned */
+ if (topts.retval != BPF_OK)
+ continue;
ASSERT_EQ(topts.data_size_out, sizeof(flow_keys),
"test_run data_size_out");
CHECK_FLOW_KEYS(tests[i].name, flow_keys, tests[i].keys);
diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c
index 36afb409c25f..c7a47b57ac91 100644
--- a/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c
+++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c
@@ -44,7 +44,7 @@ void serial_test_flow_dissector_load_bytes(void)
ASSERT_OK(err, "test_run");
ASSERT_EQ(topts.data_size_out, sizeof(flow_keys),
"test_run data_size_out");
- ASSERT_EQ(topts.retval, 1, "test_run retval");
+ ASSERT_EQ(topts.retval, BPF_OK, "test_run retval");
if (fd >= -1)
close(fd);
diff --git a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
index 16048978a1ef..858e0575f502 100644
--- a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
+++ b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
@@ -84,8 +84,8 @@ static void get_stack_print_output(void *ctx, int cpu, void *data, __u32 size)
void test_get_stack_raw_tp(void)
{
- const char *file = "./test_get_stack_rawtp.o";
- const char *file_err = "./test_get_stack_rawtp_err.o";
+ const char *file = "./test_get_stack_rawtp.bpf.o";
+ const char *file_err = "./test_get_stack_rawtp_err.bpf.o";
const char *prog_name = "bpf_prog1";
int i, err, prog_fd, exp_cnt = MAX_CNT_RAWTP;
struct perf_buffer *pb = NULL;
diff --git a/tools/testing/selftests/bpf/prog_tests/global_data.c b/tools/testing/selftests/bpf/prog_tests/global_data.c
index 027685858925..fadfb64e2a71 100644
--- a/tools/testing/selftests/bpf/prog_tests/global_data.c
+++ b/tools/testing/selftests/bpf/prog_tests/global_data.c
@@ -131,7 +131,7 @@ static void test_global_data_rdonly(struct bpf_object *obj, __u32 duration)
void test_global_data(void)
{
- const char *file = "./test_global_data.o";
+ const char *file = "./test_global_data.bpf.o";
struct bpf_object *obj;
int err, prog_fd;
LIBBPF_OPTS(bpf_test_run_opts, topts,
diff --git a/tools/testing/selftests/bpf/prog_tests/global_data_init.c b/tools/testing/selftests/bpf/prog_tests/global_data_init.c
index 57331c606964..8466332d7406 100644
--- a/tools/testing/selftests/bpf/prog_tests/global_data_init.c
+++ b/tools/testing/selftests/bpf/prog_tests/global_data_init.c
@@ -3,7 +3,7 @@
void test_global_data_init(void)
{
- const char *file = "./test_global_data.o";
+ const char *file = "./test_global_data.bpf.o";
int err = -ENOMEM, map_fd, zero = 0;
__u8 *buff = NULL, *newval = NULL;
struct bpf_object *obj;
diff --git a/tools/testing/selftests/bpf/prog_tests/global_func_args.c b/tools/testing/selftests/bpf/prog_tests/global_func_args.c
index 29039a36cce5..d997099f62d0 100644
--- a/tools/testing/selftests/bpf/prog_tests/global_func_args.c
+++ b/tools/testing/selftests/bpf/prog_tests/global_func_args.c
@@ -39,7 +39,7 @@ static void test_global_func_args0(struct bpf_object *obj)
void test_global_func_args(void)
{
- const char *file = "./test_global_func_args.o";
+ const char *file = "./test_global_func_args.bpf.o";
struct bpf_object *obj;
int err, prog_fd;
LIBBPF_OPTS(bpf_test_run_opts, topts,
diff --git a/tools/testing/selftests/bpf/prog_tests/htab_update.c b/tools/testing/selftests/bpf/prog_tests/htab_update.c
new file mode 100644
index 000000000000..2bc85f4814f4
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/htab_update.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2022. Huawei Technologies Co., Ltd */
+#define _GNU_SOURCE
+#include <sched.h>
+#include <stdbool.h>
+#include <test_progs.h>
+#include "htab_update.skel.h"
+
+struct htab_update_ctx {
+ int fd;
+ int loop;
+ bool stop;
+};
+
+static void test_reenter_update(void)
+{
+ struct htab_update *skel;
+ unsigned int key, value;
+ int err;
+
+ skel = htab_update__open();
+ if (!ASSERT_OK_PTR(skel, "htab_update__open"))
+ return;
+
+ /* lookup_elem_raw() may be inlined and find_kernel_btf_id() will return -ESRCH */
+ bpf_program__set_autoload(skel->progs.lookup_elem_raw, true);
+ err = htab_update__load(skel);
+ if (!ASSERT_TRUE(!err || err == -ESRCH, "htab_update__load") || err)
+ goto out;
+
+ skel->bss->pid = getpid();
+ err = htab_update__attach(skel);
+ if (!ASSERT_OK(err, "htab_update__attach"))
+ goto out;
+
+ /* Will trigger the reentrancy of bpf_map_update_elem() */
+ key = 0;
+ value = 0;
+ err = bpf_map_update_elem(bpf_map__fd(skel->maps.htab), &key, &value, 0);
+ if (!ASSERT_OK(err, "add element"))
+ goto out;
+
+ ASSERT_EQ(skel->bss->update_err, -EBUSY, "no reentrancy");
+out:
+ htab_update__destroy(skel);
+}
+
+static void *htab_update_thread(void *arg)
+{
+ struct htab_update_ctx *ctx = arg;
+ cpu_set_t cpus;
+ int i;
+
+ /* Pinned on CPU 0 */
+ CPU_ZERO(&cpus);
+ CPU_SET(0, &cpus);
+ pthread_setaffinity_np(pthread_self(), sizeof(cpus), &cpus);
+
+ i = 0;
+ while (i++ < ctx->loop && !ctx->stop) {
+ unsigned int key = 0, value = 0;
+ int err;
+
+ err = bpf_map_update_elem(ctx->fd, &key, &value, 0);
+ if (err) {
+ ctx->stop = true;
+ return (void *)(long)err;
+ }
+ }
+
+ return NULL;
+}
+
+static void test_concurrent_update(void)
+{
+ struct htab_update_ctx ctx;
+ struct htab_update *skel;
+ unsigned int i, nr;
+ pthread_t *tids;
+ int err;
+
+ skel = htab_update__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "htab_update__open_and_load"))
+ return;
+
+ ctx.fd = bpf_map__fd(skel->maps.htab);
+ ctx.loop = 1000;
+ ctx.stop = false;
+
+ nr = 4;
+ tids = calloc(nr, sizeof(*tids));
+ if (!ASSERT_NEQ(tids, NULL, "no mem"))
+ goto out;
+
+ for (i = 0; i < nr; i++) {
+ err = pthread_create(&tids[i], NULL, htab_update_thread, &ctx);
+ if (!ASSERT_OK(err, "pthread_create")) {
+ unsigned int j;
+
+ ctx.stop = true;
+ for (j = 0; j < i; j++)
+ pthread_join(tids[j], NULL);
+ goto out;
+ }
+ }
+
+ for (i = 0; i < nr; i++) {
+ void *thread_err = NULL;
+
+ pthread_join(tids[i], &thread_err);
+ ASSERT_EQ(thread_err, NULL, "update error");
+ }
+
+out:
+ if (tids)
+ free(tids);
+ htab_update__destroy(skel);
+}
+
+void test_htab_update(void)
+{
+ if (test__start_subtest("reenter_update"))
+ test_reenter_update();
+ if (test__start_subtest("concurrent_update"))
+ test_concurrent_update();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
index 1cee6957285e..73579370bfbd 100644
--- a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
+++ b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
@@ -69,7 +69,7 @@ void serial_test_kfree_skb(void)
const int zero = 0;
bool test_ok[2];
- err = bpf_prog_test_load("./test_pkt_access.o", BPF_PROG_TYPE_SCHED_CLS,
+ err = bpf_prog_test_load("./test_pkt_access.bpf.o", BPF_PROG_TYPE_SCHED_CLS,
&obj, &prog_fd);
if (CHECK(err, "prog_load sched cls", "err %d errno %d\n", err, errno))
return;
diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
index 351fafa006fb..eede7c304f86 100644
--- a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
+++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
@@ -109,7 +109,7 @@ static void test_destructive(void)
{
__u64 save_caps = 0;
- ASSERT_OK(test_destructive_open_and_load(), "succesful_load");
+ ASSERT_OK(test_destructive_open_and_load(), "successful_load");
if (!ASSERT_OK(cap_disable_effective(1ULL << CAP_SYS_BOOT, &save_caps), "drop_caps"))
return;
diff --git a/tools/testing/selftests/bpf/prog_tests/l4lb_all.c b/tools/testing/selftests/bpf/prog_tests/l4lb_all.c
index 55f733ff4109..9c1a18573ffd 100644
--- a/tools/testing/selftests/bpf/prog_tests/l4lb_all.c
+++ b/tools/testing/selftests/bpf/prog_tests/l4lb_all.c
@@ -90,7 +90,7 @@ out:
void test_l4lb_all(void)
{
if (test__start_subtest("l4lb_inline"))
- test_l4lb("test_l4lb.o");
+ test_l4lb("test_l4lb.bpf.o");
if (test__start_subtest("l4lb_noinline"))
- test_l4lb("test_l4lb_noinline.o");
+ test_l4lb("test_l4lb_noinline.bpf.o");
}
diff --git a/tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c b/tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c
index 4e0b2ec057aa..581c0eb0a0a1 100644
--- a/tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c
+++ b/tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c
@@ -27,8 +27,8 @@ void test_load_bytes_relative(void)
if (CHECK_FAIL(server_fd < 0))
goto close_cgroup_fd;
- err = bpf_prog_test_load("./load_bytes_relative.o", BPF_PROG_TYPE_CGROUP_SKB,
- &obj, &prog_fd);
+ err = bpf_prog_test_load("./load_bytes_relative.bpf.o", BPF_PROG_TYPE_CGROUP_SKB,
+ &obj, &prog_fd);
if (CHECK_FAIL(err))
goto close_server_fd;
diff --git a/tools/testing/selftests/bpf/prog_tests/map_lock.c b/tools/testing/selftests/bpf/prog_tests/map_lock.c
index e4e99b37df64..1d6726f01dd2 100644
--- a/tools/testing/selftests/bpf/prog_tests/map_lock.c
+++ b/tools/testing/selftests/bpf/prog_tests/map_lock.c
@@ -49,7 +49,7 @@ out:
void test_map_lock(void)
{
- const char *file = "./test_map_lock.o";
+ const char *file = "./test_map_lock.bpf.o";
int prog_fd, map_fd[2], vars[17] = {};
pthread_t thread_id[6];
struct bpf_object *obj = NULL;
diff --git a/tools/testing/selftests/bpf/prog_tests/pinning.c b/tools/testing/selftests/bpf/prog_tests/pinning.c
index 31c09ba577eb..d95cee5867b7 100644
--- a/tools/testing/selftests/bpf/prog_tests/pinning.c
+++ b/tools/testing/selftests/bpf/prog_tests/pinning.c
@@ -26,13 +26,13 @@ __u32 get_map_id(struct bpf_object *obj, const char *name)
void test_pinning(void)
{
- const char *file_invalid = "./test_pinning_invalid.o";
+ const char *file_invalid = "./test_pinning_invalid.bpf.o";
const char *custpinpath = "/sys/fs/bpf/custom/pinmap";
const char *nopinpath = "/sys/fs/bpf/nopinmap";
const char *nopinpath2 = "/sys/fs/bpf/nopinmap2";
const char *custpath = "/sys/fs/bpf/custom";
const char *pinpath = "/sys/fs/bpf/pinmap";
- const char *file = "./test_pinning.o";
+ const char *file = "./test_pinning.bpf.o";
__u32 map_id, map_id2, duration = 0;
struct stat statbuf = {};
struct bpf_object *obj;
diff --git a/tools/testing/selftests/bpf/prog_tests/pkt_access.c b/tools/testing/selftests/bpf/prog_tests/pkt_access.c
index 0bcccdc34fbc..682e4ff45b01 100644
--- a/tools/testing/selftests/bpf/prog_tests/pkt_access.c
+++ b/tools/testing/selftests/bpf/prog_tests/pkt_access.c
@@ -4,7 +4,7 @@
void test_pkt_access(void)
{
- const char *file = "./test_pkt_access.o";
+ const char *file = "./test_pkt_access.bpf.o";
struct bpf_object *obj;
int err, prog_fd;
LIBBPF_OPTS(bpf_test_run_opts, topts,
diff --git a/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c b/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c
index 00ee1dd792aa..0d85e0642811 100644
--- a/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c
+++ b/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c
@@ -4,7 +4,7 @@
void test_pkt_md_access(void)
{
- const char *file = "./test_pkt_md_access.o";
+ const char *file = "./test_pkt_md_access.bpf.o";
struct bpf_object *obj;
int err, prog_fd;
LIBBPF_OPTS(bpf_test_run_opts, topts,
diff --git a/tools/testing/selftests/bpf/prog_tests/probe_user.c b/tools/testing/selftests/bpf/prog_tests/probe_user.c
index 34dbd2adc157..8721671321de 100644
--- a/tools/testing/selftests/bpf/prog_tests/probe_user.c
+++ b/tools/testing/selftests/bpf/prog_tests/probe_user.c
@@ -11,7 +11,7 @@ void serial_test_probe_user(void)
#endif
};
enum { prog_count = ARRAY_SIZE(prog_names) };
- const char *obj_file = "./test_probe_user.o";
+ const char *obj_file = "./test_probe_user.bpf.o";
DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, );
int err, results_map_fd, sock_fd, duration = 0;
struct sockaddr curr, orig, tmp;
diff --git a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c
index d2743fc10032..722c5f2a7776 100644
--- a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c
+++ b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c
@@ -28,9 +28,9 @@ static void test_queue_stack_map_by_type(int type)
vals[i] = rand();
if (type == QUEUE)
- strncpy(file, "./test_queue_map.o", sizeof(file));
+ strncpy(file, "./test_queue_map.bpf.o", sizeof(file));
else if (type == STACK)
- strncpy(file, "./test_stack_map.o", sizeof(file));
+ strncpy(file, "./test_stack_map.bpf.o", sizeof(file));
else
return;
diff --git a/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c b/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c
index fd5d2ddfb062..19e2f2526dbd 100644
--- a/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c
+++ b/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c
@@ -16,7 +16,7 @@ struct rdonly_map_subtest {
void test_rdonly_maps(void)
{
- const char *file = "test_rdonly_maps.o";
+ const char *file = "test_rdonly_maps.bpf.o";
struct rdonly_map_subtest subtests[] = {
{ "skip loop", "skip_loop", 0, 0 },
{ "part loop", "part_loop", 3, 2 + 3 + 4 },
diff --git a/tools/testing/selftests/bpf/prog_tests/reference_tracking.c b/tools/testing/selftests/bpf/prog_tests/reference_tracking.c
index 739d2ea6ca55..d863205bbe95 100644
--- a/tools/testing/selftests/bpf/prog_tests/reference_tracking.c
+++ b/tools/testing/selftests/bpf/prog_tests/reference_tracking.c
@@ -3,7 +3,7 @@
void test_reference_tracking(void)
{
- const char *file = "test_sk_lookup_kern.o";
+ const char *file = "test_sk_lookup_kern.bpf.o";
const char *obj_name = "ref_track";
DECLARE_LIBBPF_OPTS(bpf_object_open_opts, open_opts,
.object_name = obj_name,
diff --git a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c
index c197261d02e2..f81d08d429a2 100644
--- a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c
+++ b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c
@@ -101,7 +101,7 @@ static int resolve_symbols(void)
int type_id;
__u32 nr;
- btf = btf__parse_elf("btf_data.o", NULL);
+ btf = btf__parse_elf("btf_data.bpf.o", NULL);
if (CHECK(libbpf_get_error(btf), "resolve",
"Failed to load BTF from btf_data.o\n"))
return -1;
diff --git a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c
index 1cbd8cd64044..64c5f5eb2994 100644
--- a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c
+++ b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c
@@ -91,9 +91,9 @@ static int prepare_bpf_obj(void)
struct bpf_map *map;
int err;
- obj = bpf_object__open("test_select_reuseport_kern.o");
+ obj = bpf_object__open("test_select_reuseport_kern.bpf.o");
err = libbpf_get_error(obj);
- RET_ERR(err, "open test_select_reuseport_kern.o",
+ RET_ERR(err, "open test_select_reuseport_kern.bpf.o",
"obj:%p PTR_ERR(obj):%d\n", obj, err);
map = bpf_object__find_map_by_name(obj, "outer_map");
diff --git a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c
new file mode 100644
index 000000000000..018611e6b248
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) Meta Platforms, Inc. and affiliates. */
+
+#define _GNU_SOURCE
+#include <sched.h>
+#include <linux/socket.h>
+#include <net/if.h>
+
+#include "test_progs.h"
+#include "cgroup_helpers.h"
+#include "network_helpers.h"
+
+#include "setget_sockopt.skel.h"
+
+#define CG_NAME "/setget-sockopt-test"
+
+static const char addr4_str[] = "127.0.0.1";
+static const char addr6_str[] = "::1";
+static struct setget_sockopt *skel;
+static int cg_fd;
+
+static int create_netns(void)
+{
+ if (!ASSERT_OK(unshare(CLONE_NEWNET), "create netns"))
+ return -1;
+
+ if (!ASSERT_OK(system("ip link set dev lo up"), "set lo up"))
+ return -1;
+
+ if (!ASSERT_OK(system("ip link add dev binddevtest1 type veth peer name binddevtest2"),
+ "add veth"))
+ return -1;
+
+ if (!ASSERT_OK(system("ip link set dev binddevtest1 up"),
+ "bring veth up"))
+ return -1;
+
+ return 0;
+}
+
+static void test_tcp(int family)
+{
+ struct setget_sockopt__bss *bss = skel->bss;
+ int sfd, cfd;
+
+ memset(bss, 0, sizeof(*bss));
+
+ sfd = start_server(family, SOCK_STREAM,
+ family == AF_INET6 ? addr6_str : addr4_str, 0, 0);
+ if (!ASSERT_GE(sfd, 0, "start_server"))
+ return;
+
+ cfd = connect_to_fd(sfd, 0);
+ if (!ASSERT_GE(cfd, 0, "connect_to_fd_server")) {
+ close(sfd);
+ return;
+ }
+ close(sfd);
+ close(cfd);
+
+ ASSERT_EQ(bss->nr_listen, 1, "nr_listen");
+ ASSERT_EQ(bss->nr_connect, 1, "nr_connect");
+ ASSERT_EQ(bss->nr_active, 1, "nr_active");
+ ASSERT_EQ(bss->nr_passive, 1, "nr_passive");
+ ASSERT_EQ(bss->nr_socket_post_create, 2, "nr_socket_post_create");
+ ASSERT_EQ(bss->nr_binddev, 2, "nr_bind");
+}
+
+static void test_udp(int family)
+{
+ struct setget_sockopt__bss *bss = skel->bss;
+ int sfd;
+
+ memset(bss, 0, sizeof(*bss));
+
+ sfd = start_server(family, SOCK_DGRAM,
+ family == AF_INET6 ? addr6_str : addr4_str, 0, 0);
+ if (!ASSERT_GE(sfd, 0, "start_server"))
+ return;
+ close(sfd);
+
+ ASSERT_GE(bss->nr_socket_post_create, 1, "nr_socket_post_create");
+ ASSERT_EQ(bss->nr_binddev, 1, "nr_bind");
+}
+
+void test_setget_sockopt(void)
+{
+ cg_fd = test__join_cgroup(CG_NAME);
+ if (cg_fd < 0)
+ return;
+
+ if (create_netns())
+ goto done;
+
+ skel = setget_sockopt__open();
+ if (!ASSERT_OK_PTR(skel, "open skel"))
+ goto done;
+
+ strcpy(skel->rodata->veth, "binddevtest1");
+ skel->rodata->veth_ifindex = if_nametoindex("binddevtest1");
+ if (!ASSERT_GT(skel->rodata->veth_ifindex, 0, "if_nametoindex"))
+ goto done;
+
+ if (!ASSERT_OK(setget_sockopt__load(skel), "load skel"))
+ goto done;
+
+ skel->links.skops_sockopt =
+ bpf_program__attach_cgroup(skel->progs.skops_sockopt, cg_fd);
+ if (!ASSERT_OK_PTR(skel->links.skops_sockopt, "attach cgroup"))
+ goto done;
+
+ skel->links.socket_post_create =
+ bpf_program__attach_cgroup(skel->progs.socket_post_create, cg_fd);
+ if (!ASSERT_OK_PTR(skel->links.socket_post_create, "attach_cgroup"))
+ goto done;
+
+ test_tcp(AF_INET6);
+ test_tcp(AF_INET);
+ test_udp(AF_INET6);
+ test_udp(AF_INET);
+
+done:
+ setget_sockopt__destroy(skel);
+ close(cg_fd);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
index 1d272e05188e..3e190ed63976 100644
--- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c
+++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
@@ -47,7 +47,7 @@ configure_stack(void)
if (CHECK_FAIL(system("tc qdisc add dev lo clsact")))
return false;
sprintf(tc_cmd, "%s %s %s %s", "tc filter add dev lo ingress bpf",
- "direct-action object-file ./test_sk_assign.o",
+ "direct-action object-file ./test_sk_assign.bpf.o",
"section tc",
(env.verbosity < VERBOSE_VERY) ? " 2>/dev/null" : "verbose");
if (CHECK(system(tc_cmd), "BPF load failed;",
diff --git a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c
index ce0e555b5e38..33f950e2dae3 100644
--- a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c
+++ b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c
@@ -31,7 +31,7 @@ void test_skb_ctx(void)
struct bpf_object *obj;
int err, prog_fd, i;
- err = bpf_prog_test_load("./test_skb_ctx.o", BPF_PROG_TYPE_SCHED_CLS,
+ err = bpf_prog_test_load("./test_skb_ctx.bpf.o", BPF_PROG_TYPE_SCHED_CLS,
&obj, &prog_fd);
if (!ASSERT_OK(err, "load"))
return;
diff --git a/tools/testing/selftests/bpf/prog_tests/skb_helpers.c b/tools/testing/selftests/bpf/prog_tests/skb_helpers.c
index 97dc8b14be48..f7ee25f290f7 100644
--- a/tools/testing/selftests/bpf/prog_tests/skb_helpers.c
+++ b/tools/testing/selftests/bpf/prog_tests/skb_helpers.c
@@ -20,7 +20,7 @@ void test_skb_helpers(void)
struct bpf_object *obj;
int err, prog_fd;
- err = bpf_prog_test_load("./test_skb_helpers.o",
+ err = bpf_prog_test_load("./test_skb_helpers.bpf.o",
BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
if (!ASSERT_OK(err, "load"))
return;
diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
index 8ed78a9383ba..c5cb6e8374b6 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
@@ -174,7 +174,7 @@ static void run_test(int cgroup_fd)
pthread_t tid;
int err;
- obj = bpf_object__open_file("sockopt_inherit.o", NULL);
+ obj = bpf_object__open_file("sockopt_inherit.bpf.o", NULL);
if (!ASSERT_OK_PTR(obj, "obj_open"))
return;
diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c b/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c
index abce12ddcc37..28d592dc54a7 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c
@@ -310,7 +310,7 @@ void test_sockopt_multi(void)
if (CHECK_FAIL(cg_child < 0))
goto out;
- obj = bpf_object__open_file("sockopt_multi.o", NULL);
+ obj = bpf_object__open_file("sockopt_multi.bpf.o", NULL);
if (!ASSERT_OK_PTR(obj, "obj_load"))
goto out;
diff --git a/tools/testing/selftests/bpf/prog_tests/spinlock.c b/tools/testing/selftests/bpf/prog_tests/spinlock.c
index 8e329eaee6d7..15eb1372d771 100644
--- a/tools/testing/selftests/bpf/prog_tests/spinlock.c
+++ b/tools/testing/selftests/bpf/prog_tests/spinlock.c
@@ -19,7 +19,7 @@ static void *spin_lock_thread(void *arg)
void test_spinlock(void)
{
- const char *file = "./test_spin_lock.o";
+ const char *file = "./test_spin_lock.bpf.o";
pthread_t thread_id[4];
struct bpf_object *obj = NULL;
int prog_fd;
diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c
index 313f0a66232e..df59e4ae2951 100644
--- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c
@@ -6,7 +6,7 @@ void test_stacktrace_map(void)
int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
const char *prog_name = "oncpu";
int err, prog_fd, stack_trace_len;
- const char *file = "./test_stacktrace_map.o";
+ const char *file = "./test_stacktrace_map.bpf.o";
__u32 key, val, duration = 0;
struct bpf_program *prog;
struct bpf_object *obj;
diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c
index 1cb8dd36bd8f..c6ef06f55cdb 100644
--- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c
@@ -5,7 +5,7 @@ void test_stacktrace_map_raw_tp(void)
{
const char *prog_name = "oncpu";
int control_map_fd, stackid_hmap_fd, stackmap_fd;
- const char *file = "./test_stacktrace_map.o";
+ const char *file = "./test_stacktrace_map.bpf.o";
__u32 key, val, duration = 0;
int err, prog_fd;
struct bpf_program *prog;
diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c
index 19c70880cfb3..58fe2c586ed7 100644
--- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c
+++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c
@@ -20,8 +20,8 @@ static void test_tailcall_1(void)
.repeat = 1,
);
- err = bpf_prog_test_load("tailcall1.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
- &prog_fd);
+ err = bpf_prog_test_load("tailcall1.bpf.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
+ &prog_fd);
if (CHECK_FAIL(err))
return;
@@ -156,8 +156,8 @@ static void test_tailcall_2(void)
.repeat = 1,
);
- err = bpf_prog_test_load("tailcall2.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
- &prog_fd);
+ err = bpf_prog_test_load("tailcall2.bpf.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
+ &prog_fd);
if (CHECK_FAIL(err))
return;
@@ -299,7 +299,7 @@ out:
*/
static void test_tailcall_3(void)
{
- test_tailcall_count("tailcall3.o");
+ test_tailcall_count("tailcall3.bpf.o");
}
/* test_tailcall_6 checks that the count value of the tail call limit
@@ -307,7 +307,7 @@ static void test_tailcall_3(void)
*/
static void test_tailcall_6(void)
{
- test_tailcall_count("tailcall6.o");
+ test_tailcall_count("tailcall6.bpf.o");
}
/* test_tailcall_4 checks that the kernel properly selects indirect jump
@@ -329,8 +329,8 @@ static void test_tailcall_4(void)
.repeat = 1,
);
- err = bpf_prog_test_load("tailcall4.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
- &prog_fd);
+ err = bpf_prog_test_load("tailcall4.bpf.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
+ &prog_fd);
if (CHECK_FAIL(err))
return;
@@ -419,8 +419,8 @@ static void test_tailcall_5(void)
.repeat = 1,
);
- err = bpf_prog_test_load("tailcall5.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
- &prog_fd);
+ err = bpf_prog_test_load("tailcall5.bpf.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
+ &prog_fd);
if (CHECK_FAIL(err))
return;
@@ -507,8 +507,8 @@ static void test_tailcall_bpf2bpf_1(void)
.repeat = 1,
);
- err = bpf_prog_test_load("tailcall_bpf2bpf1.o", BPF_PROG_TYPE_SCHED_CLS,
- &obj, &prog_fd);
+ err = bpf_prog_test_load("tailcall_bpf2bpf1.bpf.o", BPF_PROG_TYPE_SCHED_CLS,
+ &obj, &prog_fd);
if (CHECK_FAIL(err))
return;
@@ -591,8 +591,8 @@ static void test_tailcall_bpf2bpf_2(void)
.repeat = 1,
);
- err = bpf_prog_test_load("tailcall_bpf2bpf2.o", BPF_PROG_TYPE_SCHED_CLS,
- &obj, &prog_fd);
+ err = bpf_prog_test_load("tailcall_bpf2bpf2.bpf.o", BPF_PROG_TYPE_SCHED_CLS,
+ &obj, &prog_fd);
if (CHECK_FAIL(err))
return;
@@ -671,8 +671,8 @@ static void test_tailcall_bpf2bpf_3(void)
.repeat = 1,
);
- err = bpf_prog_test_load("tailcall_bpf2bpf3.o", BPF_PROG_TYPE_SCHED_CLS,
- &obj, &prog_fd);
+ err = bpf_prog_test_load("tailcall_bpf2bpf3.bpf.o", BPF_PROG_TYPE_SCHED_CLS,
+ &obj, &prog_fd);
if (CHECK_FAIL(err))
return;
@@ -766,8 +766,8 @@ static void test_tailcall_bpf2bpf_4(bool noise)
.repeat = 1,
);
- err = bpf_prog_test_load("tailcall_bpf2bpf4.o", BPF_PROG_TYPE_SCHED_CLS,
- &obj, &prog_fd);
+ err = bpf_prog_test_load("tailcall_bpf2bpf4.bpf.o", BPF_PROG_TYPE_SCHED_CLS,
+ &obj, &prog_fd);
if (CHECK_FAIL(err))
return;
diff --git a/tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c b/tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c
index 17947c9e1d66..3d34bab01e48 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c
@@ -3,7 +3,7 @@
void test_task_fd_query_rawtp(void)
{
- const char *file = "./test_get_stack_rawtp.o";
+ const char *file = "./test_get_stack_rawtp.bpf.o";
__u64 probe_offset, probe_addr;
__u32 len, prog_id, fd_type;
struct bpf_object *obj;
diff --git a/tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c b/tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c
index c2a98a7a8dfc..c717741bf8b6 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c
@@ -4,7 +4,7 @@
static void test_task_fd_query_tp_core(const char *probe_name,
const char *tp_name)
{
- const char *file = "./test_tracepoint.o";
+ const char *file = "./test_tracepoint.bpf.o";
int err, bytes, efd, prog_fd, pmu_fd;
struct perf_event_attr attr = {};
__u64 probe_offset, probe_addr;
diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_estats.c b/tools/testing/selftests/bpf/prog_tests/tcp_estats.c
index 11bf755be4c9..032dbfb26256 100644
--- a/tools/testing/selftests/bpf/prog_tests/tcp_estats.c
+++ b/tools/testing/selftests/bpf/prog_tests/tcp_estats.c
@@ -3,7 +3,7 @@
void test_tcp_estats(void)
{
- const char *file = "./test_tcp_estats.o";
+ const char *file = "./test_tcp_estats.bpf.o";
int err, prog_fd;
struct bpf_object *obj;
__u32 duration = 0;
diff --git a/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c b/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c
index 2559bb775762..a0054019e677 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c
@@ -9,18 +9,10 @@
#include "bprm_opts.skel.h"
#include "network_helpers.h"
-
-#ifndef __NR_pidfd_open
-#define __NR_pidfd_open 434
-#endif
+#include "task_local_storage_helpers.h"
static const char * const bash_envp[] = { "TMPDIR=shouldnotbeset", NULL };
-static inline int sys_pidfd_open(pid_t pid, unsigned int flags)
-{
- return syscall(__NR_pidfd_open, pid, flags);
-}
-
static int update_storage(int map_fd, int secureexec)
{
int task_fd, ret = 0;
diff --git a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c
index b90ee47d3111..7295cc60f724 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c
@@ -65,23 +65,23 @@ struct test_def {
void test_test_global_funcs(void)
{
struct test_def tests[] = {
- { "test_global_func1.o", "combined stack size of 4 calls is 544" },
- { "test_global_func2.o" },
- { "test_global_func3.o" , "the call stack of 8 frames" },
- { "test_global_func4.o" },
- { "test_global_func5.o" , "expected pointer to ctx, but got PTR" },
- { "test_global_func6.o" , "modified ctx ptr R2" },
- { "test_global_func7.o" , "foo() doesn't return scalar" },
- { "test_global_func8.o" },
- { "test_global_func9.o" },
- { "test_global_func10.o", "invalid indirect read from stack" },
- { "test_global_func11.o", "Caller passes invalid args into func#1" },
- { "test_global_func12.o", "invalid mem access 'mem_or_null'" },
- { "test_global_func13.o", "Caller passes invalid args into func#1" },
- { "test_global_func14.o", "reference type('FWD S') size cannot be determined" },
- { "test_global_func15.o", "At program exit the register R0 has value" },
- { "test_global_func16.o", "invalid indirect read from stack" },
- { "test_global_func17.o", "Caller passes invalid args into func#1" },
+ { "test_global_func1.bpf.o", "combined stack size of 4 calls is 544" },
+ { "test_global_func2.bpf.o" },
+ { "test_global_func3.bpf.o", "the call stack of 8 frames" },
+ { "test_global_func4.bpf.o" },
+ { "test_global_func5.bpf.o", "expected pointer to ctx, but got PTR" },
+ { "test_global_func6.bpf.o", "modified ctx ptr R2" },
+ { "test_global_func7.bpf.o", "foo() doesn't return scalar" },
+ { "test_global_func8.bpf.o" },
+ { "test_global_func9.bpf.o" },
+ { "test_global_func10.bpf.o", "invalid indirect read from stack" },
+ { "test_global_func11.bpf.o", "Caller passes invalid args into func#1" },
+ { "test_global_func12.bpf.o", "invalid mem access 'mem_or_null'" },
+ { "test_global_func13.bpf.o", "Caller passes invalid args into func#1" },
+ { "test_global_func14.bpf.o", "reference type('FWD S') size cannot be determined" },
+ { "test_global_func15.bpf.o", "At program exit the register R0 has value" },
+ { "test_global_func16.bpf.o", "invalid indirect read from stack" },
+ { "test_global_func17.bpf.o", "Caller passes invalid args into func#1" },
};
libbpf_print_fn_t old_print_fn = NULL;
int err, i, duration = 0;
diff --git a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c
index 26ac26a88026..9c77cd6b1eaf 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c
@@ -11,15 +11,7 @@
#include "local_storage.skel.h"
#include "network_helpers.h"
-
-#ifndef __NR_pidfd_open
-#define __NR_pidfd_open 434
-#endif
-
-static inline int sys_pidfd_open(pid_t pid, unsigned int flags)
-{
- return syscall(__NR_pidfd_open, pid, flags);
-}
+#include "task_local_storage_helpers.h"
static unsigned int duration;
diff --git a/tools/testing/selftests/bpf/prog_tests/test_overhead.c b/tools/testing/selftests/bpf/prog_tests/test_overhead.c
index 05acb376f74d..f27013e38d03 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_overhead.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_overhead.c
@@ -72,7 +72,7 @@ void test_test_overhead(void)
if (CHECK_FAIL(prctl(PR_GET_NAME, comm, 0L, 0L, 0L)))
return;
- obj = bpf_object__open_file("./test_overhead.o", NULL);
+ obj = bpf_object__open_file("./test_overhead.bpf.o", NULL);
if (!ASSERT_OK_PTR(obj, "obj_open_file"))
return;
diff --git a/tools/testing/selftests/bpf/prog_tests/tp_attach_query.c b/tools/testing/selftests/bpf/prog_tests/tp_attach_query.c
index 39e79291c82b..a479080533db 100644
--- a/tools/testing/selftests/bpf/prog_tests/tp_attach_query.c
+++ b/tools/testing/selftests/bpf/prog_tests/tp_attach_query.c
@@ -6,7 +6,7 @@ void serial_test_tp_attach_query(void)
const int num_progs = 3;
int i, j, bytes, efd, err, prog_fd[num_progs], pmu_fd[num_progs];
__u32 duration = 0, info_len, saved_prog_ids[num_progs];
- const char *file = "./test_tracepoint.o";
+ const char *file = "./test_tracepoint.bpf.o";
struct perf_event_query_bpf *query;
struct perf_event_attr attr = {};
struct bpf_object *obj[num_progs];
diff --git a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c
index b0acbda6dbf5..564b75bc087f 100644
--- a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c
+++ b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c
@@ -35,7 +35,7 @@ static struct bpf_program *load_prog(char *file, char *name, struct inst *inst)
/* TODO: use different target function to run in concurrent mode */
void serial_test_trampoline_count(void)
{
- char *file = "test_trampoline_count.o";
+ char *file = "test_trampoline_count.bpf.o";
char *const progs[] = { "fentry_test", "fmod_ret_test", "fexit_test" };
struct inst inst[MAX_TRAMP_PROGS + 1] = {};
struct bpf_program *prog;
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp.c b/tools/testing/selftests/bpf/prog_tests/xdp.c
index ec21c53cb1da..947863a1d536 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp.c
@@ -8,7 +8,7 @@ void test_xdp(void)
struct vip key6 = {.protocol = 6, .family = AF_INET6};
struct iptnl_info value4 = {.family = AF_INET};
struct iptnl_info value6 = {.family = AF_INET6};
- const char *file = "./test_xdp.o";
+ const char *file = "./test_xdp.bpf.o";
struct bpf_object *obj;
char buf[128];
struct ipv6hdr iph6;
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_frags.c b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_frags.c
index 2f033da4cd45..fce203640f8c 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_frags.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_frags.c
@@ -4,7 +4,7 @@
static void test_xdp_update_frags(void)
{
- const char *file = "./test_xdp_update_frags.o";
+ const char *file = "./test_xdp_update_frags.bpf.o";
int err, prog_fd, max_skb_frags, buf_size, num;
struct bpf_program *prog;
struct bpf_object *obj;
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c
index 21ceac24e174..9b9cf8458adf 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c
@@ -4,7 +4,7 @@
static void test_xdp_adjust_tail_shrink(void)
{
- const char *file = "./test_xdp_adjust_tail_shrink.o";
+ const char *file = "./test_xdp_adjust_tail_shrink.bpf.o";
__u32 expect_sz;
struct bpf_object *obj;
int err, prog_fd;
@@ -39,7 +39,7 @@ static void test_xdp_adjust_tail_shrink(void)
static void test_xdp_adjust_tail_grow(void)
{
- const char *file = "./test_xdp_adjust_tail_grow.o";
+ const char *file = "./test_xdp_adjust_tail_grow.bpf.o";
struct bpf_object *obj;
char buf[4096]; /* avoid segfault: large buf to hold grow results */
__u32 expect_sz;
@@ -73,7 +73,7 @@ static void test_xdp_adjust_tail_grow(void)
static void test_xdp_adjust_tail_grow2(void)
{
- const char *file = "./test_xdp_adjust_tail_grow.o";
+ const char *file = "./test_xdp_adjust_tail_grow.bpf.o";
char buf[4096]; /* avoid segfault: large buf to hold grow results */
int tailroom = 320; /* SKB_DATA_ALIGN(sizeof(struct skb_shared_info))*/;
struct bpf_object *obj;
@@ -135,7 +135,7 @@ static void test_xdp_adjust_tail_grow2(void)
static void test_xdp_adjust_frags_tail_shrink(void)
{
- const char *file = "./test_xdp_adjust_tail_shrink.o";
+ const char *file = "./test_xdp_adjust_tail_shrink.bpf.o";
__u32 exp_size;
struct bpf_program *prog;
struct bpf_object *obj;
@@ -202,7 +202,7 @@ out:
static void test_xdp_adjust_frags_tail_grow(void)
{
- const char *file = "./test_xdp_adjust_tail_grow.o";
+ const char *file = "./test_xdp_adjust_tail_grow.bpf.o";
__u32 exp_size;
struct bpf_program *prog;
struct bpf_object *obj;
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_attach.c
index 62aa3edda5e6..062fbc8c8e5e 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_attach.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_attach.c
@@ -8,7 +8,7 @@ void serial_test_xdp_attach(void)
{
__u32 duration = 0, id1, id2, id0 = 0, len;
struct bpf_object *obj1, *obj2, *obj3;
- const char *file = "./test_xdp.o";
+ const char *file = "./test_xdp.bpf.o";
struct bpf_prog_info info = {};
int err, fd1, fd2, fd3;
LIBBPF_OPTS(bpf_xdp_attach_opts, opts);
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_info.c b/tools/testing/selftests/bpf/prog_tests/xdp_info.c
index 0d01ff6cb91a..cd3aa340e65e 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_info.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_info.c
@@ -7,7 +7,7 @@
void serial_test_xdp_info(void)
{
__u32 len = sizeof(struct bpf_prog_info), duration = 0, prog_id;
- const char *file = "./xdp_dummy.o";
+ const char *file = "./xdp_dummy.bpf.o";
struct bpf_prog_info info = {};
struct bpf_object *obj;
int err, prog_fd;
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_perf.c b/tools/testing/selftests/bpf/prog_tests/xdp_perf.c
index f543d1bd21b8..ec5369f247cb 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_perf.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_perf.c
@@ -3,7 +3,7 @@
void test_xdp_perf(void)
{
- const char *file = "./xdp_dummy.o";
+ const char *file = "./xdp_dummy.bpf.o";
struct bpf_object *obj;
char in[128], out[128];
int err, prog_fd;
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c
index 874a846e298c..75550a40e029 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c
@@ -82,7 +82,7 @@ static void test_synproxy(bool xdp)
SYS("ethtool -K tmp0 tx off");
if (xdp)
/* Workaround required for veth. */
- SYS("ip link set tmp0 xdp object xdp_dummy.o section xdp 2> /dev/null");
+ SYS("ip link set tmp0 xdp object xdp_dummy.bpf.o section xdp 2> /dev/null");
ns = open_netns("synproxy");
if (!ASSERT_OK_PTR(ns, "setns"))
diff --git a/tools/testing/selftests/bpf/progs/bind4_prog.c b/tools/testing/selftests/bpf/progs/bind4_prog.c
index 474c6a62078a..a487f60b73ac 100644
--- a/tools/testing/selftests/bpf/progs/bind4_prog.c
+++ b/tools/testing/selftests/bpf/progs/bind4_prog.c
@@ -6,8 +6,6 @@
#include <linux/bpf.h>
#include <linux/in.h>
#include <linux/in6.h>
-#include <sys/socket.h>
-#include <netinet/tcp.h>
#include <linux/if.h>
#include <errno.h>
diff --git a/tools/testing/selftests/bpf/progs/bind6_prog.c b/tools/testing/selftests/bpf/progs/bind6_prog.c
index c19cfa869f30..d62cd9e9cf0e 100644
--- a/tools/testing/selftests/bpf/progs/bind6_prog.c
+++ b/tools/testing/selftests/bpf/progs/bind6_prog.c
@@ -6,8 +6,6 @@
#include <linux/bpf.h>
#include <linux/in.h>
#include <linux/in6.h>
-#include <sys/socket.h>
-#include <netinet/tcp.h>
#include <linux/if.h>
#include <errno.h>
diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c
index f266c757b3df..a20c5ed5e454 100644
--- a/tools/testing/selftests/bpf/progs/bpf_flow.c
+++ b/tools/testing/selftests/bpf/progs/bpf_flow.c
@@ -22,6 +22,8 @@
#define PROG(F) PROG_(F, _##F)
#define PROG_(NUM, NAME) SEC("flow_dissector") int flow_dissector_##NUM
+#define FLOW_CONTINUE_SADDR 0x7f00007f /* 127.0.0.127 */
+
/* These are the identifiers of the BPF programs that will be used in tail
* calls. Name is limited to 16 characters, with the terminating character and
* bpf_func_ above, we have only 6 to work with, anything after will be cropped.
@@ -143,6 +145,19 @@ int _dissect(struct __sk_buff *skb)
{
struct bpf_flow_keys *keys = skb->flow_keys;
+ if (keys->n_proto == bpf_htons(ETH_P_IP)) {
+ /* IP traffic from FLOW_CONTINUE_SADDR falls-back to
+ * standard dissector
+ */
+ struct iphdr *iph, _iph;
+
+ iph = bpf_flow_dissect_get_header(skb, sizeof(*iph), &_iph);
+ if (iph && iph->ihl == 5 &&
+ iph->saddr == bpf_htonl(FLOW_CONTINUE_SADDR)) {
+ return BPF_FLOW_DISSECTOR_CONTINUE;
+ }
+ }
+
return parse_eth_proto(skb, keys->n_proto);
}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter.h b/tools/testing/selftests/bpf/progs/bpf_iter.h
index e9846606690d..c41ee80533ca 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter.h
+++ b/tools/testing/selftests/bpf/progs/bpf_iter.h
@@ -17,6 +17,7 @@
#define bpf_iter__bpf_sk_storage_map bpf_iter__bpf_sk_storage_map___not_used
#define bpf_iter__sockmap bpf_iter__sockmap___not_used
#define bpf_iter__bpf_link bpf_iter__bpf_link___not_used
+#define bpf_iter__cgroup bpf_iter__cgroup___not_used
#define btf_ptr btf_ptr___not_used
#define BTF_F_COMPACT BTF_F_COMPACT___not_used
#define BTF_F_NONAME BTF_F_NONAME___not_used
@@ -40,6 +41,7 @@
#undef bpf_iter__bpf_sk_storage_map
#undef bpf_iter__sockmap
#undef bpf_iter__bpf_link
+#undef bpf_iter__cgroup
#undef btf_ptr
#undef BTF_F_COMPACT
#undef BTF_F_NONAME
@@ -141,6 +143,11 @@ struct bpf_iter__bpf_link {
struct bpf_link *link;
};
+struct bpf_iter__cgroup {
+ struct bpf_iter_meta *meta;
+ struct cgroup *cgroup;
+} __attribute__((preserve_access_index));
+
struct btf_ptr {
void *ptr;
__u32 type_id;
diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
index 98dd2c4815f0..adb087aecc9e 100644
--- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
+++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
@@ -6,13 +6,41 @@
#define AF_INET6 10
#define SOL_SOCKET 1
+#define SO_REUSEADDR 2
#define SO_SNDBUF 7
-#define __SO_ACCEPTCON (1 << 16)
+#define SO_RCVBUF 8
+#define SO_KEEPALIVE 9
#define SO_PRIORITY 12
+#define SO_REUSEPORT 15
+#define SO_RCVLOWAT 18
+#define SO_BINDTODEVICE 25
+#define SO_MARK 36
+#define SO_MAX_PACING_RATE 47
+#define SO_BINDTOIFINDEX 62
+#define SO_TXREHASH 74
+#define __SO_ACCEPTCON (1 << 16)
+
+#define IP_TOS 1
+
+#define IPV6_TCLASS 67
+#define IPV6_AUTOFLOWLABEL 70
#define SOL_TCP 6
+#define TCP_NODELAY 1
+#define TCP_MAXSEG 2
+#define TCP_KEEPIDLE 4
+#define TCP_KEEPINTVL 5
+#define TCP_KEEPCNT 6
+#define TCP_SYNCNT 7
+#define TCP_WINDOW_CLAMP 10
#define TCP_CONGESTION 13
+#define TCP_THIN_LINEAR_TIMEOUTS 16
+#define TCP_USER_TIMEOUT 18
+#define TCP_NOTSENT_LOWAT 25
+#define TCP_SAVE_SYN 27
+#define TCP_SAVED_SYN 28
#define TCP_CA_NAME_MAX 16
+#define TCP_NAGLE_OFF 1
#define ICSK_TIME_RETRANS 1
#define ICSK_TIME_PROBE0 3
@@ -49,6 +77,8 @@
#define sk_state __sk_common.skc_state
#define sk_v6_daddr __sk_common.skc_v6_daddr
#define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr
+#define sk_flags __sk_common.skc_flags
+#define sk_reuse __sk_common.skc_reuse
#define s6_addr32 in6_u.u6_addr32
diff --git a/tools/testing/selftests/bpf/progs/cb_refs.c b/tools/testing/selftests/bpf/progs/cb_refs.c
new file mode 100644
index 000000000000..7653df1bc787
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/cb_refs.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+
+struct map_value {
+ struct prog_test_ref_kfunc __kptr_ref *ptr;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, int);
+ __type(value, struct map_value);
+ __uint(max_entries, 16);
+} array_map SEC(".maps");
+
+extern struct prog_test_ref_kfunc *bpf_kfunc_call_test_acquire(unsigned long *sp) __ksym;
+extern void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) __ksym;
+
+static __noinline int cb1(void *map, void *key, void *value, void *ctx)
+{
+ void *p = *(void **)ctx;
+ bpf_kfunc_call_test_release(p);
+ /* Without the fix this would cause underflow */
+ return 0;
+}
+
+SEC("?tc")
+int underflow_prog(void *ctx)
+{
+ struct prog_test_ref_kfunc *p;
+ unsigned long sl = 0;
+
+ p = bpf_kfunc_call_test_acquire(&sl);
+ if (!p)
+ return 0;
+ bpf_for_each_map_elem(&array_map, cb1, &p, 0);
+ return 0;
+}
+
+static __always_inline int cb2(void *map, void *key, void *value, void *ctx)
+{
+ unsigned long sl = 0;
+
+ *(void **)ctx = bpf_kfunc_call_test_acquire(&sl);
+ /* Without the fix this would leak memory */
+ return 0;
+}
+
+SEC("?tc")
+int leak_prog(void *ctx)
+{
+ struct prog_test_ref_kfunc *p;
+ struct map_value *v;
+ unsigned long sl;
+
+ v = bpf_map_lookup_elem(&array_map, &(int){0});
+ if (!v)
+ return 0;
+
+ p = NULL;
+ bpf_for_each_map_elem(&array_map, cb2, &p, 0);
+ p = bpf_kptr_xchg(&v->ptr, p);
+ if (p)
+ bpf_kfunc_call_test_release(p);
+ return 0;
+}
+
+static __always_inline int cb(void *map, void *key, void *value, void *ctx)
+{
+ return 0;
+}
+
+static __always_inline int cb3(void *map, void *key, void *value, void *ctx)
+{
+ unsigned long sl = 0;
+ void *p;
+
+ bpf_kfunc_call_test_acquire(&sl);
+ bpf_for_each_map_elem(&array_map, cb, &p, 0);
+ /* It should only complain here, not in cb. This is why we need
+ * callback_ref to be set to frameno.
+ */
+ return 0;
+}
+
+SEC("?tc")
+int nested_cb(void *ctx)
+{
+ struct prog_test_ref_kfunc *p;
+ unsigned long sl = 0;
+ int sp = 0;
+
+ p = bpf_kfunc_call_test_acquire(&sl);
+ if (!p)
+ return 0;
+ bpf_for_each_map_elem(&array_map, cb3, &sp, 0);
+ bpf_kfunc_call_test_release(p);
+ return 0;
+}
+
+SEC("?tc")
+int non_cb_transfer_ref(void *ctx)
+{
+ struct prog_test_ref_kfunc *p;
+ unsigned long sl = 0;
+
+ p = bpf_kfunc_call_test_acquire(&sl);
+ if (!p)
+ return 0;
+ cb1(NULL, NULL, NULL, &p);
+ bpf_kfunc_call_test_acquire(&sl);
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/cgroup_getset_retval_hooks.c b/tools/testing/selftests/bpf/progs/cgroup_getset_retval_hooks.c
new file mode 100644
index 000000000000..13dfb4bbfd28
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/cgroup_getset_retval_hooks.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+#define BPF_RETVAL_HOOK(name, section, ctx, expected_err) \
+ __attribute__((__section__("?" section))) \
+ int name(struct ctx *_ctx) \
+ { \
+ bpf_set_retval(bpf_get_retval()); \
+ return 1; \
+ }
+
+#include "cgroup_getset_retval_hooks.h"
+
+#undef BPF_RETVAL_HOOK
diff --git a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c
new file mode 100644
index 000000000000..8ab4253a1592
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Functions to manage eBPF programs attached to cgroup subsystems
+ *
+ * Copyright 2022 Google LLC.
+ */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+char _license[] SEC("license") = "GPL";
+
+/*
+ * Start times are stored per-task, not per-cgroup, as multiple tasks in one
+ * cgroup can perform reclaim concurrently.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, __u64);
+} vmscan_start_time SEC(".maps");
+
+struct vmscan_percpu {
+ /* Previous percpu state, to figure out if we have new updates */
+ __u64 prev;
+ /* Current percpu state */
+ __u64 state;
+};
+
+struct vmscan {
+ /* State propagated through children, pending aggregation */
+ __u64 pending;
+ /* Total state, including all cpus and all children */
+ __u64 state;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+ __uint(max_entries, 100);
+ __type(key, __u64);
+ __type(value, struct vmscan_percpu);
+} pcpu_cgroup_vmscan_elapsed SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 100);
+ __type(key, __u64);
+ __type(value, struct vmscan);
+} cgroup_vmscan_elapsed SEC(".maps");
+
+extern void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) __ksym;
+extern void cgroup_rstat_flush(struct cgroup *cgrp) __ksym;
+
+static struct cgroup *task_memcg(struct task_struct *task)
+{
+ int cgrp_id;
+
+#if __has_builtin(__builtin_preserve_enum_value)
+ cgrp_id = bpf_core_enum_value(enum cgroup_subsys_id, memory_cgrp_id);
+#else
+ cgrp_id = memory_cgrp_id;
+#endif
+ return task->cgroups->subsys[cgrp_id]->cgroup;
+}
+
+static uint64_t cgroup_id(struct cgroup *cgrp)
+{
+ return cgrp->kn->id;
+}
+
+static int create_vmscan_percpu_elem(__u64 cg_id, __u64 state)
+{
+ struct vmscan_percpu pcpu_init = {.state = state, .prev = 0};
+
+ return bpf_map_update_elem(&pcpu_cgroup_vmscan_elapsed, &cg_id,
+ &pcpu_init, BPF_NOEXIST);
+}
+
+static int create_vmscan_elem(__u64 cg_id, __u64 state, __u64 pending)
+{
+ struct vmscan init = {.state = state, .pending = pending};
+
+ return bpf_map_update_elem(&cgroup_vmscan_elapsed, &cg_id,
+ &init, BPF_NOEXIST);
+}
+
+SEC("tp_btf/mm_vmscan_memcg_reclaim_begin")
+int BPF_PROG(vmscan_start, int order, gfp_t gfp_flags)
+{
+ struct task_struct *task = bpf_get_current_task_btf();
+ __u64 *start_time_ptr;
+
+ start_time_ptr = bpf_task_storage_get(&vmscan_start_time, task, 0,
+ BPF_LOCAL_STORAGE_GET_F_CREATE);
+ if (start_time_ptr)
+ *start_time_ptr = bpf_ktime_get_ns();
+ return 0;
+}
+
+SEC("tp_btf/mm_vmscan_memcg_reclaim_end")
+int BPF_PROG(vmscan_end, unsigned long nr_reclaimed)
+{
+ struct vmscan_percpu *pcpu_stat;
+ struct task_struct *current = bpf_get_current_task_btf();
+ struct cgroup *cgrp;
+ __u64 *start_time_ptr;
+ __u64 current_elapsed, cg_id;
+ __u64 end_time = bpf_ktime_get_ns();
+
+ /*
+ * cgrp is the first parent cgroup of current that has memcg enabled in
+ * its subtree_control, or NULL if memcg is disabled in the entire tree.
+ * In a cgroup hierarchy like this:
+ * a
+ * / \
+ * b c
+ * If "a" has memcg enabled, while "b" doesn't, then processes in "b"
+ * will accumulate their stats directly to "a". This makes sure that no
+ * stats are lost from processes in leaf cgroups that don't have memcg
+ * enabled, but only exposes stats for cgroups that have memcg enabled.
+ */
+ cgrp = task_memcg(current);
+ if (!cgrp)
+ return 0;
+
+ cg_id = cgroup_id(cgrp);
+ start_time_ptr = bpf_task_storage_get(&vmscan_start_time, current, 0,
+ BPF_LOCAL_STORAGE_GET_F_CREATE);
+ if (!start_time_ptr)
+ return 0;
+
+ current_elapsed = end_time - *start_time_ptr;
+ pcpu_stat = bpf_map_lookup_elem(&pcpu_cgroup_vmscan_elapsed,
+ &cg_id);
+ if (pcpu_stat)
+ pcpu_stat->state += current_elapsed;
+ else if (create_vmscan_percpu_elem(cg_id, current_elapsed))
+ return 0;
+
+ cgroup_rstat_updated(cgrp, bpf_get_smp_processor_id());
+ return 0;
+}
+
+SEC("fentry/bpf_rstat_flush")
+int BPF_PROG(vmscan_flush, struct cgroup *cgrp, struct cgroup *parent, int cpu)
+{
+ struct vmscan_percpu *pcpu_stat;
+ struct vmscan *total_stat, *parent_stat;
+ __u64 cg_id = cgroup_id(cgrp);
+ __u64 parent_cg_id = parent ? cgroup_id(parent) : 0;
+ __u64 *pcpu_vmscan;
+ __u64 state;
+ __u64 delta = 0;
+
+ /* Add CPU changes on this level since the last flush */
+ pcpu_stat = bpf_map_lookup_percpu_elem(&pcpu_cgroup_vmscan_elapsed,
+ &cg_id, cpu);
+ if (pcpu_stat) {
+ state = pcpu_stat->state;
+ delta += state - pcpu_stat->prev;
+ pcpu_stat->prev = state;
+ }
+
+ total_stat = bpf_map_lookup_elem(&cgroup_vmscan_elapsed, &cg_id);
+ if (!total_stat) {
+ if (create_vmscan_elem(cg_id, delta, 0))
+ return 0;
+
+ goto update_parent;
+ }
+
+ /* Collect pending stats from subtree */
+ if (total_stat->pending) {
+ delta += total_stat->pending;
+ total_stat->pending = 0;
+ }
+
+ /* Propagate changes to this cgroup's total */
+ total_stat->state += delta;
+
+update_parent:
+ /* Skip if there are no changes to propagate, or no parent */
+ if (!delta || !parent_cg_id)
+ return 0;
+
+ /* Propagate changes to cgroup's parent */
+ parent_stat = bpf_map_lookup_elem(&cgroup_vmscan_elapsed,
+ &parent_cg_id);
+ if (parent_stat)
+ parent_stat->pending += delta;
+ else
+ create_vmscan_elem(parent_cg_id, 0, delta);
+ return 0;
+}
+
+SEC("iter.s/cgroup")
+int BPF_PROG(dump_vmscan, struct bpf_iter_meta *meta, struct cgroup *cgrp)
+{
+ struct seq_file *seq = meta->seq;
+ struct vmscan *total_stat;
+ __u64 cg_id = cgrp ? cgroup_id(cgrp) : 0;
+
+ /* Do nothing for the terminal call */
+ if (!cg_id)
+ return 1;
+
+ /* Flush the stats to make sure we get the most updated numbers */
+ cgroup_rstat_flush(cgrp);
+
+ total_stat = bpf_map_lookup_elem(&cgroup_vmscan_elapsed, &cg_id);
+ if (!total_stat) {
+ BPF_SEQ_PRINTF(seq, "cg_id: %llu, total_vmscan_delay: 0\n",
+ cg_id);
+ } else {
+ BPF_SEQ_PRINTF(seq, "cg_id: %llu, total_vmscan_delay: %llu\n",
+ cg_id, total_stat->state);
+ }
+
+ /*
+ * We only dump stats for one cgroup here, so return 1 to stop
+ * iteration after the first cgroup.
+ */
+ return 1;
+}
diff --git a/tools/testing/selftests/bpf/progs/cgroup_iter.c b/tools/testing/selftests/bpf/progs/cgroup_iter.c
new file mode 100644
index 000000000000..de03997322a7
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/cgroup_iter.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Google */
+
+#include "bpf_iter.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+int terminate_early = 0;
+u64 terminal_cgroup = 0;
+
+static inline u64 cgroup_id(struct cgroup *cgrp)
+{
+ return cgrp->kn->id;
+}
+
+SEC("iter/cgroup")
+int cgroup_id_printer(struct bpf_iter__cgroup *ctx)
+{
+ struct seq_file *seq = ctx->meta->seq;
+ struct cgroup *cgrp = ctx->cgroup;
+
+ /* epilogue */
+ if (cgrp == NULL) {
+ BPF_SEQ_PRINTF(seq, "epilogue\n");
+ return 0;
+ }
+
+ /* prologue */
+ if (ctx->meta->seq_num == 0)
+ BPF_SEQ_PRINTF(seq, "prologue\n");
+
+ BPF_SEQ_PRINTF(seq, "%8llu\n", cgroup_id(cgrp));
+
+ if (terminal_cgroup == cgroup_id(cgrp))
+ return 1;
+
+ return terminate_early ? 1 : 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c
index b241932911db..ec25371de789 100644
--- a/tools/testing/selftests/bpf/progs/connect4_prog.c
+++ b/tools/testing/selftests/bpf/progs/connect4_prog.c
@@ -7,14 +7,15 @@
#include <linux/bpf.h>
#include <linux/in.h>
#include <linux/in6.h>
-#include <sys/socket.h>
-#include <netinet/tcp.h>
+#include <linux/tcp.h>
#include <linux/if.h>
#include <errno.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
+#include "bpf_tcp_helpers.h"
+
#define SRC_REWRITE_IP4 0x7f000004U
#define DST_REWRITE_IP4 0x7f000001U
#define DST_REWRITE_PORT4 4444
diff --git a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c
index 48cd14b43741..4547b059d487 100644
--- a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c
+++ b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c
@@ -73,10 +73,10 @@ int test_subprog2(struct args_subprog2 *ctx)
__builtin_preserve_access_index(&skb->len));
ret = ctx->ret;
- /* bpf_prog_test_load() loads "test_pkt_access.o" with BPF_F_TEST_RND_HI32
- * which randomizes upper 32 bits after BPF_ALU32 insns.
- * Hence after 'w0 <<= 1' upper bits of $rax are random.
- * That is expected and correct. Trim them.
+ /* bpf_prog_test_load() loads "test_pkt_access.bpf.o" with
+ * BPF_F_TEST_RND_HI32 which randomizes upper 32 bits after BPF_ALU32
+ * insns. Hence after 'w0 <<= 1' upper bits of $rax are random. That is
+ * expected and correct. Trim them.
*/
ret = (__u32) ret;
if (len != 74 || ret != 148)
diff --git a/tools/testing/selftests/bpf/progs/htab_update.c b/tools/testing/selftests/bpf/progs/htab_update.c
new file mode 100644
index 000000000000..7481bb30b29b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/htab_update.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2022. Huawei Technologies Co., Ltd */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 1);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u32));
+} htab SEC(".maps");
+
+int pid = 0;
+int update_err = 0;
+
+SEC("?fentry/lookup_elem_raw")
+int lookup_elem_raw(void *ctx)
+{
+ __u32 key = 0, value = 1;
+
+ if ((bpf_get_current_pid_tgid() >> 32) != pid)
+ return 0;
+
+ update_err = bpf_map_update_elem(&htab, &key, &value, 0);
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/read_bpf_task_storage_busy.c b/tools/testing/selftests/bpf/progs/read_bpf_task_storage_busy.c
new file mode 100644
index 000000000000..a47bb0120719
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/read_bpf_task_storage_busy.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2022. Huawei Technologies Co., Ltd */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+extern bool CONFIG_PREEMPT __kconfig __weak;
+extern const int bpf_task_storage_busy __ksym;
+
+char _license[] SEC("license") = "GPL";
+
+int pid = 0;
+int busy = 0;
+
+struct {
+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, long);
+} task SEC(".maps");
+
+SEC("raw_tp/sys_enter")
+int BPF_PROG(read_bpf_task_storage_busy)
+{
+ int *value;
+ int key;
+
+ if (!CONFIG_PREEMPT)
+ return 0;
+
+ if (bpf_get_current_pid_tgid() >> 32 != pid)
+ return 0;
+
+ value = bpf_this_cpu_ptr(&bpf_task_storage_busy);
+ if (value)
+ busy = *value;
+
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/setget_sockopt.c b/tools/testing/selftests/bpf/progs/setget_sockopt.c
new file mode 100644
index 000000000000..9523333b8905
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/setget_sockopt.c
@@ -0,0 +1,395 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) Meta Platforms, Inc. and affiliates. */
+
+#include "vmlinux.h"
+#include "bpf_tracing_net.h"
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+extern unsigned long CONFIG_HZ __kconfig;
+
+const volatile char veth[IFNAMSIZ];
+const volatile int veth_ifindex;
+
+int nr_listen;
+int nr_passive;
+int nr_active;
+int nr_connect;
+int nr_binddev;
+int nr_socket_post_create;
+
+struct sockopt_test {
+ int opt;
+ int new;
+ int restore;
+ int expected;
+ int tcp_expected;
+ unsigned int flip:1;
+};
+
+static const char not_exist_cc[] = "not_exist";
+static const char cubic_cc[] = "cubic";
+static const char reno_cc[] = "reno";
+
+static const struct sockopt_test sol_socket_tests[] = {
+ { .opt = SO_REUSEADDR, .flip = 1, },
+ { .opt = SO_SNDBUF, .new = 8123, .expected = 8123 * 2, },
+ { .opt = SO_RCVBUF, .new = 8123, .expected = 8123 * 2, },
+ { .opt = SO_KEEPALIVE, .flip = 1, },
+ { .opt = SO_PRIORITY, .new = 0xeb9f, .expected = 0xeb9f, },
+ { .opt = SO_REUSEPORT, .flip = 1, },
+ { .opt = SO_RCVLOWAT, .new = 8123, .expected = 8123, },
+ { .opt = SO_MARK, .new = 0xeb9f, .expected = 0xeb9f, },
+ { .opt = SO_MAX_PACING_RATE, .new = 0xeb9f, .expected = 0xeb9f, },
+ { .opt = SO_TXREHASH, .flip = 1, },
+ { .opt = 0, },
+};
+
+static const struct sockopt_test sol_tcp_tests[] = {
+ { .opt = TCP_NODELAY, .flip = 1, },
+ { .opt = TCP_KEEPIDLE, .new = 123, .expected = 123, .restore = 321, },
+ { .opt = TCP_KEEPINTVL, .new = 123, .expected = 123, .restore = 321, },
+ { .opt = TCP_KEEPCNT, .new = 123, .expected = 123, .restore = 124, },
+ { .opt = TCP_SYNCNT, .new = 123, .expected = 123, .restore = 124, },
+ { .opt = TCP_WINDOW_CLAMP, .new = 8123, .expected = 8123, .restore = 8124, },
+ { .opt = TCP_CONGESTION, },
+ { .opt = TCP_THIN_LINEAR_TIMEOUTS, .flip = 1, },
+ { .opt = TCP_USER_TIMEOUT, .new = 123400, .expected = 123400, },
+ { .opt = TCP_NOTSENT_LOWAT, .new = 1314, .expected = 1314, },
+ { .opt = 0, },
+};
+
+static const struct sockopt_test sol_ip_tests[] = {
+ { .opt = IP_TOS, .new = 0xe1, .expected = 0xe1, .tcp_expected = 0xe0, },
+ { .opt = 0, },
+};
+
+static const struct sockopt_test sol_ipv6_tests[] = {
+ { .opt = IPV6_TCLASS, .new = 0xe1, .expected = 0xe1, .tcp_expected = 0xe0, },
+ { .opt = IPV6_AUTOFLOWLABEL, .flip = 1, },
+ { .opt = 0, },
+};
+
+struct loop_ctx {
+ void *ctx;
+ struct sock *sk;
+};
+
+static int bpf_test_sockopt_flip(void *ctx, struct sock *sk,
+ const struct sockopt_test *t,
+ int level)
+{
+ int old, tmp, new, opt = t->opt;
+
+ opt = t->opt;
+
+ if (bpf_getsockopt(ctx, level, opt, &old, sizeof(old)))
+ return 1;
+ /* kernel initialized txrehash to 255 */
+ if (level == SOL_SOCKET && opt == SO_TXREHASH && old != 0 && old != 1)
+ old = 1;
+
+ new = !old;
+ if (bpf_setsockopt(ctx, level, opt, &new, sizeof(new)))
+ return 1;
+ if (bpf_getsockopt(ctx, level, opt, &tmp, sizeof(tmp)) ||
+ tmp != new)
+ return 1;
+
+ if (bpf_setsockopt(ctx, level, opt, &old, sizeof(old)))
+ return 1;
+
+ return 0;
+}
+
+static int bpf_test_sockopt_int(void *ctx, struct sock *sk,
+ const struct sockopt_test *t,
+ int level)
+{
+ int old, tmp, new, expected, opt;
+
+ opt = t->opt;
+ new = t->new;
+ if (sk->sk_type == SOCK_STREAM && t->tcp_expected)
+ expected = t->tcp_expected;
+ else
+ expected = t->expected;
+
+ if (bpf_getsockopt(ctx, level, opt, &old, sizeof(old)) ||
+ old == new)
+ return 1;
+
+ if (bpf_setsockopt(ctx, level, opt, &new, sizeof(new)))
+ return 1;
+ if (bpf_getsockopt(ctx, level, opt, &tmp, sizeof(tmp)) ||
+ tmp != expected)
+ return 1;
+
+ if (t->restore)
+ old = t->restore;
+ if (bpf_setsockopt(ctx, level, opt, &old, sizeof(old)))
+ return 1;
+
+ return 0;
+}
+
+static int bpf_test_socket_sockopt(__u32 i, struct loop_ctx *lc)
+{
+ const struct sockopt_test *t;
+
+ if (i >= ARRAY_SIZE(sol_socket_tests))
+ return 1;
+
+ t = &sol_socket_tests[i];
+ if (!t->opt)
+ return 1;
+
+ if (t->flip)
+ return bpf_test_sockopt_flip(lc->ctx, lc->sk, t, SOL_SOCKET);
+
+ return bpf_test_sockopt_int(lc->ctx, lc->sk, t, SOL_SOCKET);
+}
+
+static int bpf_test_ip_sockopt(__u32 i, struct loop_ctx *lc)
+{
+ const struct sockopt_test *t;
+
+ if (i >= ARRAY_SIZE(sol_ip_tests))
+ return 1;
+
+ t = &sol_ip_tests[i];
+ if (!t->opt)
+ return 1;
+
+ if (t->flip)
+ return bpf_test_sockopt_flip(lc->ctx, lc->sk, t, IPPROTO_IP);
+
+ return bpf_test_sockopt_int(lc->ctx, lc->sk, t, IPPROTO_IP);
+}
+
+static int bpf_test_ipv6_sockopt(__u32 i, struct loop_ctx *lc)
+{
+ const struct sockopt_test *t;
+
+ if (i >= ARRAY_SIZE(sol_ipv6_tests))
+ return 1;
+
+ t = &sol_ipv6_tests[i];
+ if (!t->opt)
+ return 1;
+
+ if (t->flip)
+ return bpf_test_sockopt_flip(lc->ctx, lc->sk, t, IPPROTO_IPV6);
+
+ return bpf_test_sockopt_int(lc->ctx, lc->sk, t, IPPROTO_IPV6);
+}
+
+static int bpf_test_tcp_sockopt(__u32 i, struct loop_ctx *lc)
+{
+ const struct sockopt_test *t;
+ struct sock *sk;
+ void *ctx;
+
+ if (i >= ARRAY_SIZE(sol_tcp_tests))
+ return 1;
+
+ t = &sol_tcp_tests[i];
+ if (!t->opt)
+ return 1;
+
+ ctx = lc->ctx;
+ sk = lc->sk;
+
+ if (t->opt == TCP_CONGESTION) {
+ char old_cc[16], tmp_cc[16];
+ const char *new_cc;
+ int new_cc_len;
+
+ if (!bpf_setsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION,
+ (void *)not_exist_cc, sizeof(not_exist_cc)))
+ return 1;
+ if (bpf_getsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, old_cc, sizeof(old_cc)))
+ return 1;
+ if (!bpf_strncmp(old_cc, sizeof(old_cc), cubic_cc)) {
+ new_cc = reno_cc;
+ new_cc_len = sizeof(reno_cc);
+ } else {
+ new_cc = cubic_cc;
+ new_cc_len = sizeof(cubic_cc);
+ }
+ if (bpf_setsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, (void *)new_cc,
+ new_cc_len))
+ return 1;
+ if (bpf_getsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, tmp_cc, sizeof(tmp_cc)))
+ return 1;
+ if (bpf_strncmp(tmp_cc, sizeof(tmp_cc), new_cc))
+ return 1;
+ if (bpf_setsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, old_cc, sizeof(old_cc)))
+ return 1;
+ return 0;
+ }
+
+ if (t->flip)
+ return bpf_test_sockopt_flip(ctx, sk, t, IPPROTO_TCP);
+
+ return bpf_test_sockopt_int(ctx, sk, t, IPPROTO_TCP);
+}
+
+static int bpf_test_sockopt(void *ctx, struct sock *sk)
+{
+ struct loop_ctx lc = { .ctx = ctx, .sk = sk, };
+ __u16 family, proto;
+ int n;
+
+ family = sk->sk_family;
+ proto = sk->sk_protocol;
+
+ n = bpf_loop(ARRAY_SIZE(sol_socket_tests), bpf_test_socket_sockopt, &lc, 0);
+ if (n != ARRAY_SIZE(sol_socket_tests))
+ return -1;
+
+ if (proto == IPPROTO_TCP) {
+ n = bpf_loop(ARRAY_SIZE(sol_tcp_tests), bpf_test_tcp_sockopt, &lc, 0);
+ if (n != ARRAY_SIZE(sol_tcp_tests))
+ return -1;
+ }
+
+ if (family == AF_INET) {
+ n = bpf_loop(ARRAY_SIZE(sol_ip_tests), bpf_test_ip_sockopt, &lc, 0);
+ if (n != ARRAY_SIZE(sol_ip_tests))
+ return -1;
+ } else {
+ n = bpf_loop(ARRAY_SIZE(sol_ipv6_tests), bpf_test_ipv6_sockopt, &lc, 0);
+ if (n != ARRAY_SIZE(sol_ipv6_tests))
+ return -1;
+ }
+
+ return 0;
+}
+
+static int binddev_test(void *ctx)
+{
+ const char empty_ifname[] = "";
+ int ifindex, zero = 0;
+
+ if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE,
+ (void *)veth, sizeof(veth)))
+ return -1;
+ if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
+ &ifindex, sizeof(int)) ||
+ ifindex != veth_ifindex)
+ return -1;
+
+ if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE,
+ (void *)empty_ifname, sizeof(empty_ifname)))
+ return -1;
+ if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
+ &ifindex, sizeof(int)) ||
+ ifindex != 0)
+ return -1;
+
+ if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
+ (void *)&veth_ifindex, sizeof(int)))
+ return -1;
+ if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
+ &ifindex, sizeof(int)) ||
+ ifindex != veth_ifindex)
+ return -1;
+
+ if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
+ &zero, sizeof(int)))
+ return -1;
+ if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
+ &ifindex, sizeof(int)) ||
+ ifindex != 0)
+ return -1;
+
+ return 0;
+}
+
+static int test_tcp_maxseg(void *ctx, struct sock *sk)
+{
+ int val = 1314, tmp;
+
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return bpf_setsockopt(ctx, IPPROTO_TCP, TCP_MAXSEG,
+ &val, sizeof(val));
+
+ if (bpf_getsockopt(ctx, IPPROTO_TCP, TCP_MAXSEG, &tmp, sizeof(tmp)) ||
+ tmp > val)
+ return -1;
+
+ return 0;
+}
+
+static int test_tcp_saved_syn(void *ctx, struct sock *sk)
+{
+ __u8 saved_syn[20];
+ int one = 1;
+
+ if (sk->sk_state == TCP_LISTEN)
+ return bpf_setsockopt(ctx, IPPROTO_TCP, TCP_SAVE_SYN,
+ &one, sizeof(one));
+
+ return bpf_getsockopt(ctx, IPPROTO_TCP, TCP_SAVED_SYN,
+ saved_syn, sizeof(saved_syn));
+}
+
+SEC("lsm_cgroup/socket_post_create")
+int BPF_PROG(socket_post_create, struct socket *sock, int family,
+ int type, int protocol, int kern)
+{
+ struct sock *sk = sock->sk;
+
+ if (!sk)
+ return 1;
+
+ nr_socket_post_create += !bpf_test_sockopt(sk, sk);
+ nr_binddev += !binddev_test(sk);
+
+ return 1;
+}
+
+SEC("sockops")
+int skops_sockopt(struct bpf_sock_ops *skops)
+{
+ struct bpf_sock *bpf_sk = skops->sk;
+ struct sock *sk;
+
+ if (!bpf_sk)
+ return 1;
+
+ sk = (struct sock *)bpf_skc_to_tcp_sock(bpf_sk);
+ if (!sk)
+ return 1;
+
+ switch (skops->op) {
+ case BPF_SOCK_OPS_TCP_LISTEN_CB:
+ nr_listen += !(bpf_test_sockopt(skops, sk) ||
+ test_tcp_maxseg(skops, sk) ||
+ test_tcp_saved_syn(skops, sk));
+ break;
+ case BPF_SOCK_OPS_TCP_CONNECT_CB:
+ nr_connect += !(bpf_test_sockopt(skops, sk) ||
+ test_tcp_maxseg(skops, sk));
+ break;
+ case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+ nr_active += !(bpf_test_sockopt(skops, sk) ||
+ test_tcp_maxseg(skops, sk));
+ break;
+ case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+ nr_passive += !(bpf_test_sockopt(skops, sk) ||
+ test_tcp_maxseg(skops, sk) ||
+ test_tcp_saved_syn(skops, sk));
+ break;
+ }
+
+ return 1;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c
index b67e8022d500..a017d6b2f1dd 100644
--- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c
+++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c
@@ -19,7 +19,7 @@ struct {
int count = 0;
int noise = 0;
-__always_inline int subprog_noise(void)
+static __always_inline int subprog_noise(void)
{
__u32 key = 0;
diff --git a/tools/testing/selftests/bpf/progs/test_tc_dtime.c b/tools/testing/selftests/bpf/progs/test_tc_dtime.c
index b596479a9ebe..125beec31834 100644
--- a/tools/testing/selftests/bpf/progs/test_tc_dtime.c
+++ b/tools/testing/selftests/bpf/progs/test_tc_dtime.c
@@ -15,7 +15,6 @@
#include <linux/udp.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
-#include <sys/socket.h>
/* veth_src --- veth_src_fwd --- veth_det_fwd --- veth_dst
* | |
diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
index df0673c4ecbe..98af55f0bcd3 100644
--- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
@@ -12,6 +12,7 @@
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
+#include <linux/if_tunnel.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/icmp.h>
@@ -386,7 +387,8 @@ int vxlan_get_tunnel_src(struct __sk_buff *skb)
__u32 orig_daddr;
__u32 index = 0;
- ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+ ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key),
+ BPF_F_TUNINFO_FLAGS);
if (ret < 0) {
log_err(ret);
return TC_ACT_SHOT;
@@ -398,10 +400,13 @@ int vxlan_get_tunnel_src(struct __sk_buff *skb)
return TC_ACT_SHOT;
}
- if (key.local_ipv4 != ASSIGNED_ADDR_VETH1 || md.gbp != 0x800FF) {
- bpf_printk("vxlan key %d local ip 0x%x remote ip 0x%x gbp 0x%x\n",
+ if (key.local_ipv4 != ASSIGNED_ADDR_VETH1 || md.gbp != 0x800FF ||
+ !(key.tunnel_flags & TUNNEL_KEY) ||
+ (key.tunnel_flags & TUNNEL_CSUM)) {
+ bpf_printk("vxlan key %d local ip 0x%x remote ip 0x%x gbp 0x%x flags 0x%x\n",
key.tunnel_id, key.local_ipv4,
- key.remote_ipv4, md.gbp);
+ key.remote_ipv4, md.gbp,
+ bpf_ntohs(key.tunnel_flags));
log_err(ret);
return TC_ACT_SHOT;
}
@@ -541,16 +546,19 @@ int ip6vxlan_get_tunnel_src(struct __sk_buff *skb)
}
ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key),
- BPF_F_TUNINFO_IPV6);
+ BPF_F_TUNINFO_IPV6 | BPF_F_TUNINFO_FLAGS);
if (ret < 0) {
log_err(ret);
return TC_ACT_SHOT;
}
- if (bpf_ntohl(key.local_ipv6[3]) != *local_ip) {
- bpf_printk("ip6vxlan key %d local ip6 ::%x remote ip6 ::%x label 0x%x\n",
+ if (bpf_ntohl(key.local_ipv6[3]) != *local_ip ||
+ !(key.tunnel_flags & TUNNEL_KEY) ||
+ !(key.tunnel_flags & TUNNEL_CSUM)) {
+ bpf_printk("ip6vxlan key %d local ip6 ::%x remote ip6 ::%x label 0x%x flags 0x%x\n",
key.tunnel_id, bpf_ntohl(key.local_ipv6[3]),
- bpf_ntohl(key.remote_ipv6[3]), key.tunnel_label);
+ bpf_ntohl(key.remote_ipv6[3]), key.tunnel_label,
+ bpf_ntohs(key.tunnel_flags));
bpf_printk("local_ip 0x%x\n", *local_ip);
log_err(ret);
return TC_ACT_SHOT;
diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c
index 5f5309791649..0053c5402173 100644
--- a/tools/testing/selftests/bpf/progs/timer.c
+++ b/tools/testing/selftests/bpf/progs/timer.c
@@ -208,17 +208,6 @@ static int timer_cb2(void *map, int *key, struct hmap_elem *val)
*/
bpf_map_delete_elem(map, key);
- /* in non-preallocated hashmap both 'key' and 'val' are RCU
- * protected and still valid though this element was deleted
- * from the map. Arm this timer for ~35 seconds. When callback
- * finishes the call_rcu will invoke:
- * htab_elem_free_rcu
- * check_and_free_timer
- * bpf_timer_cancel_and_free
- * to cancel this 35 second sleep and delete the timer for real.
- */
- if (bpf_timer_start(&val->timer, 1ull << 35, 0) != 0)
- err |= 256;
ok |= 4;
}
return 0;
diff --git a/tools/testing/selftests/bpf/task_local_storage_helpers.h b/tools/testing/selftests/bpf/task_local_storage_helpers.h
new file mode 100644
index 000000000000..711d5abb7d51
--- /dev/null
+++ b/tools/testing/selftests/bpf/task_local_storage_helpers.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TASK_LOCAL_STORAGE_HELPER_H
+#define __TASK_LOCAL_STORAGE_HELPER_H
+
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+
+#ifndef __NR_pidfd_open
+#define __NR_pidfd_open 434
+#endif
+
+static inline int sys_pidfd_open(pid_t pid, unsigned int flags)
+{
+ return syscall(__NR_pidfd_open, pid, flags);
+}
+
+#endif
diff --git a/tools/testing/selftests/bpf/test_dev_cgroup.c b/tools/testing/selftests/bpf/test_dev_cgroup.c
index 7886265846a0..adeaf63cb6fa 100644
--- a/tools/testing/selftests/bpf/test_dev_cgroup.c
+++ b/tools/testing/selftests/bpf/test_dev_cgroup.c
@@ -16,7 +16,7 @@
#include "cgroup_helpers.h"
#include "testing_helpers.h"
-#define DEV_CGROUP_PROG "./dev_cgroup.o"
+#define DEV_CGROUP_PROG "./dev_cgroup.bpf.o"
#define TEST_CGROUP "/test-bpf-based-device-cgroup/"
diff --git a/tools/testing/selftests/bpf/test_flow_dissector.sh b/tools/testing/selftests/bpf/test_flow_dissector.sh
index dbd91221727d..5303ce0c977b 100755
--- a/tools/testing/selftests/bpf/test_flow_dissector.sh
+++ b/tools/testing/selftests/bpf/test_flow_dissector.sh
@@ -115,6 +115,14 @@ tc filter add dev lo parent ffff: protocol ip pref 1337 flower ip_proto \
# Send 10 IPv4/UDP packets from port 10. Filter should not drop any.
./test_flow_dissector -i 4 -f 10
+echo "Testing IPv4 from 127.0.0.127 (fallback to generic dissector)..."
+# Send 10 IPv4/UDP packets from port 8. Filter should not drop any.
+./test_flow_dissector -i 4 -S 127.0.0.127 -f 8
+# Send 10 IPv4/UDP packets from port 9. Filter should drop all.
+./test_flow_dissector -i 4 -S 127.0.0.127 -f 9 -F
+# Send 10 IPv4/UDP packets from port 10. Filter should not drop any.
+./test_flow_dissector -i 4 -S 127.0.0.127 -f 10
+
echo "Testing IPIP..."
# Send 10 IPv4/IPv4/UDP packets from port 8. Filter should not drop any.
./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e bare -i 4 \
diff --git a/tools/testing/selftests/bpf/test_lirc_mode2_user.c b/tools/testing/selftests/bpf/test_lirc_mode2_user.c
index 2893e9f2f1e0..4694422aa76c 100644
--- a/tools/testing/selftests/bpf/test_lirc_mode2_user.c
+++ b/tools/testing/selftests/bpf/test_lirc_mode2_user.c
@@ -59,7 +59,7 @@ int main(int argc, char **argv)
return 2;
}
- ret = bpf_prog_test_load("test_lirc_mode2_kern.o",
+ ret = bpf_prog_test_load("test_lirc_mode2_kern.bpf.o",
BPF_PROG_TYPE_LIRC_MODE2, &obj, &progfd);
if (ret) {
printf("Failed to load bpf program\n");
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index cbebfaa7c1e8..00b9cc305e58 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -264,10 +264,11 @@ static void test_hashmap_percpu(unsigned int task, void *data)
close(fd);
}
+#define VALUE_SIZE 3
static int helper_fill_hashmap(int max_entries)
{
int i, fd, ret;
- long long key, value;
+ long long key, value[VALUE_SIZE] = {};
fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(key), sizeof(value),
max_entries, &map_opts);
@@ -276,8 +277,8 @@ static int helper_fill_hashmap(int max_entries)
"err: %s, flags: 0x%x\n", strerror(errno), map_opts.map_flags);
for (i = 0; i < max_entries; i++) {
- key = i; value = key;
- ret = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST);
+ key = i; value[0] = key;
+ ret = bpf_map_update_elem(fd, &key, value, BPF_NOEXIST);
CHECK(ret != 0,
"can't update hashmap",
"err: %s\n", strerror(ret));
@@ -288,8 +289,8 @@ static int helper_fill_hashmap(int max_entries)
static void test_hashmap_walk(unsigned int task, void *data)
{
- int fd, i, max_entries = 1000;
- long long key, value, next_key;
+ int fd, i, max_entries = 10000;
+ long long key, value[VALUE_SIZE], next_key;
bool next_key_valid = true;
fd = helper_fill_hashmap(max_entries);
@@ -297,7 +298,7 @@ static void test_hashmap_walk(unsigned int task, void *data)
for (i = 0; bpf_map_get_next_key(fd, !i ? NULL : &key,
&next_key) == 0; i++) {
key = next_key;
- assert(bpf_map_lookup_elem(fd, &key, &value) == 0);
+ assert(bpf_map_lookup_elem(fd, &key, value) == 0);
}
assert(i == max_entries);
@@ -305,9 +306,9 @@ static void test_hashmap_walk(unsigned int task, void *data)
assert(bpf_map_get_next_key(fd, NULL, &key) == 0);
for (i = 0; next_key_valid; i++) {
next_key_valid = bpf_map_get_next_key(fd, &key, &next_key) == 0;
- assert(bpf_map_lookup_elem(fd, &key, &value) == 0);
- value++;
- assert(bpf_map_update_elem(fd, &key, &value, BPF_EXIST) == 0);
+ assert(bpf_map_lookup_elem(fd, &key, value) == 0);
+ value[0]++;
+ assert(bpf_map_update_elem(fd, &key, value, BPF_EXIST) == 0);
key = next_key;
}
@@ -316,8 +317,8 @@ static void test_hashmap_walk(unsigned int task, void *data)
for (i = 0; bpf_map_get_next_key(fd, !i ? NULL : &key,
&next_key) == 0; i++) {
key = next_key;
- assert(bpf_map_lookup_elem(fd, &key, &value) == 0);
- assert(value - 1 == key);
+ assert(bpf_map_lookup_elem(fd, &key, value) == 0);
+ assert(value[0] - 1 == key);
}
assert(i == max_entries);
@@ -651,9 +652,9 @@ static void test_stackmap(unsigned int task, void *data)
#include <arpa/inet.h>
#include <sys/select.h>
#include <linux/err.h>
-#define SOCKMAP_PARSE_PROG "./sockmap_parse_prog.o"
-#define SOCKMAP_VERDICT_PROG "./sockmap_verdict_prog.o"
-#define SOCKMAP_TCP_MSG_PROG "./sockmap_tcp_msg_prog.o"
+#define SOCKMAP_PARSE_PROG "./sockmap_parse_prog.bpf.o"
+#define SOCKMAP_VERDICT_PROG "./sockmap_verdict_prog.bpf.o"
+#define SOCKMAP_TCP_MSG_PROG "./sockmap_tcp_msg_prog.bpf.o"
static void test_sockmap(unsigned int tasks, void *data)
{
struct bpf_map *bpf_map_rx, *bpf_map_tx, *bpf_map_msg, *bpf_map_break;
@@ -1143,8 +1144,8 @@ out_sockmap:
exit(1);
}
-#define MAPINMAP_PROG "./test_map_in_map.o"
-#define MAPINMAP_INVALID_PROG "./test_map_in_map_invalid.o"
+#define MAPINMAP_PROG "./test_map_in_map.bpf.o"
+#define MAPINMAP_INVALID_PROG "./test_map_in_map_invalid.bpf.o"
static void test_map_in_map(void)
{
struct bpf_object *obj;
@@ -1371,16 +1372,16 @@ static void __run_parallel(unsigned int tasks,
static void test_map_stress(void)
{
+ run_parallel(100, test_hashmap_walk, NULL);
run_parallel(100, test_hashmap, NULL);
run_parallel(100, test_hashmap_percpu, NULL);
run_parallel(100, test_hashmap_sizes, NULL);
- run_parallel(100, test_hashmap_walk, NULL);
run_parallel(100, test_arraymap, NULL);
run_parallel(100, test_arraymap_percpu, NULL);
}
-#define TASKS 1024
+#define TASKS 100
#define DO_UPDATE 1
#define DO_DELETE 0
@@ -1432,6 +1433,8 @@ static void test_update_delete(unsigned int fn, void *data)
int fd = ((int *)data)[0];
int i, key, value, err;
+ if (fn & 1)
+ test_hashmap_walk(fn, NULL);
for (i = fn; i < MAP_SIZE; i += TASKS) {
key = value = i;
@@ -1455,7 +1458,7 @@ static void test_update_delete(unsigned int fn, void *data)
static void test_map_parallel(void)
{
- int i, fd, key = 0, value = 0;
+ int i, fd, key = 0, value = 0, j = 0;
int data[2];
fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(key), sizeof(value),
@@ -1466,6 +1469,7 @@ static void test_map_parallel(void)
exit(1);
}
+again:
/* Use the same fd in children to add elements to this map:
* child_0 adds key=0, key=1024, key=2048, ...
* child_1 adds key=1, key=1025, key=2049, ...
@@ -1502,6 +1506,12 @@ static void test_map_parallel(void)
key = -1;
assert(bpf_map_get_next_key(fd, NULL, &key) < 0 && errno == ENOENT);
assert(bpf_map_get_next_key(fd, &key, &key) < 0 && errno == ENOENT);
+
+ key = 0;
+ bpf_map_delete_elem(fd, &key);
+ if (j++ < 5)
+ goto again;
+ close(fd);
}
static void test_map_rdonly(void)
diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py
index 6cd6ef9fc20b..7fc15e0d24a9 100755
--- a/tools/testing/selftests/bpf/test_offload.py
+++ b/tools/testing/selftests/bpf/test_offload.py
@@ -782,7 +782,7 @@ if out.find("/sys/kernel/debug type debugfs") == -1:
cmd("mount -t debugfs none /sys/kernel/debug")
# Check samples are compiled
-samples = ["sample_ret0.o", "sample_map_ret0.o"]
+samples = ["sample_ret0.bpf.o", "sample_map_ret0.bpf.o"]
for s in samples:
ret, out = cmd("ls %s/%s" % (bpf_test_dir, s), fail=False)
skip(ret != 0, "sample %s/%s not found, please compile it" %
@@ -803,7 +803,7 @@ cmd("ip netns delete %s" % (ns))
netns = []
try:
- obj = bpf_obj("sample_ret0.o")
+ obj = bpf_obj("sample_ret0.bpf.o")
bytecode = bpf_bytecode("1,6 0 0 4294967295,")
start_test("Test destruction of generic XDP...")
@@ -1023,7 +1023,7 @@ try:
sim.wait_for_flush()
start_test("Test non-offload XDP attaching to HW...")
- bpftool_prog_load("sample_ret0.o", "/sys/fs/bpf/nooffload")
+ bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/nooffload")
nooffload = bpf_pinned("/sys/fs/bpf/nooffload")
ret, _, err = sim.set_xdp(nooffload, "offload",
fail=False, include_stderr=True)
@@ -1032,7 +1032,7 @@ try:
rm("/sys/fs/bpf/nooffload")
start_test("Test offload XDP attaching to drv...")
- bpftool_prog_load("sample_ret0.o", "/sys/fs/bpf/offload",
+ bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/offload",
dev=sim['ifname'])
offload = bpf_pinned("/sys/fs/bpf/offload")
ret, _, err = sim.set_xdp(offload, "drv", fail=False, include_stderr=True)
@@ -1043,7 +1043,7 @@ try:
start_test("Test XDP load failure...")
sim.dfs["dev/bpf_bind_verifier_accept"] = 0
- ret, _, err = bpftool_prog_load("sample_ret0.o", "/sys/fs/bpf/offload",
+ ret, _, err = bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/offload",
dev=sim['ifname'], fail=False, include_stderr=True)
fail(ret == 0, "verifier should fail on load")
check_verifier_log(err, "[netdevsim] Hello from netdevsim!")
@@ -1169,7 +1169,7 @@ try:
simdev = NetdevSimDev()
sim, = simdev.nsims
- map_obj = bpf_obj("sample_map_ret0.o")
+ map_obj = bpf_obj("sample_map_ret0.bpf.o")
start_test("Test loading program with maps...")
sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON
@@ -1307,10 +1307,10 @@ try:
sims = (simA, simB1, simB2, simB3)
simB = (simB1, simB2, simB3)
- bpftool_prog_load("sample_map_ret0.o", "/sys/fs/bpf/nsimA",
+ bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimA",
dev=simA['ifname'])
progA = bpf_pinned("/sys/fs/bpf/nsimA")
- bpftool_prog_load("sample_map_ret0.o", "/sys/fs/bpf/nsimB",
+ bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimB",
dev=simB1['ifname'])
progB = bpf_pinned("/sys/fs/bpf/nsimB")
@@ -1344,14 +1344,14 @@ try:
mapA = bpftool("prog show %s" % (progA))[1]["map_ids"][0]
mapB = bpftool("prog show %s" % (progB))[1]["map_ids"][0]
- ret, _ = bpftool_prog_load("sample_map_ret0.o", "/sys/fs/bpf/nsimB_",
+ ret, _ = bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimB_",
dev=simB3['ifname'],
maps=["idx 0 id %d" % (mapB)],
fail=False)
fail(ret != 0, "couldn't reuse a map on the same ASIC")
rm("/sys/fs/bpf/nsimB_")
- ret, _, err = bpftool_prog_load("sample_map_ret0.o", "/sys/fs/bpf/nsimA_",
+ ret, _, err = bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimA_",
dev=simA['ifname'],
maps=["idx 0 id %d" % (mapB)],
fail=False, include_stderr=True)
@@ -1359,7 +1359,7 @@ try:
fail(err.count("offload device mismatch between prog and map") == 0,
"error message missing for cross-ASIC map")
- ret, _, err = bpftool_prog_load("sample_map_ret0.o", "/sys/fs/bpf/nsimB_",
+ ret, _, err = bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimB_",
dev=simB1['ifname'],
maps=["idx 0 id %d" % (mapA)],
fail=False, include_stderr=True)
diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id.sh b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh
index a9bc6f82abc1..515c2eafc97f 100755
--- a/tools/testing/selftests/bpf/test_skb_cgroup_id.sh
+++ b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh
@@ -54,7 +54,7 @@ DIR=$(dirname $0)
TEST_IF="test_cgid_1"
TEST_IF_PEER="test_cgid_2"
MAX_PING_TRIES=5
-BPF_PROG_OBJ="${DIR}/test_skb_cgroup_id_kern.o"
+BPF_PROG_OBJ="${DIR}/test_skb_cgroup_id_kern.bpf.o"
BPF_PROG_SECTION="cgroup_id_logger"
BPF_PROG_ID=0
PROG="${DIR}/test_skb_cgroup_id_user"
diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c
index 458564fcfc82..2c89674fc62c 100644
--- a/tools/testing/selftests/bpf/test_sock_addr.c
+++ b/tools/testing/selftests/bpf/test_sock_addr.c
@@ -26,14 +26,14 @@
#endif
#define CG_PATH "/foo"
-#define CONNECT4_PROG_PATH "./connect4_prog.o"
-#define CONNECT6_PROG_PATH "./connect6_prog.o"
-#define SENDMSG4_PROG_PATH "./sendmsg4_prog.o"
-#define SENDMSG6_PROG_PATH "./sendmsg6_prog.o"
-#define RECVMSG4_PROG_PATH "./recvmsg4_prog.o"
-#define RECVMSG6_PROG_PATH "./recvmsg6_prog.o"
-#define BIND4_PROG_PATH "./bind4_prog.o"
-#define BIND6_PROG_PATH "./bind6_prog.o"
+#define CONNECT4_PROG_PATH "./connect4_prog.bpf.o"
+#define CONNECT6_PROG_PATH "./connect6_prog.bpf.o"
+#define SENDMSG4_PROG_PATH "./sendmsg4_prog.bpf.o"
+#define SENDMSG6_PROG_PATH "./sendmsg6_prog.bpf.o"
+#define RECVMSG4_PROG_PATH "./recvmsg4_prog.bpf.o"
+#define RECVMSG6_PROG_PATH "./recvmsg6_prog.bpf.o"
+#define BIND4_PROG_PATH "./bind4_prog.bpf.o"
+#define BIND6_PROG_PATH "./bind6_prog.bpf.o"
#define SERV4_IP "192.168.1.254"
#define SERV4_REWRITE_IP "127.0.0.1"
diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 0fbaccdc8861..dcb038e342d8 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -52,8 +52,8 @@ static void running_handler(int a);
#define S1_PORT 10000
#define S2_PORT 10001
-#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o"
-#define BPF_SOCKHASH_FILENAME "test_sockhash_kern.o"
+#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.bpf.o"
+#define BPF_SOCKHASH_FILENAME "test_sockhash_kern.bpf.o"
#define CG_PATH "/sockmap"
/* global sockets */
diff --git a/tools/testing/selftests/bpf/test_sysctl.c b/tools/testing/selftests/bpf/test_sysctl.c
index 57620e7c9048..bcdbd27f22f0 100644
--- a/tools/testing/selftests/bpf/test_sysctl.c
+++ b/tools/testing/selftests/bpf/test_sysctl.c
@@ -1372,7 +1372,7 @@ static struct sysctl_test tests[] = {
},
{
"C prog: deny all writes",
- .prog_file = "./test_sysctl_prog.o",
+ .prog_file = "./test_sysctl_prog.bpf.o",
.attach_type = BPF_CGROUP_SYSCTL,
.sysctl = "net/ipv4/tcp_mem",
.open_flags = O_WRONLY,
@@ -1381,7 +1381,7 @@ static struct sysctl_test tests[] = {
},
{
"C prog: deny access by name",
- .prog_file = "./test_sysctl_prog.o",
+ .prog_file = "./test_sysctl_prog.bpf.o",
.attach_type = BPF_CGROUP_SYSCTL,
.sysctl = "net/ipv4/route/mtu_expires",
.open_flags = O_RDONLY,
@@ -1389,7 +1389,7 @@ static struct sysctl_test tests[] = {
},
{
"C prog: read tcp_mem",
- .prog_file = "./test_sysctl_prog.o",
+ .prog_file = "./test_sysctl_prog.bpf.o",
.attach_type = BPF_CGROUP_SYSCTL,
.sysctl = "net/ipv4/tcp_mem",
.open_flags = O_RDONLY,
diff --git a/tools/testing/selftests/bpf/test_tcp_check_syncookie.sh b/tools/testing/selftests/bpf/test_tcp_check_syncookie.sh
index 102e6588e2fe..b42c24282c25 100755
--- a/tools/testing/selftests/bpf/test_tcp_check_syncookie.sh
+++ b/tools/testing/selftests/bpf/test_tcp_check_syncookie.sh
@@ -76,7 +76,7 @@ main()
DIR=$(dirname $0)
TEST_IF=lo
MAX_PING_TRIES=5
-BPF_PROG_OBJ="${DIR}/test_tcp_check_syncookie_kern.o"
+BPF_PROG_OBJ="${DIR}/test_tcp_check_syncookie_kern.bpf.o"
CLSACT_SECTION="tc"
XDP_SECTION="xdp"
BPF_PROG_ID=0
diff --git a/tools/testing/selftests/bpf/test_tcpnotify_user.c b/tools/testing/selftests/bpf/test_tcpnotify_user.c
index 8284db8b0f13..595194453ff8 100644
--- a/tools/testing/selftests/bpf/test_tcpnotify_user.c
+++ b/tools/testing/selftests/bpf/test_tcpnotify_user.c
@@ -69,7 +69,7 @@ int verify_result(const struct tcpnotify_globals *result)
int main(int argc, char **argv)
{
- const char *file = "test_tcpnotify_kern.o";
+ const char *file = "test_tcpnotify_kern.bpf.o";
struct bpf_map *perf_map, *global_map;
struct tcpnotify_globals g = {0};
struct perf_buffer *pb = NULL;
diff --git a/tools/testing/selftests/bpf/test_xdp_redirect.sh b/tools/testing/selftests/bpf/test_xdp_redirect.sh
index 1d79f31480ad..0746a4fde9d3 100755
--- a/tools/testing/selftests/bpf/test_xdp_redirect.sh
+++ b/tools/testing/selftests/bpf/test_xdp_redirect.sh
@@ -54,10 +54,10 @@ test_xdp_redirect()
return 0
fi
- ip -n ${NS1} link set veth11 $xdpmode obj xdp_dummy.o sec xdp &> /dev/null
- ip -n ${NS2} link set veth22 $xdpmode obj xdp_dummy.o sec xdp &> /dev/null
- ip link set dev veth1 $xdpmode obj test_xdp_redirect.o sec redirect_to_222 &> /dev/null
- ip link set dev veth2 $xdpmode obj test_xdp_redirect.o sec redirect_to_111 &> /dev/null
+ ip -n ${NS1} link set veth11 $xdpmode obj xdp_dummy.bpf.o sec xdp &> /dev/null
+ ip -n ${NS2} link set veth22 $xdpmode obj xdp_dummy.bpf.o sec xdp &> /dev/null
+ ip link set dev veth1 $xdpmode obj test_xdp_redirect.bpf.o sec redirect_to_222 &> /dev/null
+ ip link set dev veth2 $xdpmode obj test_xdp_redirect.bpf.o sec redirect_to_111 &> /dev/null
if ip netns exec ${NS1} ping -c 1 10.1.1.22 &> /dev/null &&
ip netns exec ${NS2} ping -c 1 10.1.1.11 &> /dev/null; then
diff --git a/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh b/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh
index cc57cb87e65f..4c3c3fdd2d73 100755
--- a/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh
+++ b/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh
@@ -94,7 +94,7 @@ setup_ns()
# Add a neigh entry for IPv4 ping test
ip -n ${NS[$i]} neigh add 192.0.2.253 lladdr 00:00:00:00:00:01 dev veth0
ip -n ${NS[$i]} link set veth0 $mode obj \
- xdp_dummy.o sec xdp &> /dev/null || \
+ xdp_dummy.bpf.o sec xdp &> /dev/null || \
{ test_fail "Unable to load dummy xdp" && exit 1; }
IFACES="$IFACES veth$i"
veth_mac[$i]=$(ip -n ${NS[0]} link show veth$i | awk '/link\/ether/ {print $2}')
diff --git a/tools/testing/selftests/bpf/test_xdp_veth.sh b/tools/testing/selftests/bpf/test_xdp_veth.sh
index 49936c4c8567..5211ca9a0239 100755
--- a/tools/testing/selftests/bpf/test_xdp_veth.sh
+++ b/tools/testing/selftests/bpf/test_xdp_veth.sh
@@ -101,7 +101,7 @@ ip -n ${NS3} link set dev veth33 up
mkdir $BPF_DIR
bpftool prog loadall \
- xdp_redirect_map.o $BPF_DIR/progs type xdp \
+ xdp_redirect_map.bpf.o $BPF_DIR/progs type xdp \
pinmaps $BPF_DIR/maps
bpftool map update pinned $BPF_DIR/maps/tx_port key 0 0 0 0 value 122 0 0 0
bpftool map update pinned $BPF_DIR/maps/tx_port key 1 0 0 0 value 133 0 0 0
@@ -110,9 +110,9 @@ ip link set dev veth1 xdp pinned $BPF_DIR/progs/xdp_redirect_map_0
ip link set dev veth2 xdp pinned $BPF_DIR/progs/xdp_redirect_map_1
ip link set dev veth3 xdp pinned $BPF_DIR/progs/xdp_redirect_map_2
-ip -n ${NS1} link set dev veth11 xdp obj xdp_dummy.o sec xdp
-ip -n ${NS2} link set dev veth22 xdp obj xdp_tx.o sec xdp
-ip -n ${NS3} link set dev veth33 xdp obj xdp_dummy.o sec xdp
+ip -n ${NS1} link set dev veth11 xdp obj xdp_dummy.bpf.o sec xdp
+ip -n ${NS2} link set dev veth22 xdp obj xdp_tx.bpf.o sec xdp
+ip -n ${NS3} link set dev veth33 xdp obj xdp_dummy.bpf.o sec xdp
trap cleanup EXIT
diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh
index 096a957594cd..d821fd098504 100755
--- a/tools/testing/selftests/bpf/test_xsk.sh
+++ b/tools/testing/selftests/bpf/test_xsk.sh
@@ -73,14 +73,20 @@
#
# Run and dump packet contents:
# sudo ./test_xsk.sh -D
+#
+# Run test suite for physical device in loopback mode
+# sudo ./test_xsk.sh -i IFACE
. xsk_prereqs.sh
-while getopts "vD" flag
+ETH=""
+
+while getopts "vDi:" flag
do
case "${flag}" in
v) verbose=1;;
D) dump_pkts=1;;
+ i) ETH=${OPTARG};;
esac
done
@@ -132,18 +138,25 @@ setup_vethPairs() {
ip link set ${VETH0} up
}
-validate_root_exec
-validate_veth_support ${VETH0}
-validate_ip_utility
-setup_vethPairs
-
-retval=$?
-if [ $retval -ne 0 ]; then
- test_status $retval "${TEST_NAME}"
- cleanup_exit ${VETH0} ${VETH1} ${NS1}
- exit $retval
+if [ ! -z $ETH ]; then
+ VETH0=${ETH}
+ VETH1=${ETH}
+ NS1=""
+else
+ validate_root_exec
+ validate_veth_support ${VETH0}
+ validate_ip_utility
+ setup_vethPairs
+
+ retval=$?
+ if [ $retval -ne 0 ]; then
+ test_status $retval "${TEST_NAME}"
+ cleanup_exit ${VETH0} ${VETH1} ${NS1}
+ exit $retval
+ fi
fi
+
if [[ $verbose -eq 1 ]]; then
ARGS+="-v "
fi
@@ -152,26 +165,33 @@ if [[ $dump_pkts -eq 1 ]]; then
ARGS="-D "
fi
+retval=$?
test_status $retval "${TEST_NAME}"
## START TESTS
statusList=()
-TEST_NAME="XSK_SELFTESTS_SOFTIRQ"
+TEST_NAME="XSK_SELFTESTS_${VETH0}_SOFTIRQ"
exec_xskxceiver
-cleanup_exit ${VETH0} ${VETH1} ${NS1}
-TEST_NAME="XSK_SELFTESTS_BUSY_POLL"
+if [ -z $ETH ]; then
+ cleanup_exit ${VETH0} ${VETH1} ${NS1}
+fi
+TEST_NAME="XSK_SELFTESTS_${VETH0}_BUSY_POLL"
busy_poll=1
-setup_vethPairs
+if [ -z $ETH ]; then
+ setup_vethPairs
+fi
exec_xskxceiver
## END TESTS
-cleanup_exit ${VETH0} ${VETH1} ${NS1}
+if [ -z $ETH ]; then
+ cleanup_exit ${VETH0} ${VETH1} ${NS1}
+fi
failures=0
echo -e "\nSummary:"
diff --git a/tools/testing/selftests/bpf/xdp_redirect_multi.c b/tools/testing/selftests/bpf/xdp_redirect_multi.c
index c03b3a75991f..c1fc44c87c30 100644
--- a/tools/testing/selftests/bpf/xdp_redirect_multi.c
+++ b/tools/testing/selftests/bpf/xdp_redirect_multi.c
@@ -142,7 +142,7 @@ int main(int argc, char **argv)
}
printf("\n");
- snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ snprintf(filename, sizeof(filename), "%s_kern.bpf.o", argv[0]);
obj = bpf_object__open_file(filename, NULL);
err = libbpf_get_error(obj);
if (err)
diff --git a/tools/testing/selftests/bpf/xdp_synproxy.c b/tools/testing/selftests/bpf/xdp_synproxy.c
index d874ddfb39c4..ff35320d2be9 100644
--- a/tools/testing/selftests/bpf/xdp_synproxy.c
+++ b/tools/testing/selftests/bpf/xdp_synproxy.c
@@ -193,7 +193,7 @@ static int syncookie_attach(const char *argv0, unsigned int ifindex, bool tc)
int prog_fd;
int err;
- snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv0);
+ snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.bpf.o", argv0);
obj = bpf_object__open_file(xdp_filename, NULL);
err = libbpf_get_error(obj);
if (err < 0) {
diff --git a/tools/testing/selftests/bpf/xdping.c b/tools/testing/selftests/bpf/xdping.c
index 5b6f977870f8..1503a1d2faa0 100644
--- a/tools/testing/selftests/bpf/xdping.c
+++ b/tools/testing/selftests/bpf/xdping.c
@@ -168,7 +168,7 @@ int main(int argc, char **argv)
/* Use libbpf 1.0 API mode */
libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
- snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ snprintf(filename, sizeof(filename), "%s_kern.bpf.o", argv[0]);
if (bpf_prog_test_load(filename, BPF_PROG_TYPE_XDP, &obj, &prog_fd)) {
fprintf(stderr, "load of %s failed\n", filename);
diff --git a/tools/testing/selftests/bpf/xsk.c b/tools/testing/selftests/bpf/xsk.c
index f2721a4ae7c5..0b3ff49c740d 100644
--- a/tools/testing/selftests/bpf/xsk.c
+++ b/tools/testing/selftests/bpf/xsk.c
@@ -1237,15 +1237,15 @@ void xsk_socket__delete(struct xsk_socket *xsk)
ctx = xsk->ctx;
umem = ctx->umem;
- xsk_put_ctx(ctx, true);
-
- if (!ctx->refcount) {
+ if (ctx->refcount == 1) {
xsk_delete_bpf_maps(xsk);
close(ctx->prog_fd);
if (ctx->has_bpf_link)
close(ctx->link_fd);
}
+ xsk_put_ctx(ctx, true);
+
err = xsk_get_mmap_offsets(xsk->fd, &off);
if (!err) {
if (xsk->rx) {
diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c
index 14b4737b223c..ef33309bbe49 100644
--- a/tools/testing/selftests/bpf/xskxceiver.c
+++ b/tools/testing/selftests/bpf/xskxceiver.c
@@ -99,6 +99,8 @@
#include <stdatomic.h>
#include "xsk.h"
#include "xskxceiver.h"
+#include <bpf/bpf.h>
+#include <linux/filter.h>
#include "../kselftest.h"
/* AF_XDP APIs were moved into libxdp and marked as deprecated in libbpf.
@@ -122,9 +124,20 @@ static void __exit_with_error(int error, const char *file, const char *func, int
}
#define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__)
-
-#define mode_string(test) (test)->ifobj_tx->xdp_flags & XDP_FLAGS_SKB_MODE ? "SKB" : "DRV"
#define busy_poll_string(test) (test)->ifobj_tx->busy_poll ? "BUSY-POLL " : ""
+static char *mode_string(struct test_spec *test)
+{
+ switch (test->mode) {
+ case TEST_MODE_SKB:
+ return "SKB";
+ case TEST_MODE_DRV:
+ return "DRV";
+ case TEST_MODE_ZC:
+ return "ZC";
+ default:
+ return "BOGUS";
+ }
+}
static void report_failure(struct test_spec *test)
{
@@ -299,8 +312,8 @@ static void enable_busy_poll(struct xsk_socket_info *xsk)
exit_with_error(errno);
}
-static int xsk_configure_socket(struct xsk_socket_info *xsk, struct xsk_umem_info *umem,
- struct ifobject *ifobject, bool shared)
+static int __xsk_configure_socket(struct xsk_socket_info *xsk, struct xsk_umem_info *umem,
+ struct ifobject *ifobject, bool shared)
{
struct xsk_socket_config cfg = {};
struct xsk_ring_cons *rxr;
@@ -320,6 +333,51 @@ static int xsk_configure_socket(struct xsk_socket_info *xsk, struct xsk_umem_inf
return xsk_socket__create(&xsk->xsk, ifobject->ifname, 0, umem->umem, rxr, txr, &cfg);
}
+static bool ifobj_zc_avail(struct ifobject *ifobject)
+{
+ size_t umem_sz = DEFAULT_UMEM_BUFFERS * XSK_UMEM__DEFAULT_FRAME_SIZE;
+ int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
+ struct xsk_socket_info *xsk;
+ struct xsk_umem_info *umem;
+ bool zc_avail = false;
+ void *bufs;
+ int ret;
+
+ bufs = mmap(NULL, umem_sz, PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
+ if (bufs == MAP_FAILED)
+ exit_with_error(errno);
+
+ umem = calloc(1, sizeof(struct xsk_umem_info));
+ if (!umem) {
+ munmap(bufs, umem_sz);
+ exit_with_error(-ENOMEM);
+ }
+ umem->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
+ ret = xsk_configure_umem(umem, bufs, umem_sz);
+ if (ret)
+ exit_with_error(-ret);
+
+ xsk = calloc(1, sizeof(struct xsk_socket_info));
+ if (!xsk)
+ goto out;
+ ifobject->xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
+ ifobject->xdp_flags |= XDP_FLAGS_DRV_MODE;
+ ifobject->bind_flags = XDP_USE_NEED_WAKEUP | XDP_ZEROCOPY;
+ ifobject->rx_on = true;
+ xsk->rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS;
+ ret = __xsk_configure_socket(xsk, umem, ifobject, false);
+ if (!ret)
+ zc_avail = true;
+
+ xsk_socket__delete(xsk->xsk);
+ free(xsk);
+out:
+ munmap(umem->buffer, umem_sz);
+ xsk_umem__delete(umem->umem);
+ free(umem);
+ return zc_avail;
+}
+
static struct option long_options[] = {
{"interface", required_argument, 0, 'i'},
{"busy-poll", no_argument, 0, 'b'},
@@ -431,20 +489,24 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx,
ifobj->use_poll = false;
ifobj->use_fill_ring = true;
ifobj->release_rx = true;
- ifobj->pkt_stream = test->pkt_stream_default;
ifobj->validation_func = NULL;
if (i == 0) {
ifobj->rx_on = false;
ifobj->tx_on = true;
+ ifobj->pkt_stream = test->tx_pkt_stream_default;
} else {
ifobj->rx_on = true;
ifobj->tx_on = false;
+ ifobj->pkt_stream = test->rx_pkt_stream_default;
}
memset(ifobj->umem, 0, sizeof(*ifobj->umem));
ifobj->umem->num_frames = DEFAULT_UMEM_BUFFERS;
ifobj->umem->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
+ if (ifobj->shared_umem && ifobj->rx_on)
+ ifobj->umem->base_addr = DEFAULT_UMEM_BUFFERS *
+ XSK_UMEM__DEFAULT_FRAME_SIZE;
for (j = 0; j < MAX_SOCKETS; j++) {
memset(&ifobj->xsk_arr[j], 0, sizeof(ifobj->xsk_arr[j]));
@@ -463,12 +525,15 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx,
static void test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx,
struct ifobject *ifobj_rx, enum test_mode mode)
{
- struct pkt_stream *pkt_stream;
+ struct pkt_stream *tx_pkt_stream;
+ struct pkt_stream *rx_pkt_stream;
u32 i;
- pkt_stream = test->pkt_stream_default;
+ tx_pkt_stream = test->tx_pkt_stream_default;
+ rx_pkt_stream = test->rx_pkt_stream_default;
memset(test, 0, sizeof(*test));
- test->pkt_stream_default = pkt_stream;
+ test->tx_pkt_stream_default = tx_pkt_stream;
+ test->rx_pkt_stream_default = rx_pkt_stream;
for (i = 0; i < MAX_INTERFACES; i++) {
struct ifobject *ifobj = i ? ifobj_rx : ifobj_tx;
@@ -479,9 +544,14 @@ static void test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx,
else
ifobj->xdp_flags |= XDP_FLAGS_DRV_MODE;
- ifobj->bind_flags = XDP_USE_NEED_WAKEUP | XDP_COPY;
+ ifobj->bind_flags = XDP_USE_NEED_WAKEUP;
+ if (mode == TEST_MODE_ZC)
+ ifobj->bind_flags |= XDP_ZEROCOPY;
+ else
+ ifobj->bind_flags |= XDP_COPY;
}
+ test->mode = mode;
__test_spec_init(test, ifobj_tx, ifobj_rx);
}
@@ -529,16 +599,17 @@ static void pkt_stream_delete(struct pkt_stream *pkt_stream)
static void pkt_stream_restore_default(struct test_spec *test)
{
struct pkt_stream *tx_pkt_stream = test->ifobj_tx->pkt_stream;
+ struct pkt_stream *rx_pkt_stream = test->ifobj_rx->pkt_stream;
- if (tx_pkt_stream != test->pkt_stream_default) {
+ if (tx_pkt_stream != test->tx_pkt_stream_default) {
pkt_stream_delete(test->ifobj_tx->pkt_stream);
- test->ifobj_tx->pkt_stream = test->pkt_stream_default;
+ test->ifobj_tx->pkt_stream = test->tx_pkt_stream_default;
}
- if (test->ifobj_rx->pkt_stream != test->pkt_stream_default &&
- test->ifobj_rx->pkt_stream != tx_pkt_stream)
+ if (rx_pkt_stream != test->rx_pkt_stream_default) {
pkt_stream_delete(test->ifobj_rx->pkt_stream);
- test->ifobj_rx->pkt_stream = test->pkt_stream_default;
+ test->ifobj_rx->pkt_stream = test->rx_pkt_stream_default;
+ }
}
static struct pkt_stream *__pkt_stream_alloc(u32 nb_pkts)
@@ -561,7 +632,7 @@ static struct pkt_stream *__pkt_stream_alloc(u32 nb_pkts)
static void pkt_set(struct xsk_umem_info *umem, struct pkt *pkt, u64 addr, u32 len)
{
- pkt->addr = addr;
+ pkt->addr = addr + umem->base_addr;
pkt->len = len;
if (len > umem->frame_size - XDP_PACKET_HEADROOM - MIN_PKT_SIZE * 2 - umem->frame_headroom)
pkt->valid = false;
@@ -600,22 +671,29 @@ static void pkt_stream_replace(struct test_spec *test, u32 nb_pkts, u32 pkt_len)
pkt_stream = pkt_stream_generate(test->ifobj_tx->umem, nb_pkts, pkt_len);
test->ifobj_tx->pkt_stream = pkt_stream;
+ pkt_stream = pkt_stream_generate(test->ifobj_rx->umem, nb_pkts, pkt_len);
test->ifobj_rx->pkt_stream = pkt_stream;
}
-static void pkt_stream_replace_half(struct test_spec *test, u32 pkt_len, int offset)
+static void __pkt_stream_replace_half(struct ifobject *ifobj, u32 pkt_len,
+ int offset)
{
- struct xsk_umem_info *umem = test->ifobj_tx->umem;
+ struct xsk_umem_info *umem = ifobj->umem;
struct pkt_stream *pkt_stream;
u32 i;
- pkt_stream = pkt_stream_clone(umem, test->pkt_stream_default);
- for (i = 1; i < test->pkt_stream_default->nb_pkts; i += 2)
+ pkt_stream = pkt_stream_clone(umem, ifobj->pkt_stream);
+ for (i = 1; i < ifobj->pkt_stream->nb_pkts; i += 2)
pkt_set(umem, &pkt_stream->pkts[i],
(i % umem->num_frames) * umem->frame_size + offset, pkt_len);
- test->ifobj_tx->pkt_stream = pkt_stream;
- test->ifobj_rx->pkt_stream = pkt_stream;
+ ifobj->pkt_stream = pkt_stream;
+}
+
+static void pkt_stream_replace_half(struct test_spec *test, u32 pkt_len, int offset)
+{
+ __pkt_stream_replace_half(test->ifobj_tx, pkt_len, offset);
+ __pkt_stream_replace_half(test->ifobj_rx, pkt_len, offset);
}
static void pkt_stream_receive_half(struct test_spec *test)
@@ -657,7 +735,8 @@ static struct pkt *pkt_generate(struct ifobject *ifobject, u32 pkt_nb)
return pkt;
}
-static void pkt_stream_generate_custom(struct test_spec *test, struct pkt *pkts, u32 nb_pkts)
+static void __pkt_stream_generate_custom(struct ifobject *ifobj,
+ struct pkt *pkts, u32 nb_pkts)
{
struct pkt_stream *pkt_stream;
u32 i;
@@ -666,15 +745,20 @@ static void pkt_stream_generate_custom(struct test_spec *test, struct pkt *pkts,
if (!pkt_stream)
exit_with_error(ENOMEM);
- test->ifobj_tx->pkt_stream = pkt_stream;
- test->ifobj_rx->pkt_stream = pkt_stream;
-
for (i = 0; i < nb_pkts; i++) {
- pkt_stream->pkts[i].addr = pkts[i].addr;
+ pkt_stream->pkts[i].addr = pkts[i].addr + ifobj->umem->base_addr;
pkt_stream->pkts[i].len = pkts[i].len;
pkt_stream->pkts[i].payload = i;
pkt_stream->pkts[i].valid = pkts[i].valid;
}
+
+ ifobj->pkt_stream = pkt_stream;
+}
+
+static void pkt_stream_generate_custom(struct test_spec *test, struct pkt *pkts, u32 nb_pkts)
+{
+ __pkt_stream_generate_custom(test->ifobj_tx, pkts, nb_pkts);
+ __pkt_stream_generate_custom(test->ifobj_rx, pkts, nb_pkts);
}
static void pkt_dump(void *pkt, u32 len)
@@ -1126,6 +1210,70 @@ static int validate_tx_invalid_descs(struct ifobject *ifobject)
return TEST_PASS;
}
+static void xsk_configure_socket(struct test_spec *test, struct ifobject *ifobject,
+ struct xsk_umem_info *umem, bool tx)
+{
+ int i, ret;
+
+ for (i = 0; i < test->nb_sockets; i++) {
+ bool shared = (ifobject->shared_umem && tx) ? true : !!i;
+ u32 ctr = 0;
+
+ while (ctr++ < SOCK_RECONF_CTR) {
+ ret = __xsk_configure_socket(&ifobject->xsk_arr[i], umem,
+ ifobject, shared);
+ if (!ret)
+ break;
+
+ /* Retry if it fails as xsk_socket__create() is asynchronous */
+ if (ctr >= SOCK_RECONF_CTR)
+ exit_with_error(-ret);
+ usleep(USLEEP_MAX);
+ }
+ if (ifobject->busy_poll)
+ enable_busy_poll(&ifobject->xsk_arr[i]);
+ }
+}
+
+static void thread_common_ops_tx(struct test_spec *test, struct ifobject *ifobject)
+{
+ xsk_configure_socket(test, ifobject, test->ifobj_rx->umem, true);
+ ifobject->xsk = &ifobject->xsk_arr[0];
+ ifobject->xsk_map_fd = test->ifobj_rx->xsk_map_fd;
+ memcpy(ifobject->umem, test->ifobj_rx->umem, sizeof(struct xsk_umem_info));
+}
+
+static void xsk_populate_fill_ring(struct xsk_umem_info *umem, struct pkt_stream *pkt_stream)
+{
+ u32 idx = 0, i, buffers_to_fill;
+ int ret;
+
+ if (umem->num_frames < XSK_RING_PROD__DEFAULT_NUM_DESCS)
+ buffers_to_fill = umem->num_frames;
+ else
+ buffers_to_fill = XSK_RING_PROD__DEFAULT_NUM_DESCS;
+
+ ret = xsk_ring_prod__reserve(&umem->fq, buffers_to_fill, &idx);
+ if (ret != buffers_to_fill)
+ exit_with_error(ENOSPC);
+ for (i = 0; i < buffers_to_fill; i++) {
+ u64 addr;
+
+ if (pkt_stream->use_addr_for_fill) {
+ struct pkt *pkt = pkt_stream_get_pkt(pkt_stream, i);
+
+ if (!pkt)
+ break;
+ addr = pkt->addr;
+ } else {
+ addr = i * umem->frame_size;
+ }
+
+ *xsk_ring_prod__fill_addr(&umem->fq, idx++) = addr;
+ }
+ xsk_ring_prod__submit(&umem->fq, buffers_to_fill);
+}
+
static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject)
{
u64 umem_sz = ifobject->umem->num_frames * ifobject->umem->frame_size;
@@ -1133,13 +1281,15 @@ static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject)
LIBBPF_OPTS(bpf_xdp_query_opts, opts);
int ret, ifindex;
void *bufs;
- u32 i;
ifobject->ns_fd = switch_namespace(ifobject->nsname);
if (ifobject->umem->unaligned_mode)
mmap_flags |= MAP_HUGETLB;
+ if (ifobject->shared_umem)
+ umem_sz *= 2;
+
bufs = mmap(NULL, umem_sz, PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
if (bufs == MAP_FAILED)
exit_with_error(errno);
@@ -1148,24 +1298,9 @@ static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject)
if (ret)
exit_with_error(-ret);
- for (i = 0; i < test->nb_sockets; i++) {
- u32 ctr = 0;
-
- while (ctr++ < SOCK_RECONF_CTR) {
- ret = xsk_configure_socket(&ifobject->xsk_arr[i], ifobject->umem,
- ifobject, !!i);
- if (!ret)
- break;
-
- /* Retry if it fails as xsk_socket__create() is asynchronous */
- if (ctr >= SOCK_RECONF_CTR)
- exit_with_error(-ret);
- usleep(USLEEP_MAX);
- }
+ xsk_populate_fill_ring(ifobject->umem, ifobject->pkt_stream);
- if (ifobject->busy_poll)
- enable_busy_poll(&ifobject->xsk_arr[i]);
- }
+ xsk_configure_socket(test, ifobject, ifobject->umem, false);
ifobject->xsk = &ifobject->xsk_arr[0];
@@ -1201,22 +1336,18 @@ static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject)
exit_with_error(-ret);
}
-static void testapp_cleanup_xsk_res(struct ifobject *ifobj)
-{
- print_verbose("Destroying socket\n");
- xsk_socket__delete(ifobj->xsk->xsk);
- munmap(ifobj->umem->buffer, ifobj->umem->num_frames * ifobj->umem->frame_size);
- xsk_umem__delete(ifobj->umem->umem);
-}
-
static void *worker_testapp_validate_tx(void *arg)
{
struct test_spec *test = (struct test_spec *)arg;
struct ifobject *ifobject = test->ifobj_tx;
int err;
- if (test->current_step == 1)
- thread_common_ops(test, ifobject);
+ if (test->current_step == 1) {
+ if (!ifobject->shared_umem)
+ thread_common_ops(test, ifobject);
+ else
+ thread_common_ops_tx(test, ifobject);
+ }
print_verbose("Sending %d packets on interface %s\n", ifobject->pkt_stream->nb_pkts,
ifobject->ifname);
@@ -1227,53 +1358,23 @@ static void *worker_testapp_validate_tx(void *arg)
if (err)
report_failure(test);
- if (test->total_steps == test->current_step || err)
- testapp_cleanup_xsk_res(ifobject);
pthread_exit(NULL);
}
-static void xsk_populate_fill_ring(struct xsk_umem_info *umem, struct pkt_stream *pkt_stream)
-{
- u32 idx = 0, i, buffers_to_fill;
- int ret;
-
- if (umem->num_frames < XSK_RING_PROD__DEFAULT_NUM_DESCS)
- buffers_to_fill = umem->num_frames;
- else
- buffers_to_fill = XSK_RING_PROD__DEFAULT_NUM_DESCS;
-
- ret = xsk_ring_prod__reserve(&umem->fq, buffers_to_fill, &idx);
- if (ret != buffers_to_fill)
- exit_with_error(ENOSPC);
- for (i = 0; i < buffers_to_fill; i++) {
- u64 addr;
-
- if (pkt_stream->use_addr_for_fill) {
- struct pkt *pkt = pkt_stream_get_pkt(pkt_stream, i);
-
- if (!pkt)
- break;
- addr = pkt->addr;
- } else {
- addr = i * umem->frame_size;
- }
-
- *xsk_ring_prod__fill_addr(&umem->fq, idx++) = addr;
- }
- xsk_ring_prod__submit(&umem->fq, buffers_to_fill);
-}
-
static void *worker_testapp_validate_rx(void *arg)
{
struct test_spec *test = (struct test_spec *)arg;
struct ifobject *ifobject = test->ifobj_rx;
struct pollfd fds = { };
+ int id = 0;
int err;
- if (test->current_step == 1)
+ if (test->current_step == 1) {
thread_common_ops(test, ifobject);
-
- xsk_populate_fill_ring(ifobject->umem, ifobject->pkt_stream);
+ } else {
+ bpf_map_delete_elem(ifobject->xsk_map_fd, &id);
+ xsk_socket__update_xskmap(ifobject->xsk->xsk, ifobject->xsk_map_fd);
+ }
fds.fd = xsk_socket__fd(ifobject->xsk->xsk);
fds.events = POLLIN;
@@ -1291,25 +1392,44 @@ static void *worker_testapp_validate_rx(void *arg)
pthread_mutex_unlock(&pacing_mutex);
}
- if (test->total_steps == test->current_step || err)
- testapp_cleanup_xsk_res(ifobject);
+ pthread_exit(NULL);
+}
+
+static void testapp_clean_xsk_umem(struct ifobject *ifobj)
+{
+ u64 umem_sz = ifobj->umem->num_frames * ifobj->umem->frame_size;
+
+ if (ifobj->shared_umem)
+ umem_sz *= 2;
+
+ xsk_umem__delete(ifobj->umem->umem);
+ munmap(ifobj->umem->buffer, umem_sz);
+}
+
+static void handler(int signum)
+{
pthread_exit(NULL);
}
static int testapp_validate_traffic_single_thread(struct test_spec *test, struct ifobject *ifobj,
enum test_type type)
{
+ bool old_shared_umem = ifobj->shared_umem;
pthread_t t0;
if (pthread_barrier_init(&barr, NULL, 2))
exit_with_error(errno);
test->current_step++;
- if (type == TEST_TYPE_POLL_RXQ_TMOUT)
+ if (type == TEST_TYPE_POLL_RXQ_TMOUT)
pkt_stream_reset(ifobj->pkt_stream);
pkts_in_flight = 0;
- /*Spawn thread */
+ test->ifobj_rx->shared_umem = false;
+ test->ifobj_tx->shared_umem = false;
+
+ signal(SIGUSR1, handler);
+ /* Spawn thread */
pthread_create(&t0, NULL, ifobj->func_ptr, test);
if (type != TEST_TYPE_POLL_TXQ_TMOUT)
@@ -1318,8 +1438,17 @@ static int testapp_validate_traffic_single_thread(struct test_spec *test, struct
if (pthread_barrier_destroy(&barr))
exit_with_error(errno);
+ pthread_kill(t0, SIGUSR1);
pthread_join(t0, NULL);
+ if (test->total_steps == test->current_step || test->fail) {
+ xsk_socket__delete(ifobj->xsk->xsk);
+ testapp_clean_xsk_umem(ifobj);
+ }
+
+ test->ifobj_rx->shared_umem = old_shared_umem;
+ test->ifobj_tx->shared_umem = old_shared_umem;
+
return !!test->fail;
}
@@ -1349,6 +1478,14 @@ static int testapp_validate_traffic(struct test_spec *test)
pthread_join(t1, NULL);
pthread_join(t0, NULL);
+ if (test->total_steps == test->current_step || test->fail) {
+ xsk_socket__delete(ifobj_tx->xsk->xsk);
+ xsk_socket__delete(ifobj_rx->xsk->xsk);
+ testapp_clean_xsk_umem(ifobj_rx);
+ if (!ifobj_tx->shared_umem)
+ testapp_clean_xsk_umem(ifobj_tx);
+ }
+
return !!test->fail;
}
@@ -1428,9 +1565,9 @@ static void testapp_headroom(struct test_spec *test)
static void testapp_stats_rx_dropped(struct test_spec *test)
{
test_spec_set_name(test, "STAT_RX_DROPPED");
+ pkt_stream_replace_half(test, MIN_PKT_SIZE * 4, 0);
test->ifobj_rx->umem->frame_headroom = test->ifobj_rx->umem->frame_size -
XDP_PACKET_HEADROOM - MIN_PKT_SIZE * 3;
- pkt_stream_replace_half(test, MIN_PKT_SIZE * 4, 0);
pkt_stream_receive_half(test);
test->ifobj_rx->validation_func = validate_rx_dropped;
testapp_validate_traffic(test);
@@ -1553,6 +1690,11 @@ static void testapp_invalid_desc(struct test_spec *test)
pkts[7].valid = false;
}
+ if (test->ifobj_tx->shared_umem) {
+ pkts[4].addr += UMEM_SIZE;
+ pkts[5].addr += UMEM_SIZE;
+ }
+
pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts));
testapp_validate_traffic(test);
pkt_stream_restore_default(test);
@@ -1583,6 +1725,10 @@ static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_
{
switch (type) {
case TEST_TYPE_STATS_RX_DROPPED:
+ if (mode == TEST_MODE_ZC) {
+ ksft_test_result_skip("Can not run RX_DROPPED test for ZC mode\n");
+ return;
+ }
testapp_stats_rx_dropped(test);
break;
case TEST_TYPE_STATS_TX_INVALID_DESCS:
@@ -1712,12 +1858,44 @@ static void ifobject_delete(struct ifobject *ifobj)
free(ifobj);
}
+static bool is_xdp_supported(struct ifobject *ifobject)
+{
+ int flags = XDP_FLAGS_DRV_MODE;
+
+ LIBBPF_OPTS(bpf_link_create_opts, opts, .flags = flags);
+ struct bpf_insn insns[2] = {
+ BPF_MOV64_IMM(BPF_REG_0, XDP_PASS),
+ BPF_EXIT_INSN()
+ };
+ int ifindex = if_nametoindex(ifobject->ifname);
+ int prog_fd, insn_cnt = ARRAY_SIZE(insns);
+ int err;
+
+ prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL);
+ if (prog_fd < 0)
+ return false;
+
+ err = bpf_xdp_attach(ifindex, prog_fd, flags, NULL);
+ if (err) {
+ close(prog_fd);
+ return false;
+ }
+
+ bpf_xdp_detach(ifindex, flags, NULL);
+ close(prog_fd);
+
+ return true;
+}
+
int main(int argc, char **argv)
{
- struct pkt_stream *pkt_stream_default;
+ struct pkt_stream *rx_pkt_stream_default;
+ struct pkt_stream *tx_pkt_stream_default;
struct ifobject *ifobj_tx, *ifobj_rx;
+ int modes = TEST_MODE_SKB + 1;
u32 i, j, failed_tests = 0;
struct test_spec test;
+ bool shared_umem;
/* Use libbpf 1.0 API mode */
libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
@@ -1732,6 +1910,10 @@ int main(int argc, char **argv)
setlocale(LC_ALL, "");
parse_command_line(ifobj_tx, ifobj_rx, argc, argv);
+ shared_umem = !strcmp(ifobj_tx->ifname, ifobj_rx->ifname);
+
+ ifobj_tx->shared_umem = shared_umem;
+ ifobj_rx->shared_umem = shared_umem;
if (!validate_interface(ifobj_tx) || !validate_interface(ifobj_rx)) {
usage(basename(argv[0]));
@@ -1743,15 +1925,23 @@ int main(int argc, char **argv)
init_iface(ifobj_rx, MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1,
worker_testapp_validate_rx);
+ if (is_xdp_supported(ifobj_tx)) {
+ modes++;
+ if (ifobj_zc_avail(ifobj_tx))
+ modes++;
+ }
+
test_spec_init(&test, ifobj_tx, ifobj_rx, 0);
- pkt_stream_default = pkt_stream_generate(ifobj_tx->umem, DEFAULT_PKT_CNT, PKT_SIZE);
- if (!pkt_stream_default)
+ tx_pkt_stream_default = pkt_stream_generate(ifobj_tx->umem, DEFAULT_PKT_CNT, PKT_SIZE);
+ rx_pkt_stream_default = pkt_stream_generate(ifobj_rx->umem, DEFAULT_PKT_CNT, PKT_SIZE);
+ if (!tx_pkt_stream_default || !rx_pkt_stream_default)
exit_with_error(ENOMEM);
- test.pkt_stream_default = pkt_stream_default;
+ test.tx_pkt_stream_default = tx_pkt_stream_default;
+ test.rx_pkt_stream_default = rx_pkt_stream_default;
- ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX);
+ ksft_set_plan(modes * TEST_TYPE_MAX);
- for (i = 0; i < TEST_MODE_MAX; i++)
+ for (i = 0; i < modes; i++)
for (j = 0; j < TEST_TYPE_MAX; j++) {
test_spec_init(&test, ifobj_tx, ifobj_rx, i);
run_pkt_test(&test, i, j);
@@ -1761,7 +1951,11 @@ int main(int argc, char **argv)
failed_tests++;
}
- pkt_stream_delete(pkt_stream_default);
+ pkt_stream_delete(tx_pkt_stream_default);
+ pkt_stream_delete(rx_pkt_stream_default);
+ free(ifobj_rx->umem);
+ if (!ifobj_tx->shared_umem)
+ free(ifobj_tx->umem);
ifobject_delete(ifobj_tx);
ifobject_delete(ifobj_rx);
diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h
index ee97576757a9..edb76d2def9f 100644
--- a/tools/testing/selftests/bpf/xskxceiver.h
+++ b/tools/testing/selftests/bpf/xskxceiver.h
@@ -29,8 +29,8 @@
#define TEST_FAILURE -1
#define TEST_CONTINUE 1
#define MAX_INTERFACES 2
-#define MAX_INTERFACE_NAME_CHARS 7
-#define MAX_INTERFACES_NAMESPACE_CHARS 10
+#define MAX_INTERFACE_NAME_CHARS 16
+#define MAX_INTERFACES_NAMESPACE_CHARS 16
#define MAX_SOCKETS 2
#define MAX_TEST_NAME_SIZE 32
#define MAX_TEARDOWN_ITER 10
@@ -62,6 +62,7 @@
enum test_mode {
TEST_MODE_SKB,
TEST_MODE_DRV,
+ TEST_MODE_ZC,
TEST_MODE_MAX
};
@@ -99,6 +100,7 @@ struct xsk_umem_info {
u32 frame_headroom;
void *buffer;
u32 frame_size;
+ u32 base_addr;
bool unaligned_mode;
};
@@ -152,6 +154,7 @@ struct ifobject {
bool busy_poll;
bool use_fill_ring;
bool release_rx;
+ bool shared_umem;
u8 dst_mac[ETH_ALEN];
u8 src_mac[ETH_ALEN];
};
@@ -159,11 +162,13 @@ struct ifobject {
struct test_spec {
struct ifobject *ifobj_tx;
struct ifobject *ifobj_rx;
- struct pkt_stream *pkt_stream_default;
+ struct pkt_stream *tx_pkt_stream_default;
+ struct pkt_stream *rx_pkt_stream_default;
u16 total_steps;
u16 current_step;
u16 nb_sockets;
bool fail;
+ enum test_mode mode;
char name[MAX_TEST_NAME_SIZE];
};