From e7d4798960b3ebcd243ae6a59e04d4fe6518c96c Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Thu, 13 Jun 2019 18:39:58 +0900 Subject: xdp: Add tracepoint for bulk XDP_TX This is introduced for admins to check what is happening on XDP_TX when bulk XDP_TX is in use, which will first be introduced in veth in the next commit. v3: - Add act field to be in line with other XDP tracepoints. Signed-off-by: Toshiaki Makita Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ad3be85f1411..561ed07d3007 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2101,3 +2101,4 @@ EXPORT_SYMBOL(bpf_stats_enabled_key); #include EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); +EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx); -- cgit v1.2.3 From 9db1ff0a415c7de8eb67df5b2c56ac409ccefc37 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 25 Jun 2019 17:35:03 -0700 Subject: bpf: fix compiler warning with CONFIG_MODULES=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With CONFIG_MODULES=n, the following compiler warning occurs: /data/users/yhs/work/net-next/kernel/trace/bpf_trace.c:605:13: warning: ‘do_bpf_send_signal’ defined but not used [-Wunused-function] static void do_bpf_send_signal(struct irq_work *entry) The __init function send_signal_irq_work_init(), which calls do_bpf_send_signal(), is defined under CONFIG_MODULES. Hence, when CONFIG_MODULES=n, nobody calls the static function do_bpf_send_signal(), hence the warning. The init function send_signal_irq_work_init() should work without CONFIG_MODULES. Moving it out of the CONFIG_MODULES code section fixes the compiler warning and also makes the bpf_send_signal() helper work without CONFIG_MODULES. 
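(Editor's illustration, not part of either patch above.) The warning pattern is simply a static function whose only caller sits inside a conditionally compiled region; a self-contained sketch, using made-up names (helper, init_helper, WITH_MODULES), reproduces it with "gcc -Wunused-function -c sketch.c" and silences it with "-DWITH_MODULES":

/* sketch.c -- illustrative only, not kernel code. With WITH_MODULES
 * undefined, helper() is defined but has no caller left in this
 * translation unit, so the compiler warns
 * "'helper' defined but not used [-Wunused-function]" -- the same shape
 * as do_bpf_send_signal() being called only from an initcall that lived
 * under #ifdef CONFIG_MODULES.
 */
static void helper(void)
{
}

#ifdef WITH_MODULES
void init_helper(void)
{
	helper();	/* only caller; compiled out when WITH_MODULES is unset */
}
#endif
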
Fixes: 8b401f9ed244 ("bpf: implement bpf_send_signal() helper") Reported-By: Arnd Bergmann Signed-off-by: Yonghong Song Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/trace/bpf_trace.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c102c240bb0b..ca1255d14576 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1431,6 +1431,20 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, return err; } +static int __init send_signal_irq_work_init(void) +{ + int cpu; + struct send_signal_irq_work *work; + + for_each_possible_cpu(cpu) { + work = per_cpu_ptr(&send_signal_work, cpu); + init_irq_work(&work->irq_work, do_bpf_send_signal); + } + return 0; +} + +subsys_initcall(send_signal_irq_work_init); + #ifdef CONFIG_MODULES static int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) @@ -1478,18 +1492,5 @@ static int __init bpf_event_init(void) return 0; } -static int __init send_signal_irq_work_init(void) -{ - int cpu; - struct send_signal_irq_work *work; - - for_each_possible_cpu(cpu) { - work = per_cpu_ptr(&send_signal_work, cpu); - init_irq_work(&work->irq_work, do_bpf_send_signal); - } - return 0; -} - fs_initcall(bpf_event_init); -subsys_initcall(send_signal_irq_work_init); #endif /* CONFIG_MODULES */ -- cgit v1.2.3 From e5c891a349d7c556b7b9dc231d6dd78e88a29e5c Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 25 Jun 2019 14:38:58 -0700 Subject: bpf: fix cgroup bpf release synchronization Since commit 4bfc0bb2c60e ("bpf: decouple the lifetime of cgroup_bpf from cgroup itself"), cgroup_bpf release occurs asynchronously (from a worker context), and before the release of the cgroup itself. This introduced a previously non-existing race between the release and update paths. E.g. if a leaf's cgroup_bpf is released and a new bpf program is attached to the one of ancestor cgroups at the same time. The race may result in double-free and other memory corruptions. To fix the problem, let's protect the body of cgroup_bpf_release() with cgroup_mutex, as it was effectively previously, when all this code was called from the cgroup release path with cgroup mutex held. Also let's skip cgroups, which have no chances to invoke a bpf program, on the update path. If the cgroup bpf refcnt reached 0, it means that the cgroup is offline (no attached processes), and there are no associated sockets left. It means there is no point in updating effective progs array! And it can lead to a leak, if it happens after the release. So, let's skip such cgroups. Big thanks for Tejun Heo for discovering and debugging of this problem! 
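(Editor's sketch, not part of the patch; the _sketch function names are hypothetical, while cgroup_mutex and percpu_ref_is_zero() are the primitives the patch actually uses.) Condensed, the two guards described above look like this -- the real hunks follow:

/* Condensed sketch of the fix. Without these guards, the async release
 * worker and an attach on an ancestor cgroup can walk and free the same
 * effective prog array, hence the double-free described above.
 */
static void cgroup_bpf_release_sketch(struct cgroup *cgrp)
{
	/* Serialize with the update path, which runs under cgroup_mutex. */
	mutex_lock(&cgroup_mutex);
	/* ... detach programs and free the effective arrays ... */
	mutex_unlock(&cgroup_mutex);
}

static bool cgroup_bpf_skip_update_sketch(struct cgroup *desc)
{
	/* A zero refcnt means the cgroup is offline with no sockets left:
	 * none of its programs can run anymore, and rebuilding its arrays
	 * after (or during) release would only leak or corrupt memory.
	 */
	return percpu_ref_is_zero(&desc->bpf.refcnt);
}
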
Fixes: 4bfc0bb2c60e ("bpf: decouple the lifetime of cgroup_bpf from cgroup itself") Reported-by: Tejun Heo Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index c225c42e114a..077ed3a19848 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -16,6 +16,8 @@ #include #include +#include "../cgroup/cgroup-internal.h" + DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); EXPORT_SYMBOL(cgroup_bpf_enabled_key); @@ -38,6 +40,8 @@ static void cgroup_bpf_release(struct work_struct *work) struct bpf_prog_array *old_array; unsigned int type; + mutex_lock(&cgroup_mutex); + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog_list *pl, *tmp; @@ -54,10 +58,12 @@ static void cgroup_bpf_release(struct work_struct *work) } old_array = rcu_dereference_protected( cgrp->bpf.effective[type], - percpu_ref_is_dying(&cgrp->bpf.refcnt)); + lockdep_is_held(&cgroup_mutex)); bpf_prog_array_free(old_array); } + mutex_unlock(&cgroup_mutex); + percpu_ref_exit(&cgrp->bpf.refcnt); cgroup_put(cgrp); } @@ -229,6 +235,9 @@ static int update_effective_progs(struct cgroup *cgrp, css_for_each_descendant_pre(css, &cgrp->self) { struct cgroup *desc = container_of(css, struct cgroup, self); + if (percpu_ref_is_zero(&desc->bpf.refcnt)) + continue; + err = compute_effective_progs(desc, type, &desc->bpf.inactive); if (err) goto cleanup; @@ -238,6 +247,14 @@ static int update_effective_progs(struct cgroup *cgrp, css_for_each_descendant_pre(css, &cgrp->self) { struct cgroup *desc = container_of(css, struct cgroup, self); + if (percpu_ref_is_zero(&desc->bpf.refcnt)) { + if (unlikely(desc->bpf.inactive)) { + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + continue; + } + activate_effective_progs(desc, type, desc->bpf.inactive); desc->bpf.inactive = NULL; } -- cgit v1.2.3 From 0d01da6afc5402f60325c5da31b22f7d56689b49 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 27 Jun 2019 13:38:47 -0700 Subject: bpf: implement getsockopt and setsockopt hooks Implement new BPF_PROG_TYPE_CGROUP_SOCKOPT program type and BPF_CGROUP_{G,S}ETSOCKOPT cgroup hooks. BPF_CGROUP_SETSOCKOPT can modify user setsockopt arguments before passing them down to the kernel or bypass kernel completely. BPF_CGROUP_GETSOCKOPT can can inspect/modify getsockopt arguments that kernel returns. Both hooks reuse existing PTR_TO_PACKET{,_END} infrastructure. The buffer memory is pre-allocated (because I don't think there is a precedent for working with __user memory from bpf). This might be slow to do for each {s,g}etsockopt call, that's why I've added __cgroup_bpf_prog_array_is_empty that exits early if there is nothing attached to a cgroup. Note, however, that there is a race between __cgroup_bpf_prog_array_is_empty and BPF_PROG_RUN_ARRAY where cgroup program layout might have changed; this should not be a problem because in general there is a race between multiple calls to {s,g}etsocktop and user adding/removing bpf progs from a cgroup. 
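(Editor's illustration, not part of the patch.) A minimal BPF_CGROUP_SETSOCKOPT program, to make the hook's shape concrete. SEC() is spelled out locally so the sketch is self-contained; the "cgroup/setsockopt" section name follows libbpf's naming convention and is an assumption here, as is the program name. Return values follow the convention documented below (0 rejects with EPERM, 1 continues to the kernel handler):

// SPDX-License-Identifier: GPL-2.0
/* Illustrative sketch: deny changes to IP_TOS from this cgroup, let
 * every other option through to the kernel's setsockopt handler.
 */
#include <linux/bpf.h>
#include <linux/in.h>

#define SEC(name) __attribute__((section(name), used))

SEC("cgroup/setsockopt")
int deny_ip_tos(struct bpf_sockopt *ctx)
{
	if (ctx->level == IPPROTO_IP && ctx->optname == IP_TOS)
		return 0;	/* rejected: setsockopt() fails with EPERM */

	return 1;		/* continue: kernel handles the option */
}

char _license[] SEC("license") = "GPL";
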
The return code of the BPF program is handled as follows: * 0: EPERM * 1: success, continue with next BPF program in the cgroup chain v9: * allow overwriting setsockopt arguments (Alexei Starovoitov): * use set_fs (same as kernel_setsockopt) * buffer is always kzalloc'd (no small on-stack buffer) v8: * use s32 for optlen (Andrii Nakryiko) v7: * return only 0 or 1 (Alexei Starovoitov) * always run all progs (Alexei Starovoitov) * use optval=0 as kernel bypass in setsockopt (Alexei Starovoitov) (decided to use optval=-1 instead, optval=0 might be a valid input) * call getsockopt hook after kernel handlers (Alexei Starovoitov) v6: * rework cgroup chaining; stop as soon as bpf program returns 0 or 2; see patch with the documentation for the details * drop Andrii's and Martin's Acked-by (not sure they are comfortable with the new state of things) v5: * skip copy_to_user() and put_user() when ret == 0 (Martin Lau) v4: * don't export bpf_sk_fullsock helper (Martin Lau) * size != sizeof(__u64) for uapi pointers (Martin Lau) * offsetof instead of bpf_ctx_range when checking ctx access (Martin Lau) v3: * typos in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY comments (Andrii Nakryiko) * reverse christmas tree in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY (Andrii Nakryiko) * use __bpf_md_ptr instead of __u32 for optval{,_end} (Martin Lau) * use BPF_FIELD_SIZEOF() for consistency (Martin Lau) * new CG_SOCKOPT_ACCESS macro to wrap repeated parts v2: * moved bpf_sockopt_kern fields around to remove a hole (Martin Lau) * aligned bpf_sockopt_kern->buf to 8 bytes (Martin Lau) * bpf_prog_array_is_empty instead of bpf_prog_array_length (Martin Lau) * added [0,2] return code check to verifier (Martin Lau) * dropped unused buf[64] from the stack (Martin Lau) * use PTR_TO_SOCKET for bpf_sockopt->sk (Martin Lau) * dropped bpf_target_off from ctx rewrites (Martin Lau) * use return code for kernel bypass (Martin Lau & Andrii Nakryiko) Cc: Andrii Nakryiko Cc: Martin Lau Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 45 ++++++ include/linux/bpf.h | 2 + include/linux/bpf_types.h | 1 + include/linux/filter.h | 10 ++ include/uapi/linux/bpf.h | 14 ++ kernel/bpf/cgroup.c | 333 +++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/core.c | 9 ++ kernel/bpf/syscall.c | 19 +++ kernel/bpf/verifier.c | 8 ++ net/core/filter.c | 2 +- net/socket.c | 30 ++++ 11 files changed, 472 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index bd79ae32909a..169fd25f6bc2 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -124,6 +124,14 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, loff_t *ppos, void **new_buf, enum bpf_attach_type type); +int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, + int *optname, char __user *optval, + int *optlen, char **kernel_optval); +int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, + int optname, char __user *optval, + int __user *optlen, int max_optlen, + int retval); + static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) { @@ -286,6 +294,38 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, __ret; \ }) +#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ + kernel_optval) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ + optname, optval, \ + optlen, \ + kernel_optval); \ + 
__ret; \ +}) + +#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + get_user(__ret, optlen); \ + __ret; \ +}) + +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, optlen, \ + max_optlen, retval) \ +({ \ + int __ret = retval; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_getsockopt(sock, level, \ + optname, optval, \ + optlen, max_optlen, \ + retval); \ + __ret; \ +}) + int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog); int cgroup_bpf_prog_detach(const union bpf_attr *attr, @@ -357,6 +397,11 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; }) +#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ + optlen, max_optlen, retval) ({ retval; }) +#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ + kernel_optval) ({ 0; }) #define for_each_cgroup_storage_type(stype) for (; false; ) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a62e7889b0b6..18f4cc2c6acd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -518,6 +518,7 @@ struct bpf_prog_array { struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array *progs); int bpf_prog_array_length(struct bpf_prog_array *progs); +bool bpf_prog_array_is_empty(struct bpf_prog_array *array); int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs, __u32 __user *prog_ids, u32 cnt); @@ -1051,6 +1052,7 @@ extern const struct bpf_func_proto bpf_spin_unlock_proto; extern const struct bpf_func_proto bpf_get_local_storage_proto; extern const struct bpf_func_proto bpf_strtol_proto; extern const struct bpf_func_proto bpf_strtoul_proto; +extern const struct bpf_func_proto bpf_tcp_sock_proto; /* Shared helpers among cBPF and eBPF. 
*/ void bpf_user_rnd_init_once(void); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 5a9975678d6f..eec5aeeeaf92 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -30,6 +30,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable) #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt) #endif #ifdef CONFIG_BPF_LIRC_MODE2 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) diff --git a/include/linux/filter.h b/include/linux/filter.h index 43b45d6db36d..340f7d648974 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1199,4 +1199,14 @@ struct bpf_sysctl_kern { u64 tmp_reg; }; +struct bpf_sockopt_kern { + struct sock *sk; + u8 *optval; + u8 *optval_end; + s32 level; + s32 optname; + s32 optlen; + s32 retval; +}; + #endif /* __LINUX_FILTER_H__ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b077507efa3f..a396b516a2b2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -170,6 +170,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_FLOW_DISSECTOR, BPF_PROG_TYPE_CGROUP_SYSCTL, BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, + BPF_PROG_TYPE_CGROUP_SOCKOPT, }; enum bpf_attach_type { @@ -194,6 +195,8 @@ enum bpf_attach_type { BPF_CGROUP_SYSCTL, BPF_CGROUP_UDP4_RECVMSG, BPF_CGROUP_UDP6_RECVMSG, + BPF_CGROUP_GETSOCKOPT, + BPF_CGROUP_SETSOCKOPT, __MAX_BPF_ATTACH_TYPE }; @@ -3541,4 +3544,15 @@ struct bpf_sysctl { */ }; +struct bpf_sockopt { + __bpf_md_ptr(struct bpf_sock *, sk); + __bpf_md_ptr(void *, optval); + __bpf_md_ptr(void *, optval_end); + + __s32 level; + __s32 optname; + __s32 optlen; + __s32 retval; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 077ed3a19848..76fa0076f20d 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "../cgroup/cgroup-internal.h" @@ -938,6 +939,188 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); +static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, + enum bpf_attach_type attach_type) +{ + struct bpf_prog_array *prog_array; + bool empty; + + rcu_read_lock(); + prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]); + empty = bpf_prog_array_is_empty(prog_array); + rcu_read_unlock(); + + return empty; +} + +static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) +{ + if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0) + return -EINVAL; + + ctx->optval = kzalloc(max_optlen, GFP_USER); + if (!ctx->optval) + return -ENOMEM; + + ctx->optval_end = ctx->optval + max_optlen; + ctx->optlen = max_optlen; + + return 0; +} + +static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) +{ + kfree(ctx->optval); +} + +int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, + int *optname, char __user *optval, + int *optlen, char **kernel_optval) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_kern ctx = { + .sk = sk, + .level = *level, + .optname = *optname, + }; + int ret; + + /* Opportunistic check to see whether we have any BPF program + * attached to the hook so we don't waste time allocating + * memory and locking the socket. 
+ */ + if (!cgroup_bpf_enabled || + __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) + return 0; + + ret = sockopt_alloc_buf(&ctx, *optlen); + if (ret) + return ret; + + if (copy_from_user(ctx.optval, optval, *optlen) != 0) { + ret = -EFAULT; + goto out; + } + + lock_sock(sk); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], + &ctx, BPF_PROG_RUN); + release_sock(sk); + + if (!ret) { + ret = -EPERM; + goto out; + } + + if (ctx.optlen == -1) { + /* optlen set to -1, bypass kernel */ + ret = 1; + } else if (ctx.optlen > *optlen || ctx.optlen < -1) { + /* optlen is out of bounds */ + ret = -EFAULT; + } else { + /* optlen within bounds, run kernel handler */ + ret = 0; + + /* export any potential modifications */ + *level = ctx.level; + *optname = ctx.optname; + *optlen = ctx.optlen; + *kernel_optval = ctx.optval; + } + +out: + if (ret) + sockopt_free_buf(&ctx); + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt); + +int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, + int optname, char __user *optval, + int __user *optlen, int max_optlen, + int retval) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_kern ctx = { + .sk = sk, + .level = level, + .optname = optname, + .retval = retval, + }; + int ret; + + /* Opportunistic check to see whether we have any BPF program + * attached to the hook so we don't waste time allocating + * memory and locking the socket. + */ + if (!cgroup_bpf_enabled || + __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) + return retval; + + ret = sockopt_alloc_buf(&ctx, max_optlen); + if (ret) + return ret; + + if (!retval) { + /* If kernel getsockopt finished successfully, + * copy whatever was returned to the user back + * into our temporary buffer. Set optlen to the + * one that kernel returned as well to let + * BPF programs inspect the value. + */ + + if (get_user(ctx.optlen, optlen)) { + ret = -EFAULT; + goto out; + } + + if (ctx.optlen > max_optlen) + ctx.optlen = max_optlen; + + if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) { + ret = -EFAULT; + goto out; + } + } + + lock_sock(sk); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + &ctx, BPF_PROG_RUN); + release_sock(sk); + + if (!ret) { + ret = -EPERM; + goto out; + } + + if (ctx.optlen > max_optlen) { + ret = -EFAULT; + goto out; + } + + /* BPF programs only allowed to set retval to 0, not some + * arbitrary value. 
+ */ + if (ctx.retval != 0 && ctx.retval != retval) { + ret = -EFAULT; + goto out; + } + + if (copy_to_user(optval, ctx.optval, ctx.optlen) || + put_user(ctx.optlen, optlen)) { + ret = -EFAULT; + goto out; + } + + ret = ctx.retval; + +out: + sockopt_free_buf(&ctx); + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt); + static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, size_t *lenp) { @@ -1198,3 +1381,153 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = { const struct bpf_prog_ops cg_sysctl_prog_ops = { }; + +static const struct bpf_func_proto * +cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; +#ifdef CONFIG_INET + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; +#endif + default: + return cgroup_base_func_proto(func_id, prog); + } +} + +static bool cg_sockopt_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct bpf_sockopt)) + return false; + + if (off % size != 0) + return false; + + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sockopt, retval): + if (size != size_default) + return false; + return prog->expected_attach_type == + BPF_CGROUP_GETSOCKOPT; + case offsetof(struct bpf_sockopt, optname): + /* fallthrough */ + case offsetof(struct bpf_sockopt, level): + if (size != size_default) + return false; + return prog->expected_attach_type == + BPF_CGROUP_SETSOCKOPT; + case offsetof(struct bpf_sockopt, optlen): + return size == size_default; + default: + return false; + } + } + + switch (off) { + case offsetof(struct bpf_sockopt, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; + case offsetof(struct bpf_sockopt, optval): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET; + break; + case offsetof(struct bpf_sockopt, optval_end): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET_END; + break; + case offsetof(struct bpf_sockopt, retval): + if (size != size_default) + return false; + return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT; + default: + if (size != size_default) + return false; + break; + } + return true; +} + +#define CG_SOCKOPT_ACCESS_FIELD(T, F) \ + T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sockopt_kern, F)) + +static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sockopt, sk): + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk); + break; + case offsetof(struct bpf_sockopt, level): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level); + break; + case offsetof(struct bpf_sockopt, optname): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname); + break; + case offsetof(struct bpf_sockopt, optlen): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen); + else + *insn++ = 
CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen); + break; + case offsetof(struct bpf_sockopt, retval): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval); + break; + case offsetof(struct bpf_sockopt, optval): + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval); + break; + case offsetof(struct bpf_sockopt, optval_end): + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end); + break; + } + + return insn - insn_buf; +} + +static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf, + bool direct_write, + const struct bpf_prog *prog) +{ + /* Nothing to do for sockopt argument. The data is kzalloc'ated. + */ + return 0; +} + +const struct bpf_verifier_ops cg_sockopt_verifier_ops = { + .get_func_proto = cg_sockopt_func_proto, + .is_valid_access = cg_sockopt_is_valid_access, + .convert_ctx_access = cg_sockopt_convert_ctx_access, + .gen_prologue = cg_sockopt_get_prologue, +}; + +const struct bpf_prog_ops cg_sockopt_prog_ops = { +}; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 561ed07d3007..e2c1b43728da 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1809,6 +1809,15 @@ int bpf_prog_array_length(struct bpf_prog_array *array) return cnt; } +bool bpf_prog_array_is_empty(struct bpf_prog_array *array) +{ + struct bpf_prog_array_item *item; + + for (item = array->items; item->prog; item++) + if (item->prog != &dummy_bpf_prog.prog) + return false; + return true; +} static bool bpf_prog_array_copy_core(struct bpf_prog_array *array, u32 *prog_ids, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7713cf39795a..b0f545e07425 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1590,6 +1590,14 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + switch (expected_attach_type) { + case BPF_CGROUP_SETSOCKOPT: + case BPF_CGROUP_GETSOCKOPT: + return 0; + default: + return -EINVAL; + } default: return 0; } @@ -1840,6 +1848,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, switch (prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: return attach_type == prog->expected_attach_type ? 
0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: return prog->enforce_expected_attach_type && @@ -1912,6 +1921,10 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_SYSCTL: ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; break; + case BPF_CGROUP_GETSOCKOPT: + case BPF_CGROUP_SETSOCKOPT: + ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT; + break; default: return -EINVAL; } @@ -1995,6 +2008,10 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_SYSCTL: ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; break; + case BPF_CGROUP_GETSOCKOPT: + case BPF_CGROUP_SETSOCKOPT: + ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT; + break; default: return -EINVAL; } @@ -2031,6 +2048,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: case BPF_CGROUP_SYSCTL: + case BPF_CGROUP_GETSOCKOPT: + case BPF_CGROUP_SETSOCKOPT: break; case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0e079b2298f8..6b5623d320f9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2215,6 +2215,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, env->seen_direct_write = true; return true; + + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + if (t == BPF_WRITE) + env->seen_direct_write = true; + + return true; + default: return false; } @@ -6066,6 +6073,7 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: break; default: return 0; diff --git a/net/core/filter.c b/net/core/filter.c index 2014d76e0d2a..dc8534be12fc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5651,7 +5651,7 @@ BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) return (unsigned long)NULL; } -static const struct bpf_func_proto bpf_tcp_sock_proto = { +const struct bpf_func_proto bpf_tcp_sock_proto = { .func = bpf_tcp_sock, .gpl_only = false, .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL, diff --git a/net/socket.c b/net/socket.c index 963df5dbdd54..0ddfbfb761d9 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2051,6 +2051,8 @@ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, static int __sys_setsockopt(int fd, int level, int optname, char __user *optval, int optlen) { + mm_segment_t oldfs = get_fs(); + char *kernel_optval = NULL; int err, fput_needed; struct socket *sock; @@ -2063,6 +2065,22 @@ static int __sys_setsockopt(int fd, int level, int optname, if (err) goto out_put; + err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, + &optname, optval, &optlen, + &kernel_optval); + + if (err < 0) { + goto out_put; + } else if (err > 0) { + err = 0; + goto out_put; + } + + if (kernel_optval) { + set_fs(KERNEL_DS); + optval = (char __user __force *)kernel_optval; + } + if (level == SOL_SOCKET) err = sock_setsockopt(sock, level, optname, optval, @@ -2071,6 +2089,11 @@ static int __sys_setsockopt(int fd, int level, int optname, err = sock->ops->setsockopt(sock, level, optname, optval, optlen); + + if (kernel_optval) { + set_fs(oldfs); + kfree(kernel_optval); + } out_put: fput_light(sock->file, fput_needed); } @@ -2093,6 +2116,7 @@ static int __sys_getsockopt(int fd, int level, int optname, { int err, fput_needed; struct socket *sock; + int max_optlen; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { @@ -2100,6 +2124,8 @@ static int __sys_getsockopt(int fd, int level, int optname, if (err) goto out_put; + max_optlen = 
BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen); + if (level == SOL_SOCKET) err = sock_getsockopt(sock, level, optname, optval, @@ -2108,6 +2134,10 @@ static int __sys_getsockopt(int fd, int level, int optname, err = sock->ops->getsockopt(sock, level, optname, optval, optlen); + + err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname, + optval, optlen, + max_optlen, err); out_put: fput_light(sock->file, fput_needed); } -- cgit v1.2.3 From c8af5cd75e2411d5a5aacf115f59a5ff6b87f3fa Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 28 Jun 2019 11:12:34 +0200 Subject: xskmap: Move non-standard list manipulation to helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a helper in list.h for the non-standard way of clearing a list that is used in xskmap. This makes it easier to reuse it in the other map types, and also makes sure this usage is not forgotten in any list refactorings in the future. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann --- include/linux/list.h | 14 ++++++++++++++ kernel/bpf/xskmap.c | 3 +-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/list.h b/include/linux/list.h index e951228db4b2..85c92555e31f 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -106,6 +106,20 @@ static inline void __list_del(struct list_head * prev, struct list_head * next) WRITE_ONCE(prev->next, next); } +/* + * Delete a list entry and clear the 'prev' pointer. + * + * This is a special-purpose list clearing method used in the networking code + * for lists allocated as per-cpu, where we don't want to incur the extra + * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this + * needs to check the node 'prev' pointer instead of calling list_empty(). + */ +static inline void __list_del_clearprev(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->prev = NULL; +} + /** * list_del - deletes entry from list. * @entry: the element to delete from the list. diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index ef7338cebd18..9bb96ace9fa1 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -145,8 +145,7 @@ void __xsk_map_flush(struct bpf_map *map) list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { xsk_flush(xs); - __list_del(xs->flush_node.prev, xs->flush_node.next); - xs->flush_node.prev = NULL; + __list_del_clearprev(&xs->flush_node); } } -- cgit v1.2.3 From d5df2830ca9922d03a33940ea424c9a5f39f1162 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 28 Jun 2019 11:12:34 +0200 Subject: devmap/cpumap: Use flush list instead of bitmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The socket map uses a linked list instead of a bitmap to keep track of which entries to flush. Do the same for devmap and cpumap, as this means we don't have to care about the map index when enqueueing things into the map (and so we can cache the map lookup). 
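(Editor's condensed sketch, not the literal kernel code; the _sketch names are made up, while the flush_node test, list_add(), list_for_each_entry_safe(), bq_flush_to_queue() and __list_del_clearprev() all come from the hunks below.) The cpumap side of the new scheme in one place; devmap is analogous:

/* Enqueue: stage the frame in the per-CPU bulk queue and, the first
 * time, link the queue into this CPU's flush list. A NULL
 * flush_node.prev doubles as "not on the flush list yet".
 */
static int bq_enqueue_sketch(struct bpf_cpu_map_entry *rcpu,
			     struct xdp_frame *xdpf)
{
	struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list);
	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);

	if (bq->count == CPU_MAP_BULK_SIZE)
		bq_flush_to_queue(bq, true);

	bq->q[bq->count++] = xdpf;

	if (!bq->flush_node.prev)
		list_add(&bq->flush_node, flush_list);
	return 0;
}

/* Flush: walk only the queues that actually have pending frames;
 * bq_flush_to_queue() unlinks each one via __list_del_clearprev().
 */
static void cpu_map_flush_sketch(struct list_head *flush_list)
{
	struct xdp_bulk_queue *bq, *tmp;

	list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
		bq_flush_to_queue(bq, true);
}
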
Signed-off-by: Toke Høiland-Jørgensen Acked-by: Jonathan Lemon Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/cpumap.c | 105 +++++++++++++++++++++++---------------------------- kernel/bpf/devmap.c | 107 +++++++++++++++++++++++----------------------------- net/core/filter.c | 2 - 3 files changed, 95 insertions(+), 119 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 8dff08768087..ef49e17ae47c 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -32,14 +32,19 @@ /* General idea: XDP packets getting XDP redirected to another CPU, * will maximum be stored/queued for one driver ->poll() call. It is - * guaranteed that setting flush bit and flush operation happen on + * guaranteed that queueing the frame and the flush operation happen on * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr() * which queue in bpf_cpu_map_entry contains packets. */ #define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ +struct bpf_cpu_map_entry; +struct bpf_cpu_map; + struct xdp_bulk_queue { void *q[CPU_MAP_BULK_SIZE]; + struct list_head flush_node; + struct bpf_cpu_map_entry *obj; unsigned int count; }; @@ -52,6 +57,8 @@ struct bpf_cpu_map_entry { /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ struct xdp_bulk_queue __percpu *bulkq; + struct bpf_cpu_map *cmap; + /* Queue with potential multi-producers, and single-consumer kthread */ struct ptr_ring *queue; struct task_struct *kthread; @@ -65,23 +72,17 @@ struct bpf_cpu_map { struct bpf_map map; /* Below members specific for map type */ struct bpf_cpu_map_entry **cpu_map; - unsigned long __percpu *flush_needed; + struct list_head __percpu *flush_list; }; -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq, bool in_napi_ctx); - -static u64 cpu_map_bitmap_size(const union bpf_attr *attr) -{ - return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); -} +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { struct bpf_cpu_map *cmap; int err = -ENOMEM; + int ret, cpu; u64 cost; - int ret; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -105,7 +106,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); - cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); + cost += sizeof(struct list_head) * num_possible_cpus(); /* Notice returns -EPERM on if map size is larger than memlock limit */ ret = bpf_map_charge_init(&cmap->map.memory, cost); @@ -114,12 +115,13 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) goto free_cmap; } - /* A per cpu bitfield with a bit per possible CPU in map */ - cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), - __alignof__(unsigned long)); - if (!cmap->flush_needed) + cmap->flush_list = alloc_percpu(struct list_head); + if (!cmap->flush_list) goto free_charge; + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu)); + /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *), @@ -129,7 +131,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) return &cmap->map; free_percpu: - free_percpu(cmap->flush_needed); + free_percpu(cmap->flush_list); free_charge: bpf_map_charge_finish(&cmap->map.memory); 
free_cmap: @@ -334,7 +336,8 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; - int numa, err; + struct xdp_bulk_queue *bq; + int numa, err, i; /* Have map->numa_node, but choose node of redirect target CPU */ numa = cpu_to_node(cpu); @@ -349,6 +352,11 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, if (!rcpu->bulkq) goto free_rcu; + for_each_possible_cpu(i) { + bq = per_cpu_ptr(rcpu->bulkq, i); + bq->obj = rcpu; + } + /* Alloc queue */ rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); if (!rcpu->queue) @@ -405,7 +413,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); /* No concurrent bq_enqueue can run at this point */ - bq_flush_to_queue(rcpu, bq, false); + bq_flush_to_queue(bq, false); } free_percpu(rcpu->bulkq); /* Cannot kthread_stop() here, last put free rcpu resources */ @@ -488,6 +496,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); if (!rcpu) return -ENOMEM; + rcpu->cmap = cmap; } rcu_read_lock(); __cpu_map_entry_replace(cmap, key_cpu, rcpu); @@ -514,14 +523,14 @@ static void cpu_map_free(struct bpf_map *map) synchronize_rcu(); /* To ensure all pending flush operations have completed wait for flush - * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. - * Because the above synchronize_rcu() ensures the map is disconnected - * from the program we can assume no new bits will be set. + * list be empty on _all_ cpus. Because the above synchronize_rcu() + * ensures the map is disconnected from the program we can assume no new + * items will be added to the list. 
*/ for_each_online_cpu(cpu) { - unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu); + struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu); - while (!bitmap_empty(bitmap, cmap->map.max_entries)) + while (!list_empty(flush_list)) cond_resched(); } @@ -538,7 +547,7 @@ static void cpu_map_free(struct bpf_map *map) /* bq flush and cleanup happens after RCU graze-period */ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ } - free_percpu(cmap->flush_needed); + free_percpu(cmap->flush_list); bpf_map_area_free(cmap->cpu_map); kfree(cmap); } @@ -590,9 +599,9 @@ const struct bpf_map_ops cpu_map_ops = { .map_check_btf = map_check_no_btf, }; -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq, bool in_napi_ctx) +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) { + struct bpf_cpu_map_entry *rcpu = bq->obj; unsigned int processed = 0, drops = 0; const int to_cpu = rcpu->cpu; struct ptr_ring *q; @@ -621,6 +630,8 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, bq->count = 0; spin_unlock(&q->producer_lock); + __list_del_clearprev(&bq->flush_node); + /* Feedback loop via tracepoints */ trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); return 0; @@ -631,10 +642,11 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, */ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { + struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) - bq_flush_to_queue(rcpu, bq, true); + bq_flush_to_queue(bq, true); /* Notice, xdp_buff/page MUST be queued here, long enough for * driver to code invoking us to finished, due to driver @@ -646,6 +658,10 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) * operation, when completing napi->poll call. */ bq->q[bq->count++] = xdpf; + + if (!bq->flush_node.prev) + list_add(&bq->flush_node, flush_list); + return 0; } @@ -665,41 +681,16 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, return 0; } -void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) -{ - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); - - __set_bit(bit, bitmap); -} - void __cpu_map_flush(struct bpf_map *map) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); - u32 bit; - - /* The napi->poll softirq makes sure __cpu_map_insert_ctx() - * and __cpu_map_flush() happen on same CPU. Thus, the percpu - * bitmap indicate which percpu bulkq have packets. - */ - for_each_set_bit(bit, bitmap, map->max_entries) { - struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]); - struct xdp_bulk_queue *bq; - - /* This is possible if entry is removed by user space - * between xdp redirect and flush op. 
- */ - if (unlikely(!rcpu)) - continue; - - __clear_bit(bit, bitmap); + struct list_head *flush_list = this_cpu_ptr(cmap->flush_list); + struct xdp_bulk_queue *bq, *tmp; - /* Flush all frames in bulkq to real queue */ - bq = this_cpu_ptr(rcpu->bulkq); - bq_flush_to_queue(rcpu, bq, true); + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { + bq_flush_to_queue(bq, true); /* If already running, costs spin_lock_irqsave + smb_mb */ - wake_up_process(rcpu->kthread); + wake_up_process(bq->obj->kthread); } } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 40e86a7e0ef0..a4dddc867cbf 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -17,9 +17,8 @@ * datapath always has a valid copy. However, the datapath does a "flush" * operation that pushes any pending packets in the driver outside the RCU * critical section. Each bpf_dtab_netdev tracks these pending operations using - * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed - * until all bits are cleared indicating outstanding flush operations have - * completed. + * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until + * this list is empty, indicating outstanding flush operations have completed. * * BPF syscalls may race with BPF program calls on any of the update, delete * or lookup operations. As noted above the xchg() operation also keep the @@ -48,9 +47,13 @@ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) #define DEV_MAP_BULK_SIZE 16 +struct bpf_dtab_netdev; + struct xdp_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; + struct list_head flush_node; struct net_device *dev_rx; + struct bpf_dtab_netdev *obj; unsigned int count; }; @@ -65,23 +68,18 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; - unsigned long __percpu *flush_needed; + struct list_head __percpu *flush_list; struct list_head list; }; static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); -static u64 dev_map_bitmap_size(const union bpf_attr *attr) -{ - return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long); -} - static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; + int err, cpu; u64 cost; - int err; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -99,7 +97,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); - cost += dev_map_bitmap_size(attr) * num_possible_cpus(); + cost += sizeof(struct list_head) * num_possible_cpus(); /* if map size is larger than memlock limit, reject it */ err = bpf_map_charge_init(&dtab->map.memory, cost); @@ -108,28 +106,30 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) err = -ENOMEM; - /* A per cpu bitfield with a bit per possible net device */ - dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr), - __alignof__(unsigned long), - GFP_KERNEL | __GFP_NOWARN); - if (!dtab->flush_needed) + dtab->flush_list = alloc_percpu(struct list_head); + if (!dtab->flush_list) goto free_charge; + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); + dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) - goto free_charge; + goto free_percpu; spin_lock(&dev_map_lock); list_add_tail_rcu(&dtab->list, &dev_map_list); spin_unlock(&dev_map_lock); return &dtab->map; + +free_percpu: + 
free_percpu(dtab->flush_list); free_charge: bpf_map_charge_finish(&dtab->map.memory); free_dtab: - free_percpu(dtab->flush_needed); kfree(dtab); return ERR_PTR(err); } @@ -158,14 +158,14 @@ static void dev_map_free(struct bpf_map *map) rcu_barrier(); /* To ensure all pending flush operations have completed wait for flush - * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. + * list to empty on _all_ cpus. * Because the above synchronize_rcu() ensures the map is disconnected - * from the program we can assume no new bits will be set. + * from the program we can assume no new items will be added. */ for_each_online_cpu(cpu) { - unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu); + struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu); - while (!bitmap_empty(bitmap, dtab->map.max_entries)) + while (!list_empty(flush_list)) cond_resched(); } @@ -181,7 +181,7 @@ static void dev_map_free(struct bpf_map *map) kfree(dev); } - free_percpu(dtab->flush_needed); + free_percpu(dtab->flush_list); bpf_map_area_free(dtab->netdev_map); kfree(dtab); } @@ -203,18 +203,10 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) -{ - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); - - __set_bit(bit, bitmap); -} - -static int bq_xmit_all(struct bpf_dtab_netdev *obj, - struct xdp_bulk_queue *bq, u32 flags, +static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, bool in_napi_ctx) { + struct bpf_dtab_netdev *obj = bq->obj; struct net_device *dev = obj->dev; int sent = 0, drops = 0, err = 0; int i; @@ -241,6 +233,7 @@ out: trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, sent, drops, bq->dev_rx, dev, err); bq->dev_rx = NULL; + __list_del_clearprev(&bq->flush_node); return 0; error: /* If ndo_xdp_xmit fails with an errno, no frames have been @@ -263,31 +256,18 @@ error: * from the driver before returning from its napi->poll() routine. The poll() * routine is called either from busy_poll context or net_rx_action signaled * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the - * net device can be torn down. On devmap tear down we ensure the ctx bitmap - * is zeroed before completing to ensure all flush operations have completed. + * net device can be torn down. On devmap tear down we ensure the flush list + * is empty before completing to ensure all flush operations have completed. */ void __dev_map_flush(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); - u32 bit; + struct list_head *flush_list = this_cpu_ptr(dtab->flush_list); + struct xdp_bulk_queue *bq, *tmp; rcu_read_lock(); - for_each_set_bit(bit, bitmap, map->max_entries) { - struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); - struct xdp_bulk_queue *bq; - - /* This is possible if the dev entry is removed by user space - * between xdp redirect and flush op. 
- */ - if (unlikely(!dev)) - continue; - - bq = this_cpu_ptr(dev->bulkq); - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); - - __clear_bit(bit, bitmap); - } + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) + bq_xmit_all(bq, XDP_XMIT_FLUSH, true); rcu_read_unlock(); } @@ -314,10 +294,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct net_device *dev_rx) { + struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) - bq_xmit_all(obj, bq, 0, true); + bq_xmit_all(bq, 0, true); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed @@ -327,6 +308,10 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, bq->dev_rx = dev_rx; bq->q[bq->count++] = xdpf; + + if (!bq->flush_node.prev) + list_add(&bq->flush_node, flush_list); + return 0; } @@ -377,17 +362,12 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { if (dev->dev->netdev_ops->ndo_xdp_xmit) { struct xdp_bulk_queue *bq; - unsigned long *bitmap; - int cpu; rcu_read_lock(); for_each_online_cpu(cpu) { - bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); - __clear_bit(dev->bit, bitmap); - bq = per_cpu_ptr(dev->bulkq, cpu); - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); + bq_xmit_all(bq, XDP_XMIT_FLUSH, false); } rcu_read_unlock(); } @@ -434,8 +414,10 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, struct net *net = current->nsproxy->net_ns; gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; struct bpf_dtab_netdev *dev, *old_dev; - u32 i = *(u32 *)key; u32 ifindex = *(u32 *)value; + struct xdp_bulk_queue *bq; + u32 i = *(u32 *)key; + int cpu; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; @@ -458,6 +440,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, return -ENOMEM; } + for_each_possible_cpu(cpu) { + bq = per_cpu_ptr(dev->bulkq, cpu); + bq->obj = dev; + } + dev->dev = dev_get_by_index(net, ifindex); if (!dev->dev) { free_percpu(dev->bulkq); diff --git a/net/core/filter.c b/net/core/filter.c index dc8534be12fc..1e5fd37e9ab5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3523,7 +3523,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, err = dev_map_enqueue(dst, xdp, dev_rx); if (unlikely(err)) return err; - __dev_map_insert_ctx(map, index); break; } case BPF_MAP_TYPE_CPUMAP: { @@ -3532,7 +3531,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, err = cpu_map_enqueue(rcpu, xdp, dev_rx); if (unlikely(err)) return err; - __cpu_map_insert_ctx(map, index); break; } case BPF_MAP_TYPE_XSKMAP: { -- cgit v1.2.3 From 0cdbb4b09a0658b72c563638d476113aadd91afb Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 28 Jun 2019 11:12:35 +0200 Subject: devmap: Allow map lookups from eBPF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We don't currently allow lookups into a devmap from eBPF, because the map lookup returns a pointer directly to the dev->ifindex, which shouldn't be modifiable from eBPF. However, being able to do lookups in devmaps is useful to know (e.g.) whether forwarding to a specific interface is enabled. Currently, programs work around this by keeping a shadow map of another type which indicates whether a map index is valid. 
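(Editor's illustration of the use case just described, not part of the patch.) An XDP program that consults the devmap directly instead of keeping a shadow map; the map name, program name and the open-coded bpf_map_def/SEC()/helper declarations are assumptions of this sketch (they mirror the classic bpf_helpers.h style):

#include <linux/bpf.h>

#define SEC(name) __attribute__((section(name), used))

static void *(*bpf_map_lookup_elem)(void *map, const void *key) =
	(void *) BPF_FUNC_map_lookup_elem;
static int (*bpf_redirect_map)(void *map, int key, int flags) =
	(void *) BPF_FUNC_redirect_map;

struct bpf_map_def {
	unsigned int type;
	unsigned int key_size;
	unsigned int value_size;
	unsigned int max_entries;
	unsigned int map_flags;
};

struct bpf_map_def SEC("maps") tx_port = {
	.type		= BPF_MAP_TYPE_DEVMAP,
	.key_size	= sizeof(int),
	.value_size	= sizeof(int),
	.max_entries	= 64,
};

SEC("xdp")
int xdp_fwd_if_enabled(struct xdp_md *ctx)
{
	int key = 0;	/* egress slot; a real program would pick this by policy */

	/* With this patch a lookup is allowed (read-only): NULL means no
	 * interface is configured at this index, so fall back to the stack.
	 */
	if (!bpf_map_lookup_elem(&tx_port, &key))
		return XDP_PASS;

	return bpf_redirect_map(&tx_port, key, 0);
}

char _license[] SEC("license") = "GPL";
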
Since we now have a flag to make maps read-only from the eBPF side, we can simply lift the lookup restriction if we make sure this flag is always set. Signed-off-by: Toke Høiland-Jørgensen Acked-by: Jonathan Lemon Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 5 +++++ kernel/bpf/verifier.c | 7 ++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a4dddc867cbf..d83cf8ccc872 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -89,6 +89,11 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); + /* Lookup returns a pointer straight to dev->ifindex, so make sure the + * verifier prevents writes from the BPF side + */ + attr->map_flags |= BPF_F_RDONLY_PROG; + dtab = kzalloc(sizeof(*dtab), GFP_USER); if (!dtab) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6b5623d320f9..9b6ee93d5a85 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3414,12 +3414,9 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_get_local_storage) goto error; break; - /* devmap returns a pointer to a live net_device ifindex that we cannot - * allow to be modified from bpf side. So do not allow lookup elements - * for now. - */ case BPF_MAP_TYPE_DEVMAP: - if (func_id != BPF_FUNC_redirect_map) + if (func_id != BPF_FUNC_redirect_map && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; /* Restrict bpf side of cpumap and xskmap, open when use-cases -- cgit v1.2.3 From a3ce685dd01a786fa5bc388e47d0066a4f842591 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 28 Jun 2019 09:24:09 -0700 Subject: bpf: fix precision tracking When equivalent state is found the current state needs to propagate precision marks. Otherwise the verifier will prune the search incorrectly. There is a price for correctness: before before broken fixed cnst spill precise precise bpf_lb-DLB_L3.o 1923 8128 1863 1898 bpf_lb-DLB_L4.o 3077 6707 2468 2666 bpf_lb-DUNKNOWN.o 1062 1062 544 544 bpf_lxc-DDROP_ALL.o 166729 380712 22629 36823 bpf_lxc-DUNKNOWN.o 174607 440652 28805 45325 bpf_netdev.o 8407 31904 6801 7002 bpf_overlay.o 5420 23569 4754 4858 bpf_lxc_jit.o 39389 359445 50925 69631 Overall precision tracking is still very effective. Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Reported-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Tested-by: Lawrence Brakmo Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 121 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 107 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9b6ee93d5a85..a2e763703c30 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1659,16 +1659,18 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env, } } -static int mark_chain_precision(struct bpf_verifier_env *env, int regno) +static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, + int spi) { struct bpf_verifier_state *st = env->cur_state; int first_idx = st->first_insn_idx; int last_idx = env->insn_idx; struct bpf_func_state *func; struct bpf_reg_state *reg; - u32 reg_mask = 1u << regno; - u64 stack_mask = 0; + u32 reg_mask = regno >= 0 ? 1u << regno : 0; + u64 stack_mask = spi >= 0 ? 
1ull << spi : 0; bool skip_first = true; + bool new_marks = false; int i, err; if (!env->allow_ptr_leaks) @@ -1676,18 +1678,43 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno) return 0; func = st->frame[st->curframe]; - reg = &func->regs[regno]; - if (reg->type != SCALAR_VALUE) { - WARN_ONCE(1, "backtracing misuse"); - return -EFAULT; + if (regno >= 0) { + reg = &func->regs[regno]; + if (reg->type != SCALAR_VALUE) { + WARN_ONCE(1, "backtracing misuse"); + return -EFAULT; + } + if (!reg->precise) + new_marks = true; + else + reg_mask = 0; + reg->precise = true; } - if (reg->precise) - return 0; - func->regs[regno].precise = true; + while (spi >= 0) { + if (func->stack[spi].slot_type[0] != STACK_SPILL) { + stack_mask = 0; + break; + } + reg = &func->stack[spi].spilled_ptr; + if (reg->type != SCALAR_VALUE) { + stack_mask = 0; + break; + } + if (!reg->precise) + new_marks = true; + else + stack_mask = 0; + reg->precise = true; + break; + } + + if (!new_marks) + return 0; + if (!reg_mask && !stack_mask) + return 0; for (;;) { DECLARE_BITMAP(mask, 64); - bool new_marks = false; u32 history = st->jmp_history_cnt; if (env->log.level & BPF_LOG_LEVEL) @@ -1730,12 +1757,15 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno) if (!st) break; + new_marks = false; func = st->frame[st->curframe]; bitmap_from_u64(mask, reg_mask); for_each_set_bit(i, mask, 32) { reg = &func->regs[i]; - if (reg->type != SCALAR_VALUE) + if (reg->type != SCALAR_VALUE) { + reg_mask &= ~(1u << i); continue; + } if (!reg->precise) new_marks = true; reg->precise = true; @@ -1756,11 +1786,15 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno) return -EFAULT; } - if (func->stack[i].slot_type[0] != STACK_SPILL) + if (func->stack[i].slot_type[0] != STACK_SPILL) { + stack_mask &= ~(1ull << i); continue; + } reg = &func->stack[i].spilled_ptr; - if (reg->type != SCALAR_VALUE) + if (reg->type != SCALAR_VALUE) { + stack_mask &= ~(1ull << i); continue; + } if (!reg->precise) new_marks = true; reg->precise = true; @@ -1772,6 +1806,8 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno) reg_mask, stack_mask); } + if (!reg_mask && !stack_mask) + break; if (!new_marks) break; @@ -1781,6 +1817,15 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno) return 0; } +static int mark_chain_precision(struct bpf_verifier_env *env, int regno) +{ + return __mark_chain_precision(env, regno, -1); +} + +static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi) +{ + return __mark_chain_precision(env, -1, spi); +} static bool is_spillable_regtype(enum bpf_reg_type type) { @@ -7111,6 +7156,46 @@ static int propagate_liveness(struct bpf_verifier_env *env, return 0; } +/* find precise scalars in the previous equivalent state and + * propagate them into the current state + */ +static int propagate_precision(struct bpf_verifier_env *env, + const struct bpf_verifier_state *old) +{ + struct bpf_reg_state *state_reg; + struct bpf_func_state *state; + int i, err = 0; + + state = old->frame[old->curframe]; + state_reg = state->regs; + for (i = 0; i < BPF_REG_FP; i++, state_reg++) { + if (state_reg->type != SCALAR_VALUE || + !state_reg->precise) + continue; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "propagating r%d\n", i); + err = mark_chain_precision(env, i); + if (err < 0) + return err; + } + + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + 
state_reg = &state->stack[i].spilled_ptr; + if (state_reg->type != SCALAR_VALUE || + !state_reg->precise) + continue; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "propagating fp%d\n", + (-i - 1) * BPF_REG_SIZE); + err = mark_chain_precision_stack(env, i); + if (err < 0) + return err; + } + return 0; +} + static bool states_maybe_looping(struct bpf_verifier_state *old, struct bpf_verifier_state *cur) { @@ -7203,6 +7288,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * this state and will pop a new one. */ err = propagate_liveness(env, &sl->state, cur); + + /* if previous state reached the exit with precision and + * current state is equivalent to it (except precsion marks) + * the precision needs to be propagated back in + * the current state. + */ + err = err ? : push_jmp_history(env, cur); + err = err ? : propagate_precision(env, &sl->state); if (err) return err; return 1; -- cgit v1.2.3