From 441420a1f0b3031f228453697406c86f110e59d4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 1 Mar 2020 00:10:43 -0800 Subject: bpf: Reliably preserve btf_trace_xxx types btf_trace_xxx types, crucial for tp_btf BPF programs (raw tracepoint with verifier-checked direct memory access), have to be preserved in kernel BTF to allow verifier do its job and enforce type/memory safety. It was reported ([0]) that for kernels built with Clang current type-casting approach doesn't preserve these types. This patch fixes it by declaring an anonymous union for each registered tracepoint, capturing both struct bpf_raw_event_map information, as well as recording btf_trace_##call type reliably. Structurally, it's still the same content as for a plain struct bpf_raw_event_map, so no other changes are necessary. [0] https://github.com/iovisor/bcc/issues/2770#issuecomment-591007692 Fixes: e8c423fb31fa ("bpf: Add typecast to raw_tracepoints to help BTF generation") Reported-by: Wenbo Zhang Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200301081045.3491005-2-andriin@fb.com --- include/trace/bpf_probe.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index b04c29270973..1ce3be63add1 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -75,13 +75,17 @@ static inline void bpf_test_probe_##call(void) \ check_trace_callback_type_##call(__bpf_trace_##template); \ } \ typedef void (*btf_trace_##call)(void *__data, proto); \ -static struct bpf_raw_event_map __used \ - __attribute__((section("__bpf_raw_tp_map"))) \ -__bpf_trace_tp_map_##call = { \ - .tp = &__tracepoint_##call, \ - .bpf_func = (void *)(btf_trace_##call)__bpf_trace_##template, \ - .num_args = COUNT_ARGS(args), \ - .writable_size = size, \ +static union { \ + struct bpf_raw_event_map event; \ + btf_trace_##call handler; \ +} __bpf_trace_tp_map_##call __used \ +__attribute__((section("__bpf_raw_tp_map"))) = { \ + .event = { \ + .tp = &__tracepoint_##call, \ + .bpf_func = __bpf_trace_##template, \ + .num_args = COUNT_ARGS(args), \ + .writable_size = size, \ + }, \ }; #define FIRST(x, ...) x -- cgit v1.2.3 From 70ed506c3bbcfa846d4636b23051ca79fa4781f7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 2 Mar 2020 20:31:57 -0800 Subject: bpf: Introduce pinnable bpf_link abstraction Introduce bpf_link abstraction, representing an attachment of BPF program to a BPF hook point (e.g., tracepoint, perf event, etc). bpf_link encapsulates ownership of attached BPF program, reference counting of a link itself, when reference from multiple anonymous inodes, as well as ensures that release callback will be called from a process context, so that users can safely take mutex locks and sleep. Additionally, with a new abstraction it's now possible to generalize pinning of a link object in BPF FS, allowing to explicitly prevent BPF program detachment on process exit by pinning it in a BPF FS and let it open from independent other process to keep working with it. Convert two existing bpf_link-like objects (raw tracepoint and tracing BPF program attachments) into utilizing bpf_link framework, making them pinnable in BPF FS. More FD-based bpf_links will be added in follow up patches. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200303043159.323675-2-andriin@fb.com --- include/linux/bpf.h | 13 +++ kernel/bpf/inode.c | 42 +++++++++- kernel/bpf/syscall.c | 223 +++++++++++++++++++++++++++++++++++++++++---------- 3 files changed, 232 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6015a4daf118..f13c78c6f29d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1056,6 +1056,19 @@ extern int sysctl_unprivileged_bpf_disabled; int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); +struct bpf_link; + +struct bpf_link_ops { + void (*release)(struct bpf_link *link); +}; + +void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, + struct bpf_prog *prog); +void bpf_link_inc(struct bpf_link *link); +void bpf_link_put(struct bpf_link *link); +int bpf_link_new_fd(struct bpf_link *link); +struct bpf_link *bpf_link_get_from_fd(u32 ufd); + int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname, int flags); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5e40e7fccc21..95087d9f4ed3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -25,6 +25,7 @@ enum bpf_type { BPF_TYPE_UNSPEC = 0, BPF_TYPE_PROG, BPF_TYPE_MAP, + BPF_TYPE_LINK, }; static void *bpf_any_get(void *raw, enum bpf_type type) @@ -36,6 +37,9 @@ static void *bpf_any_get(void *raw, enum bpf_type type) case BPF_TYPE_MAP: bpf_map_inc_with_uref(raw); break; + case BPF_TYPE_LINK: + bpf_link_inc(raw); + break; default: WARN_ON_ONCE(1); break; @@ -53,6 +57,9 @@ static void bpf_any_put(void *raw, enum bpf_type type) case BPF_TYPE_MAP: bpf_map_put_with_uref(raw); break; + case BPF_TYPE_LINK: + bpf_link_put(raw); + break; default: WARN_ON_ONCE(1); break; @@ -63,20 +70,32 @@ static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) { void *raw; - *type = BPF_TYPE_MAP; raw = bpf_map_get_with_uref(ufd); - if (IS_ERR(raw)) { + if (!IS_ERR(raw)) { + *type = BPF_TYPE_MAP; + return raw; + } + + raw = bpf_prog_get(ufd); + if (!IS_ERR(raw)) { *type = BPF_TYPE_PROG; - raw = bpf_prog_get(ufd); + return raw; } - return raw; + raw = bpf_link_get_from_fd(ufd); + if (!IS_ERR(raw)) { + *type = BPF_TYPE_LINK; + return raw; + } + + return ERR_PTR(-EINVAL); } static const struct inode_operations bpf_dir_iops; static const struct inode_operations bpf_prog_iops = { }; static const struct inode_operations bpf_map_iops = { }; +static const struct inode_operations bpf_link_iops = { }; static struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir, @@ -114,6 +133,8 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) *type = BPF_TYPE_PROG; else if (inode->i_op == &bpf_map_iops) *type = BPF_TYPE_MAP; + else if (inode->i_op == &bpf_link_iops) + *type = BPF_TYPE_LINK; else return -EACCES; @@ -335,6 +356,12 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) &bpffs_map_fops : &bpffs_obj_fops); } +static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg) +{ + return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops, + &bpffs_obj_fops); +} + static struct dentry * bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { @@ -411,6 +438,9 @@ static int bpf_obj_do_pin(const char __user *pathname, void *raw, case BPF_TYPE_MAP: ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw); break; + case BPF_TYPE_LINK: + ret = vfs_mkobj(dentry, mode, bpf_mklink, raw); + break; default: ret = -EPERM; } @@ -487,6 +517,8 @@ int bpf_obj_get_user(const char __user *pathname, int flags) ret = bpf_prog_new_fd(raw); else if (type == BPF_TYPE_MAP) ret = bpf_map_new_fd(raw, f_flags); + else if (type == BPF_TYPE_LINK) + ret = bpf_link_new_fd(raw); else return -ENOENT; @@ -504,6 +536,8 @@ static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type if (inode->i_op == &bpf_map_iops) return ERR_PTR(-EINVAL); + if (inode->i_op == &bpf_link_iops) + return ERR_PTR(-EINVAL); if (inode->i_op != &bpf_prog_iops) return ERR_PTR(-EACCES); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c536c65256ad..13de65363ba2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2173,24 +2173,154 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -static int bpf_tracing_prog_release(struct inode *inode, struct file *filp) +struct bpf_link { + atomic64_t refcnt; + const struct bpf_link_ops *ops; + struct bpf_prog *prog; + struct work_struct work; +}; + +void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, + struct bpf_prog *prog) { - struct bpf_prog *prog = filp->private_data; + atomic64_set(&link->refcnt, 1); + link->ops = ops; + link->prog = prog; +} + +void bpf_link_inc(struct bpf_link *link) +{ + atomic64_inc(&link->refcnt); +} + +/* bpf_link_free is guaranteed to be called from process context */ +static void bpf_link_free(struct bpf_link *link) +{ + struct bpf_prog *prog; - WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog)); + /* remember prog locally, because release below will free link memory */ + prog = link->prog; + /* extra clean up and kfree of container link struct */ + link->ops->release(link); + /* no more accesing of link members after this point */ bpf_prog_put(prog); +} + +static void bpf_link_put_deferred(struct work_struct *work) +{ + struct bpf_link *link = container_of(work, struct bpf_link, work); + + bpf_link_free(link); +} + +/* bpf_link_put can be called from atomic context, but ensures that resources + * are freed from process context + */ +void bpf_link_put(struct bpf_link *link) +{ + if (!atomic64_dec_and_test(&link->refcnt)) + return; + + if (in_atomic()) { + INIT_WORK(&link->work, bpf_link_put_deferred); + schedule_work(&link->work); + } else { + bpf_link_free(link); + } +} + +static int bpf_link_release(struct inode *inode, struct file *filp) +{ + struct bpf_link *link = filp->private_data; + + bpf_link_put(link); return 0; } -static const struct file_operations bpf_tracing_prog_fops = { - .release = bpf_tracing_prog_release, +#ifdef CONFIG_PROC_FS +static const struct bpf_link_ops bpf_raw_tp_lops; +static const struct bpf_link_ops bpf_tracing_link_lops; +static const struct bpf_link_ops bpf_xdp_link_lops; + +static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) +{ + const struct bpf_link *link = filp->private_data; + const struct bpf_prog *prog = link->prog; + char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; + const char *link_type; + + if (link->ops == &bpf_raw_tp_lops) + link_type = "raw_tracepoint"; + else if (link->ops == &bpf_tracing_link_lops) + link_type = "tracing"; + else + link_type = "unknown"; + + bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); + seq_printf(m, + "link_type:\t%s\n" + "prog_tag:\t%s\n" + "prog_id:\t%u\n", + link_type, + prog_tag, + prog->aux->id); +} +#endif + +const struct file_operations bpf_link_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_link_show_fdinfo, +#endif + .release = bpf_link_release, .read = bpf_dummy_read, .write = bpf_dummy_write, }; +int bpf_link_new_fd(struct bpf_link *link) +{ + return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); +} + +struct bpf_link *bpf_link_get_from_fd(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_link *link; + + if (!f.file) + return ERR_PTR(-EBADF); + if (f.file->f_op != &bpf_link_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + link = f.file->private_data; + bpf_link_inc(link); + fdput(f); + + return link; +} + +struct bpf_tracing_link { + struct bpf_link link; +}; + +static void bpf_tracing_link_release(struct bpf_link *link) +{ + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog)); + kfree(tr_link); +} + +static const struct bpf_link_ops bpf_tracing_link_lops = { + .release = bpf_tracing_link_release, +}; + static int bpf_tracing_prog_attach(struct bpf_prog *prog) { - int tr_fd, err; + struct bpf_tracing_link *link; + int link_fd, err; if (prog->expected_attach_type != BPF_TRACE_FENTRY && prog->expected_attach_type != BPF_TRACE_FEXIT && @@ -2199,58 +2329,61 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) goto out_put_prog; } + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out_put_prog; + } + bpf_link_init(&link->link, &bpf_tracing_link_lops, prog); + err = bpf_trampoline_link_prog(prog); if (err) - goto out_put_prog; + goto out_free_link; - tr_fd = anon_inode_getfd("bpf-tracing-prog", &bpf_tracing_prog_fops, - prog, O_CLOEXEC); - if (tr_fd < 0) { + link_fd = bpf_link_new_fd(&link->link); + if (link_fd < 0) { WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog)); - err = tr_fd; - goto out_put_prog; + err = link_fd; + goto out_free_link; } - return tr_fd; + return link_fd; +out_free_link: + kfree(link); out_put_prog: bpf_prog_put(prog); return err; } -struct bpf_raw_tracepoint { +struct bpf_raw_tp_link { + struct bpf_link link; struct bpf_raw_event_map *btp; - struct bpf_prog *prog; }; -static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp) +static void bpf_raw_tp_link_release(struct bpf_link *link) { - struct bpf_raw_tracepoint *raw_tp = filp->private_data; + struct bpf_raw_tp_link *raw_tp = + container_of(link, struct bpf_raw_tp_link, link); - if (raw_tp->prog) { - bpf_probe_unregister(raw_tp->btp, raw_tp->prog); - bpf_prog_put(raw_tp->prog); - } + bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog); bpf_put_raw_tracepoint(raw_tp->btp); kfree(raw_tp); - return 0; } -static const struct file_operations bpf_raw_tp_fops = { - .release = bpf_raw_tracepoint_release, - .read = bpf_dummy_read, - .write = bpf_dummy_write, +static const struct bpf_link_ops bpf_raw_tp_lops = { + .release = bpf_raw_tp_link_release, }; #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd static int bpf_raw_tracepoint_open(const union bpf_attr *attr) { - struct bpf_raw_tracepoint *raw_tp; + struct bpf_raw_tp_link *raw_tp; struct bpf_raw_event_map *btp; struct bpf_prog *prog; const char *tp_name; char buf[128]; - int tp_fd, err; + int link_fd, err; if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) return -EINVAL; @@ -2302,21 +2435,20 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) err = -ENOMEM; goto out_put_btp; } + bpf_link_init(&raw_tp->link, &bpf_raw_tp_lops, prog); raw_tp->btp = btp; - raw_tp->prog = prog; err = bpf_probe_register(raw_tp->btp, prog); if (err) goto out_free_tp; - tp_fd = anon_inode_getfd("bpf-raw-tracepoint", &bpf_raw_tp_fops, raw_tp, - O_CLOEXEC); - if (tp_fd < 0) { + link_fd = bpf_link_new_fd(&raw_tp->link); + if (link_fd < 0) { bpf_probe_unregister(raw_tp->btp, prog); - err = tp_fd; + err = link_fd; goto out_free_tp; } - return tp_fd; + return link_fd; out_free_tp: kfree(raw_tp); @@ -3266,15 +3398,21 @@ static int bpf_task_fd_query(const union bpf_attr *attr, if (err) goto out; - if (file->f_op == &bpf_raw_tp_fops) { - struct bpf_raw_tracepoint *raw_tp = file->private_data; - struct bpf_raw_event_map *btp = raw_tp->btp; + if (file->f_op == &bpf_link_fops) { + struct bpf_link *link = file->private_data; - err = bpf_task_fd_query_copy(attr, uattr, - raw_tp->prog->aux->id, - BPF_FD_TYPE_RAW_TRACEPOINT, - btp->tp->name, 0, 0); - goto put_file; + if (link->ops == &bpf_raw_tp_lops) { + struct bpf_raw_tp_link *raw_tp = + container_of(link, struct bpf_raw_tp_link, link); + struct bpf_raw_event_map *btp = raw_tp->btp; + + err = bpf_task_fd_query_copy(attr, uattr, + raw_tp->link.prog->aux->id, + BPF_FD_TYPE_RAW_TRACEPOINT, + btp->tp->name, 0, 0); + goto put_file; + } + goto out_not_supp; } event = perf_get_event(file); @@ -3294,6 +3432,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr, goto put_file; } +out_not_supp: err = -ENOTSUPP; put_file: fput(file); -- cgit v1.2.3 From cf62089b0edd7e74a1f474844b4d9f7b5697fb5c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 3 Mar 2020 15:05:01 -0500 Subject: bpf: Add gso_size to __sk_buff BPF programs may want to know whether an skb is gso. The canonical answer is skb_is_gso(skb), which tests that gso_size != 0. Expose this field in the same manner as gso_segs. That field itself is not a sufficient signal, as the comment in skb_shared_info makes clear: gso_segs may be zero, e.g., from dodgy sources. Also prepare net/bpf/test_run for upcoming BPF_PROG_TEST_RUN tests of the feature. Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200303200503.226217-2-willemdebruijn.kernel@gmail.com --- include/uapi/linux/bpf.h | 1 + net/bpf/test_run.c | 7 +++++++ net/core/filter.c | 44 ++++++++++++++++++++++++++++++-------------- 3 files changed, 38 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8e98ced0963b..180337fae97e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3176,6 +3176,7 @@ struct __sk_buff { __u32 wire_len; __u32 gso_segs; __bpf_md_ptr(struct bpf_sock *, sk); + __u32 gso_size; }; struct bpf_tunnel_key { diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 562443f94133..1cd7a1c2f8b2 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -277,6 +277,12 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) /* gso_segs is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_segs), + offsetof(struct __sk_buff, gso_size))) + return -EINVAL; + + /* gso_size is allowed */ + + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_size), sizeof(struct __sk_buff))) return -EINVAL; @@ -297,6 +303,7 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) if (__skb->gso_segs > GSO_MAX_SEGS) return -EINVAL; skb_shinfo(skb)->gso_segs = __skb->gso_segs; + skb_shinfo(skb)->gso_size = __skb->gso_size; return 0; } diff --git a/net/core/filter.c b/net/core/filter.c index 4a08c9fb2be7..cd0a532db4e7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7139,6 +7139,27 @@ static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + /* si->dst_reg = skb_shinfo(SKB); */ +#ifdef NET_SKBUFF_DATA_USES_OFFSET + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, end)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, head)); + *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); +#else + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, end)); +#endif + + return insn; +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -7461,26 +7482,21 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, gso_segs): - /* si->dst_reg = skb_shinfo(SKB); */ -#ifdef NET_SKBUFF_DATA_USES_OFFSET - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), - BPF_REG_AX, si->src_reg, - offsetof(struct sk_buff, end)); - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), - si->dst_reg, si->src_reg, - offsetof(struct sk_buff, head)); - *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); -#else - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), - si->dst_reg, si->src_reg, - offsetof(struct sk_buff, end)); -#endif + insn = bpf_convert_shinfo_access(si, insn); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs), si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, gso_segs, 2, target_size)); break; + case offsetof(struct __sk_buff, gso_size): + insn = bpf_convert_shinfo_access(si, insn); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size), + si->dst_reg, si->dst_reg, + bpf_target_off(struct skb_shared_info, + gso_size, 2, + target_size)); + break; case offsetof(struct __sk_buff, wire_len): BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4); -- cgit v1.2.3 From 1aae4bdd787998ea331a56f3db9d8595790fe2f9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 2 Mar 2020 16:32:31 -0800 Subject: bpf: Switch BPF UAPI #define constants used from BPF program side to enums Switch BPF UAPI constants, previously defined as #define macro, to anonymous enum values. This preserves constants values and behavior in expressions, but has added advantaged of being captured as part of DWARF and, subsequently, BTF type info. Which, in turn, greatly improves usefulness of generated vmlinux.h for BPF applications, as it will not require BPF users to copy/paste various flags and constants, which are frequently used with BPF helpers. Only those constants that are used/useful from BPF program side are converted. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200303003233.3496043-2-andriin@fb.com --- include/uapi/linux/bpf.h | 175 +++++++++++++++++++++++++--------------- tools/include/uapi/linux/bpf.h | 177 +++++++++++++++++++++++++---------------- 2 files changed, 219 insertions(+), 133 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 180337fae97e..d6b33ea27bcc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -325,44 +325,46 @@ enum bpf_attach_type { #define BPF_PSEUDO_CALL 1 /* flags for BPF_MAP_UPDATE_ELEM command */ -#define BPF_ANY 0 /* create new element or update existing */ -#define BPF_NOEXIST 1 /* create new element if it didn't exist */ -#define BPF_EXIST 2 /* update existing element */ -#define BPF_F_LOCK 4 /* spin_lock-ed map_lookup/map_update */ +enum { + BPF_ANY = 0, /* create new element or update existing */ + BPF_NOEXIST = 1, /* create new element if it didn't exist */ + BPF_EXIST = 2, /* update existing element */ + BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ +}; /* flags for BPF_MAP_CREATE command */ -#define BPF_F_NO_PREALLOC (1U << 0) +enum { + BPF_F_NO_PREALLOC = (1U << 0), /* Instead of having one common LRU list in the * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list * which can scale and perform better. * Note, the LRU nodes (including free nodes) cannot be moved * across different LRU lists. */ -#define BPF_F_NO_COMMON_LRU (1U << 1) + BPF_F_NO_COMMON_LRU = (1U << 1), /* Specify numa node during map creation */ -#define BPF_F_NUMA_NODE (1U << 2) - -#define BPF_OBJ_NAME_LEN 16U + BPF_F_NUMA_NODE = (1U << 2), /* Flags for accessing BPF object from syscall side. */ -#define BPF_F_RDONLY (1U << 3) -#define BPF_F_WRONLY (1U << 4) + BPF_F_RDONLY = (1U << 3), + BPF_F_WRONLY = (1U << 4), /* Flag for stack_map, store build_id+offset instead of pointer */ -#define BPF_F_STACK_BUILD_ID (1U << 5) + BPF_F_STACK_BUILD_ID = (1U << 5), /* Zero-initialize hash function seed. This should only be used for testing. */ -#define BPF_F_ZERO_SEED (1U << 6) + BPF_F_ZERO_SEED = (1U << 6), /* Flags for accessing BPF object from program side. */ -#define BPF_F_RDONLY_PROG (1U << 7) -#define BPF_F_WRONLY_PROG (1U << 8) + BPF_F_RDONLY_PROG = (1U << 7), + BPF_F_WRONLY_PROG = (1U << 8), /* Clone map from listener for newly accepted socket */ -#define BPF_F_CLONE (1U << 9) + BPF_F_CLONE = (1U << 9), /* Enable memory-mapping BPF map */ -#define BPF_F_MMAPABLE (1U << 10) + BPF_F_MMAPABLE = (1U << 10), +}; /* Flags for BPF_PROG_QUERY. */ @@ -391,6 +393,8 @@ struct bpf_stack_build_id { }; }; +#define BPF_OBJ_NAME_LEN 16U + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -3045,72 +3049,100 @@ enum bpf_func_id { /* All flags used by eBPF helper functions, placed here. */ /* BPF_FUNC_skb_store_bytes flags. */ -#define BPF_F_RECOMPUTE_CSUM (1ULL << 0) -#define BPF_F_INVALIDATE_HASH (1ULL << 1) +enum { + BPF_F_RECOMPUTE_CSUM = (1ULL << 0), + BPF_F_INVALIDATE_HASH = (1ULL << 1), +}; /* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. * First 4 bits are for passing the header field size. */ -#define BPF_F_HDR_FIELD_MASK 0xfULL +enum { + BPF_F_HDR_FIELD_MASK = 0xfULL, +}; /* BPF_FUNC_l4_csum_replace flags. */ -#define BPF_F_PSEUDO_HDR (1ULL << 4) -#define BPF_F_MARK_MANGLED_0 (1ULL << 5) -#define BPF_F_MARK_ENFORCE (1ULL << 6) +enum { + BPF_F_PSEUDO_HDR = (1ULL << 4), + BPF_F_MARK_MANGLED_0 = (1ULL << 5), + BPF_F_MARK_ENFORCE = (1ULL << 6), +}; /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ -#define BPF_F_INGRESS (1ULL << 0) +enum { + BPF_F_INGRESS = (1ULL << 0), +}; /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ -#define BPF_F_TUNINFO_IPV6 (1ULL << 0) +enum { + BPF_F_TUNINFO_IPV6 = (1ULL << 0), +}; /* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */ -#define BPF_F_SKIP_FIELD_MASK 0xffULL -#define BPF_F_USER_STACK (1ULL << 8) +enum { + BPF_F_SKIP_FIELD_MASK = 0xffULL, + BPF_F_USER_STACK = (1ULL << 8), /* flags used by BPF_FUNC_get_stackid only. */ -#define BPF_F_FAST_STACK_CMP (1ULL << 9) -#define BPF_F_REUSE_STACKID (1ULL << 10) + BPF_F_FAST_STACK_CMP = (1ULL << 9), + BPF_F_REUSE_STACKID = (1ULL << 10), /* flags used by BPF_FUNC_get_stack only. */ -#define BPF_F_USER_BUILD_ID (1ULL << 11) + BPF_F_USER_BUILD_ID = (1ULL << 11), +}; /* BPF_FUNC_skb_set_tunnel_key flags. */ -#define BPF_F_ZERO_CSUM_TX (1ULL << 1) -#define BPF_F_DONT_FRAGMENT (1ULL << 2) -#define BPF_F_SEQ_NUMBER (1ULL << 3) +enum { + BPF_F_ZERO_CSUM_TX = (1ULL << 1), + BPF_F_DONT_FRAGMENT = (1ULL << 2), + BPF_F_SEQ_NUMBER = (1ULL << 3), +}; /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and * BPF_FUNC_perf_event_read_value flags. */ -#define BPF_F_INDEX_MASK 0xffffffffULL -#define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK +enum { + BPF_F_INDEX_MASK = 0xffffffffULL, + BPF_F_CURRENT_CPU = BPF_F_INDEX_MASK, /* BPF_FUNC_perf_event_output for sk_buff input context. */ -#define BPF_F_CTXLEN_MASK (0xfffffULL << 32) + BPF_F_CTXLEN_MASK = (0xfffffULL << 32), +}; /* Current network namespace */ -#define BPF_F_CURRENT_NETNS (-1L) +enum { + BPF_F_CURRENT_NETNS = (-1L), +}; /* BPF_FUNC_skb_adjust_room flags. */ -#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) +enum { + BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 = (1ULL << 1), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), + BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), + BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), +}; -#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff -#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 +enum { + BPF_ADJ_ROOM_ENCAP_L2_MASK = 0xff, + BPF_ADJ_ROOM_ENCAP_L2_SHIFT = 56, +}; -#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) -#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) -#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) -#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) #define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ BPF_ADJ_ROOM_ENCAP_L2_MASK) \ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) /* BPF_FUNC_sysctl_get_name flags. */ -#define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) +enum { + BPF_F_SYSCTL_BASE_NAME = (1ULL << 0), +}; /* BPF_FUNC_sk_storage_get flags */ -#define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0) +enum { + BPF_SK_STORAGE_GET_F_CREATE = (1ULL << 0), +}; /* BPF_FUNC_read_branch_records flags. */ -#define BPF_F_GET_BRANCH_RECORDS_SIZE (1ULL << 0) +enum { + BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), +}; /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { @@ -3529,13 +3561,14 @@ struct bpf_sock_ops { }; /* Definitions for bpf_sock_ops_cb_flags */ -#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) -#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1) -#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2) -#define BPF_SOCK_OPS_RTT_CB_FLAG (1<<3) -#define BPF_SOCK_OPS_ALL_CB_FLAGS 0xF /* Mask of all currently - * supported cb flags - */ +enum { + BPF_SOCK_OPS_RTO_CB_FLAG = (1<<0), + BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), + BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), + BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), +/* Mask of all currently supported cb flags */ + BPF_SOCK_OPS_ALL_CB_FLAGS = 0xF, +}; /* List of known BPF sock_ops operators. * New entries can only be added at the end @@ -3614,8 +3647,10 @@ enum { BPF_TCP_MAX_STATES /* Leave at the end! */ }; -#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ -#define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */ +enum { + TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ + TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ +}; struct bpf_perf_event_value { __u64 counter; @@ -3623,12 +3658,16 @@ struct bpf_perf_event_value { __u64 running; }; -#define BPF_DEVCG_ACC_MKNOD (1ULL << 0) -#define BPF_DEVCG_ACC_READ (1ULL << 1) -#define BPF_DEVCG_ACC_WRITE (1ULL << 2) +enum { + BPF_DEVCG_ACC_MKNOD = (1ULL << 0), + BPF_DEVCG_ACC_READ = (1ULL << 1), + BPF_DEVCG_ACC_WRITE = (1ULL << 2), +}; -#define BPF_DEVCG_DEV_BLOCK (1ULL << 0) -#define BPF_DEVCG_DEV_CHAR (1ULL << 1) +enum { + BPF_DEVCG_DEV_BLOCK = (1ULL << 0), + BPF_DEVCG_DEV_CHAR = (1ULL << 1), +}; struct bpf_cgroup_dev_ctx { /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */ @@ -3644,8 +3683,10 @@ struct bpf_raw_tracepoint_args { /* DIRECT: Skip the FIB rules and go to FIB table associated with device * OUTPUT: Do lookup from egress perspective; default is ingress */ -#define BPF_FIB_LOOKUP_DIRECT (1U << 0) -#define BPF_FIB_LOOKUP_OUTPUT (1U << 1) +enum { + BPF_FIB_LOOKUP_DIRECT = (1U << 0), + BPF_FIB_LOOKUP_OUTPUT = (1U << 1), +}; enum { BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ @@ -3717,9 +3758,11 @@ enum bpf_task_fd_type { BPF_FD_TYPE_URETPROBE, /* filename + offset */ }; -#define BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG (1U << 0) -#define BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL (1U << 1) -#define BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP (1U << 2) +enum { + BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG = (1U << 0), + BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL = (1U << 1), + BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP = (1U << 2), +}; struct bpf_flow_keys { __u16 nhoff; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7c689f4552dd..d6b33ea27bcc 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -73,7 +73,7 @@ struct bpf_insn { /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ struct bpf_lpm_trie_key { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ - __u8 data[0]; /* Arbitrary size */ + __u8 data[]; /* Arbitrary size */ }; struct bpf_cgroup_storage_key { @@ -325,44 +325,46 @@ enum bpf_attach_type { #define BPF_PSEUDO_CALL 1 /* flags for BPF_MAP_UPDATE_ELEM command */ -#define BPF_ANY 0 /* create new element or update existing */ -#define BPF_NOEXIST 1 /* create new element if it didn't exist */ -#define BPF_EXIST 2 /* update existing element */ -#define BPF_F_LOCK 4 /* spin_lock-ed map_lookup/map_update */ +enum { + BPF_ANY = 0, /* create new element or update existing */ + BPF_NOEXIST = 1, /* create new element if it didn't exist */ + BPF_EXIST = 2, /* update existing element */ + BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ +}; /* flags for BPF_MAP_CREATE command */ -#define BPF_F_NO_PREALLOC (1U << 0) +enum { + BPF_F_NO_PREALLOC = (1U << 0), /* Instead of having one common LRU list in the * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list * which can scale and perform better. * Note, the LRU nodes (including free nodes) cannot be moved * across different LRU lists. */ -#define BPF_F_NO_COMMON_LRU (1U << 1) + BPF_F_NO_COMMON_LRU = (1U << 1), /* Specify numa node during map creation */ -#define BPF_F_NUMA_NODE (1U << 2) - -#define BPF_OBJ_NAME_LEN 16U + BPF_F_NUMA_NODE = (1U << 2), /* Flags for accessing BPF object from syscall side. */ -#define BPF_F_RDONLY (1U << 3) -#define BPF_F_WRONLY (1U << 4) + BPF_F_RDONLY = (1U << 3), + BPF_F_WRONLY = (1U << 4), /* Flag for stack_map, store build_id+offset instead of pointer */ -#define BPF_F_STACK_BUILD_ID (1U << 5) + BPF_F_STACK_BUILD_ID = (1U << 5), /* Zero-initialize hash function seed. This should only be used for testing. */ -#define BPF_F_ZERO_SEED (1U << 6) + BPF_F_ZERO_SEED = (1U << 6), /* Flags for accessing BPF object from program side. */ -#define BPF_F_RDONLY_PROG (1U << 7) -#define BPF_F_WRONLY_PROG (1U << 8) + BPF_F_RDONLY_PROG = (1U << 7), + BPF_F_WRONLY_PROG = (1U << 8), /* Clone map from listener for newly accepted socket */ -#define BPF_F_CLONE (1U << 9) + BPF_F_CLONE = (1U << 9), /* Enable memory-mapping BPF map */ -#define BPF_F_MMAPABLE (1U << 10) + BPF_F_MMAPABLE = (1U << 10), +}; /* Flags for BPF_PROG_QUERY. */ @@ -391,6 +393,8 @@ struct bpf_stack_build_id { }; }; +#define BPF_OBJ_NAME_LEN 16U + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -3045,72 +3049,100 @@ enum bpf_func_id { /* All flags used by eBPF helper functions, placed here. */ /* BPF_FUNC_skb_store_bytes flags. */ -#define BPF_F_RECOMPUTE_CSUM (1ULL << 0) -#define BPF_F_INVALIDATE_HASH (1ULL << 1) +enum { + BPF_F_RECOMPUTE_CSUM = (1ULL << 0), + BPF_F_INVALIDATE_HASH = (1ULL << 1), +}; /* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. * First 4 bits are for passing the header field size. */ -#define BPF_F_HDR_FIELD_MASK 0xfULL +enum { + BPF_F_HDR_FIELD_MASK = 0xfULL, +}; /* BPF_FUNC_l4_csum_replace flags. */ -#define BPF_F_PSEUDO_HDR (1ULL << 4) -#define BPF_F_MARK_MANGLED_0 (1ULL << 5) -#define BPF_F_MARK_ENFORCE (1ULL << 6) +enum { + BPF_F_PSEUDO_HDR = (1ULL << 4), + BPF_F_MARK_MANGLED_0 = (1ULL << 5), + BPF_F_MARK_ENFORCE = (1ULL << 6), +}; /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ -#define BPF_F_INGRESS (1ULL << 0) +enum { + BPF_F_INGRESS = (1ULL << 0), +}; /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ -#define BPF_F_TUNINFO_IPV6 (1ULL << 0) +enum { + BPF_F_TUNINFO_IPV6 = (1ULL << 0), +}; /* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */ -#define BPF_F_SKIP_FIELD_MASK 0xffULL -#define BPF_F_USER_STACK (1ULL << 8) +enum { + BPF_F_SKIP_FIELD_MASK = 0xffULL, + BPF_F_USER_STACK = (1ULL << 8), /* flags used by BPF_FUNC_get_stackid only. */ -#define BPF_F_FAST_STACK_CMP (1ULL << 9) -#define BPF_F_REUSE_STACKID (1ULL << 10) + BPF_F_FAST_STACK_CMP = (1ULL << 9), + BPF_F_REUSE_STACKID = (1ULL << 10), /* flags used by BPF_FUNC_get_stack only. */ -#define BPF_F_USER_BUILD_ID (1ULL << 11) + BPF_F_USER_BUILD_ID = (1ULL << 11), +}; /* BPF_FUNC_skb_set_tunnel_key flags. */ -#define BPF_F_ZERO_CSUM_TX (1ULL << 1) -#define BPF_F_DONT_FRAGMENT (1ULL << 2) -#define BPF_F_SEQ_NUMBER (1ULL << 3) +enum { + BPF_F_ZERO_CSUM_TX = (1ULL << 1), + BPF_F_DONT_FRAGMENT = (1ULL << 2), + BPF_F_SEQ_NUMBER = (1ULL << 3), +}; /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and * BPF_FUNC_perf_event_read_value flags. */ -#define BPF_F_INDEX_MASK 0xffffffffULL -#define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK +enum { + BPF_F_INDEX_MASK = 0xffffffffULL, + BPF_F_CURRENT_CPU = BPF_F_INDEX_MASK, /* BPF_FUNC_perf_event_output for sk_buff input context. */ -#define BPF_F_CTXLEN_MASK (0xfffffULL << 32) + BPF_F_CTXLEN_MASK = (0xfffffULL << 32), +}; /* Current network namespace */ -#define BPF_F_CURRENT_NETNS (-1L) +enum { + BPF_F_CURRENT_NETNS = (-1L), +}; /* BPF_FUNC_skb_adjust_room flags. */ -#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) +enum { + BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 = (1ULL << 1), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), + BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), + BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), +}; -#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff -#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 +enum { + BPF_ADJ_ROOM_ENCAP_L2_MASK = 0xff, + BPF_ADJ_ROOM_ENCAP_L2_SHIFT = 56, +}; -#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) -#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) -#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) -#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) #define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ BPF_ADJ_ROOM_ENCAP_L2_MASK) \ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) /* BPF_FUNC_sysctl_get_name flags. */ -#define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) +enum { + BPF_F_SYSCTL_BASE_NAME = (1ULL << 0), +}; /* BPF_FUNC_sk_storage_get flags */ -#define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0) +enum { + BPF_SK_STORAGE_GET_F_CREATE = (1ULL << 0), +}; /* BPF_FUNC_read_branch_records flags. */ -#define BPF_F_GET_BRANCH_RECORDS_SIZE (1ULL << 0) +enum { + BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), +}; /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { @@ -3529,13 +3561,14 @@ struct bpf_sock_ops { }; /* Definitions for bpf_sock_ops_cb_flags */ -#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) -#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1) -#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2) -#define BPF_SOCK_OPS_RTT_CB_FLAG (1<<3) -#define BPF_SOCK_OPS_ALL_CB_FLAGS 0xF /* Mask of all currently - * supported cb flags - */ +enum { + BPF_SOCK_OPS_RTO_CB_FLAG = (1<<0), + BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), + BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), + BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), +/* Mask of all currently supported cb flags */ + BPF_SOCK_OPS_ALL_CB_FLAGS = 0xF, +}; /* List of known BPF sock_ops operators. * New entries can only be added at the end @@ -3614,8 +3647,10 @@ enum { BPF_TCP_MAX_STATES /* Leave at the end! */ }; -#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ -#define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */ +enum { + TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ + TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ +}; struct bpf_perf_event_value { __u64 counter; @@ -3623,12 +3658,16 @@ struct bpf_perf_event_value { __u64 running; }; -#define BPF_DEVCG_ACC_MKNOD (1ULL << 0) -#define BPF_DEVCG_ACC_READ (1ULL << 1) -#define BPF_DEVCG_ACC_WRITE (1ULL << 2) +enum { + BPF_DEVCG_ACC_MKNOD = (1ULL << 0), + BPF_DEVCG_ACC_READ = (1ULL << 1), + BPF_DEVCG_ACC_WRITE = (1ULL << 2), +}; -#define BPF_DEVCG_DEV_BLOCK (1ULL << 0) -#define BPF_DEVCG_DEV_CHAR (1ULL << 1) +enum { + BPF_DEVCG_DEV_BLOCK = (1ULL << 0), + BPF_DEVCG_DEV_CHAR = (1ULL << 1), +}; struct bpf_cgroup_dev_ctx { /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */ @@ -3644,8 +3683,10 @@ struct bpf_raw_tracepoint_args { /* DIRECT: Skip the FIB rules and go to FIB table associated with device * OUTPUT: Do lookup from egress perspective; default is ingress */ -#define BPF_FIB_LOOKUP_DIRECT (1U << 0) -#define BPF_FIB_LOOKUP_OUTPUT (1U << 1) +enum { + BPF_FIB_LOOKUP_DIRECT = (1U << 0), + BPF_FIB_LOOKUP_OUTPUT = (1U << 1), +}; enum { BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ @@ -3717,9 +3758,11 @@ enum bpf_task_fd_type { BPF_FD_TYPE_URETPROBE, /* filename + offset */ }; -#define BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG (1U << 0) -#define BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL (1U << 1) -#define BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP (1U << 2) +enum { + BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG = (1U << 0), + BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL = (1U << 1), + BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP = (1U << 2), +}; struct bpf_flow_keys { __u16 nhoff; -- cgit v1.2.3 From 88fd9e5352fe05f7fe57778293aebd4cd106960b Mon Sep 17 00:00:00 2001 From: KP Singh Date: Wed, 4 Mar 2020 20:18:47 +0100 Subject: bpf: Refactor trampoline update code As we need to introduce a third type of attachment for trampolines, the flattened signature of arch_prepare_bpf_trampoline gets even more complicated. Refactor the prog and count argument to arch_prepare_bpf_trampoline to use bpf_tramp_progs to simplify the addition and accounting for new attachment types. Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200304191853.1529-2-kpsingh@chromium.org --- arch/x86/net/bpf_jit_comp.c | 31 ++++++++++++----------- include/linux/bpf.h | 13 ++++++++-- kernel/bpf/bpf_struct_ops.c | 10 +++++++- kernel/bpf/trampoline.c | 62 +++++++++++++++++++++++++-------------------- 4 files changed, 71 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 9ba08e9abc09..15c7d28bc05c 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1362,12 +1362,12 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, } static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, - struct bpf_prog **progs, int prog_cnt, int stack_size) + struct bpf_tramp_progs *tp, int stack_size) { u8 *prog = *pprog; int cnt = 0, i; - for (i = 0; i < prog_cnt; i++) { + for (i = 0; i < tp->nr_progs; i++) { if (emit_call(&prog, __bpf_prog_enter, prog)) return -EINVAL; /* remember prog start time returned by __bpf_prog_enter */ @@ -1376,17 +1376,17 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, /* arg1: lea rdi, [rbp - stack_size] */ EMIT4(0x48, 0x8D, 0x7D, -stack_size); /* arg2: progs[i]->insnsi for interpreter */ - if (!progs[i]->jited) + if (!tp->progs[i]->jited) emit_mov_imm64(&prog, BPF_REG_2, - (long) progs[i]->insnsi >> 32, - (u32) (long) progs[i]->insnsi); + (long) tp->progs[i]->insnsi >> 32, + (u32) (long) tp->progs[i]->insnsi); /* call JITed bpf program or interpreter */ - if (emit_call(&prog, progs[i]->bpf_func, prog)) + if (emit_call(&prog, tp->progs[i]->bpf_func, prog)) return -EINVAL; /* arg1: mov rdi, progs[i] */ - emit_mov_imm64(&prog, BPF_REG_1, (long) progs[i] >> 32, - (u32) (long) progs[i]); + emit_mov_imm64(&prog, BPF_REG_1, (long) tp->progs[i] >> 32, + (u32) (long) tp->progs[i]); /* arg2: mov rsi, rbx <- start time in nsec */ emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); if (emit_call(&prog, __bpf_prog_exit, prog)) @@ -1458,12 +1458,13 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, */ int arch_prepare_bpf_trampoline(void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_prog **fentry_progs, int fentry_cnt, - struct bpf_prog **fexit_progs, int fexit_cnt, + struct bpf_tramp_progs *tprogs, void *orig_call) { int cnt = 0, nr_args = m->nr_args; int stack_size = nr_args * 8; + struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY]; + struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT]; u8 *prog; /* x86-64 supports up to 6 arguments. 7+ can be added in the future */ @@ -1492,12 +1493,12 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, save_regs(m, &prog, nr_args, stack_size); - if (fentry_cnt) - if (invoke_bpf(m, &prog, fentry_progs, fentry_cnt, stack_size)) + if (fentry->nr_progs) + if (invoke_bpf(m, &prog, fentry, stack_size)) return -EINVAL; if (flags & BPF_TRAMP_F_CALL_ORIG) { - if (fentry_cnt) + if (fentry->nr_progs) restore_regs(m, &prog, nr_args, stack_size); /* call original function */ @@ -1507,8 +1508,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); } - if (fexit_cnt) - if (invoke_bpf(m, &prog, fexit_progs, fexit_cnt, stack_size)) + if (fexit->nr_progs) + if (invoke_bpf(m, &prog, fexit, stack_size)) return -EINVAL; if (flags & BPF_TRAMP_F_RESTORE_REGS) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f13c78c6f29d..98ec10b23dbb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -433,6 +433,16 @@ struct btf_func_model { */ #define BPF_TRAMP_F_SKIP_FRAME BIT(2) +/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 + * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2 + */ +#define BPF_MAX_TRAMP_PROGS 40 + +struct bpf_tramp_progs { + struct bpf_prog *progs[BPF_MAX_TRAMP_PROGS]; + int nr_progs; +}; + /* Different use cases for BPF trampoline: * 1. replace nop at the function entry (kprobe equivalent) * flags = BPF_TRAMP_F_RESTORE_REGS @@ -455,8 +465,7 @@ struct btf_func_model { */ int arch_prepare_bpf_trampoline(void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_prog **fentry_progs, int fentry_cnt, - struct bpf_prog **fexit_progs, int fexit_cnt, + struct bpf_tramp_progs *tprogs, void *orig_call); /* these two functions are called from generated trampoline */ u64 notrace __bpf_prog_enter(void); diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index c498f0fffb40..ca5cc8cdb6eb 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -320,6 +320,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, struct bpf_struct_ops_value *uvalue, *kvalue; const struct btf_member *member; const struct btf_type *t = st_ops->type; + struct bpf_tramp_progs *tprogs = NULL; void *udata, *kdata; int prog_fd, err = 0; void *image; @@ -343,6 +344,10 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (uvalue->state || refcount_read(&uvalue->refcnt)) return -EINVAL; + tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); + if (!tprogs) + return -ENOMEM; + uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue; @@ -425,10 +430,12 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto reset_unlock; } + tprogs[BPF_TRAMP_FENTRY].progs[0] = prog; + tprogs[BPF_TRAMP_FENTRY].nr_progs = 1; err = arch_prepare_bpf_trampoline(image, st_map->image + PAGE_SIZE, &st_ops->func_models[i], 0, - &prog, 1, NULL, 0, NULL); + tprogs, NULL); if (err < 0) goto reset_unlock; @@ -469,6 +476,7 @@ reset_unlock: memset(uvalue, 0, map->value_size); memset(kvalue, 0, map->value_size); unlock: + kfree(tprogs); mutex_unlock(&st_map->lock); return err; } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 704fa787fec0..546198f6f307 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -190,40 +190,49 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) return ret; } -/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 - * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2 - */ -#define BPF_MAX_TRAMP_PROGS 40 +static struct bpf_tramp_progs * +bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total) +{ + const struct bpf_prog_aux *aux; + struct bpf_tramp_progs *tprogs; + struct bpf_prog **progs; + int kind; + + *total = 0; + tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); + if (!tprogs) + return ERR_PTR(-ENOMEM); + + for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { + tprogs[kind].nr_progs = tr->progs_cnt[kind]; + *total += tr->progs_cnt[kind]; + progs = tprogs[kind].progs; + + hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist) + *progs++ = aux->prog; + } + return tprogs; +} static int bpf_trampoline_update(struct bpf_trampoline *tr) { void *old_image = tr->image + ((tr->selector + 1) & 1) * BPF_IMAGE_SIZE/2; void *new_image = tr->image + (tr->selector & 1) * BPF_IMAGE_SIZE/2; - struct bpf_prog *progs_to_run[BPF_MAX_TRAMP_PROGS]; - int fentry_cnt = tr->progs_cnt[BPF_TRAMP_FENTRY]; - int fexit_cnt = tr->progs_cnt[BPF_TRAMP_FEXIT]; - struct bpf_prog **progs, **fentry, **fexit; + struct bpf_tramp_progs *tprogs; u32 flags = BPF_TRAMP_F_RESTORE_REGS; - struct bpf_prog_aux *aux; - int err; + int err, total; - if (fentry_cnt + fexit_cnt == 0) { + tprogs = bpf_trampoline_get_progs(tr, &total); + if (IS_ERR(tprogs)) + return PTR_ERR(tprogs); + + if (total == 0) { err = unregister_fentry(tr, old_image); tr->selector = 0; goto out; } - /* populate fentry progs */ - fentry = progs = progs_to_run; - hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FENTRY], tramp_hlist) - *progs++ = aux->prog; - - /* populate fexit progs */ - fexit = progs; - hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FEXIT], tramp_hlist) - *progs++ = aux->prog; - - if (fexit_cnt) + if (tprogs[BPF_TRAMP_FEXIT].nr_progs) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; /* Though the second half of trampoline page is unused a task could be @@ -232,12 +241,11 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) * preempted task. Hence wait for tasks to voluntarily schedule or go * to userspace. */ + synchronize_rcu_tasks(); err = arch_prepare_bpf_trampoline(new_image, new_image + BPF_IMAGE_SIZE / 2, - &tr->func.model, flags, - fentry, fentry_cnt, - fexit, fexit_cnt, + &tr->func.model, flags, tprogs, tr->func.addr); if (err < 0) goto out; @@ -252,6 +260,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) goto out; tr->selector++; out: + kfree(tprogs); return err; } @@ -409,8 +418,7 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) int __weak arch_prepare_bpf_trampoline(void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_prog **fentry_progs, int fentry_cnt, - struct bpf_prog **fexit_progs, int fexit_cnt, + struct bpf_tramp_progs *tprogs, void *orig_call) { return -ENOTSUPP; -- cgit v1.2.3 From ae24082331d9bbaae283aafbe930a8f0eb85605a Mon Sep 17 00:00:00 2001 From: KP Singh Date: Wed, 4 Mar 2020 20:18:49 +0100 Subject: bpf: Introduce BPF_MODIFY_RETURN When multiple programs are attached, each program receives the return value from the previous program on the stack and the last program provides the return value to the attached function. The fmod_ret bpf programs are run after the fentry programs and before the fexit programs. The original function is only called if all the fmod_ret programs return 0 to avoid any unintended side-effects. The success value, i.e. 0 is not currently configurable but can be made so where user-space can specify it at load time. For example: int func_to_be_attached(int a, int b) { <--- do_fentry do_fmod_ret: if (ret != 0) goto do_fexit; original_function: } <--- do_fexit The fmod_ret program attached to this function can be defined as: SEC("fmod_ret/func_to_be_attached") int BPF_PROG(func_name, int a, int b, int ret) { // This will skip the original function logic. return 1; } The first fmod_ret program is passed 0 in its return argument. Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200304191853.1529-4-kpsingh@chromium.org --- arch/x86/net/bpf_jit_comp.c | 130 +++++++++++++++++++++++++++++++++++++---- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/btf.c | 3 +- kernel/bpf/syscall.c | 1 + kernel/bpf/trampoline.c | 5 +- kernel/bpf/verifier.c | 1 + tools/include/uapi/linux/bpf.h | 1 + 8 files changed, 130 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index d6349e930b06..b1fd000feb89 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1362,7 +1362,7 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, } static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, - struct bpf_prog *p, int stack_size) + struct bpf_prog *p, int stack_size, bool mod_ret) { u8 *prog = *pprog; int cnt = 0; @@ -1383,6 +1383,13 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (emit_call(&prog, p->bpf_func, prog)) return -EINVAL; + /* BPF_TRAMP_MODIFY_RETURN trampolines can modify the return + * of the previous call which is then passed on the stack to + * the next BPF program. + */ + if (mod_ret) + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); @@ -1442,6 +1449,23 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) return 0; } +static int emit_mod_ret_check_imm8(u8 **pprog, int value) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (!is_imm8(value)) + return -EINVAL; + + if (value == 0) + EMIT2(0x85, add_2reg(0xC0, BPF_REG_0, BPF_REG_0)); + else + EMIT3(0x83, add_1reg(0xF8, BPF_REG_0), value); + + *pprog = prog; + return 0; +} + static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, struct bpf_tramp_progs *tp, int stack_size) { @@ -1449,9 +1473,49 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, u8 *prog = *pprog; for (i = 0; i < tp->nr_progs; i++) { - if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size)) + if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, false)) + return -EINVAL; + } + *pprog = prog; + return 0; +} + +static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, + struct bpf_tramp_progs *tp, int stack_size, + u8 **branches) +{ + u8 *prog = *pprog; + int i; + + /* The first fmod_ret program will receive a garbage return value. + * Set this to 0 to avoid confusing the program. + */ + emit_mov_imm32(&prog, false, BPF_REG_0, 0); + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + for (i = 0; i < tp->nr_progs; i++) { + if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, true)) return -EINVAL; + + /* Generate a branch: + * + * if (ret != 0) + * goto do_fexit; + * + * If needed this can be extended to any integer value which can + * be passed by user-space when the program is loaded. + */ + if (emit_mod_ret_check_imm8(&prog, 0)) + return -EINVAL; + + /* Save the location of the branch and Generate 6 nops + * (4 bytes for an offset and 2 bytes for the jump) These nops + * are replaced with a conditional jump once do_fexit (i.e. the + * start of the fexit invocation) is finalized. + */ + branches[i] = prog; + emit_nops(&prog, 4 + 2); } + *pprog = prog; return 0; } @@ -1521,10 +1585,12 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, struct bpf_tramp_progs *tprogs, void *orig_call) { - int cnt = 0, nr_args = m->nr_args; + int ret, i, cnt = 0, nr_args = m->nr_args; int stack_size = nr_args * 8; struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY]; struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT]; + struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN]; + u8 **branches = NULL; u8 *prog; /* x86-64 supports up to 6 arguments. 7+ can be added in the future */ @@ -1557,24 +1623,60 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, if (invoke_bpf(m, &prog, fentry, stack_size)) return -EINVAL; + if (fmod_ret->nr_progs) { + branches = kcalloc(fmod_ret->nr_progs, sizeof(u8 *), + GFP_KERNEL); + if (!branches) + return -ENOMEM; + + if (invoke_bpf_mod_ret(m, &prog, fmod_ret, stack_size, + branches)) { + ret = -EINVAL; + goto cleanup; + } + } + if (flags & BPF_TRAMP_F_CALL_ORIG) { - if (fentry->nr_progs) + if (fentry->nr_progs || fmod_ret->nr_progs) restore_regs(m, &prog, nr_args, stack_size); /* call original function */ - if (emit_call(&prog, orig_call, prog)) - return -EINVAL; + if (emit_call(&prog, orig_call, prog)) { + ret = -EINVAL; + goto cleanup; + } /* remember return value in a stack for bpf prog to access */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); } + if (fmod_ret->nr_progs) { + /* From Intel 64 and IA-32 Architectures Optimization + * Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler + * Coding Rule 11: All branch targets should be 16-byte + * aligned. + */ + emit_align(&prog, 16); + /* Update the branches saved in invoke_bpf_mod_ret with the + * aligned address of do_fexit. + */ + for (i = 0; i < fmod_ret->nr_progs; i++) + emit_cond_near_jump(&branches[i], prog, branches[i], + X86_JNE); + } + if (fexit->nr_progs) - if (invoke_bpf(m, &prog, fexit, stack_size)) - return -EINVAL; + if (invoke_bpf(m, &prog, fexit, stack_size)) { + ret = -EINVAL; + goto cleanup; + } if (flags & BPF_TRAMP_F_RESTORE_REGS) restore_regs(m, &prog, nr_args, stack_size); + /* This needs to be done regardless. If there were fmod_ret programs, + * the return value is only updated on the stack and still needs to be + * restored to R0. + */ if (flags & BPF_TRAMP_F_CALL_ORIG) /* restore original return value back into RAX */ emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); @@ -1586,9 +1688,15 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */ EMIT1(0xC3); /* ret */ /* Make sure the trampoline generation logic doesn't overflow */ - if (WARN_ON_ONCE(prog > (u8 *)image_end - BPF_INSN_SAFETY)) - return -EFAULT; - return prog - (u8 *)image; + if (WARN_ON_ONCE(prog > (u8 *)image_end - BPF_INSN_SAFETY)) { + ret = -EFAULT; + goto cleanup; + } + ret = prog - (u8 *)image; + +cleanup: + kfree(branches); + return ret; } static int emit_fallback_jump(u8 **pprog) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 98ec10b23dbb..f748b31e5888 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -474,6 +474,7 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); enum bpf_tramp_prog_type { BPF_TRAMP_FENTRY, BPF_TRAMP_FEXIT, + BPF_TRAMP_MODIFY_RETURN, BPF_TRAMP_MAX, BPF_TRAMP_REPLACE, /* more than MAX */ }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d6b33ea27bcc..40b2d9476268 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -210,6 +210,7 @@ enum bpf_attach_type { BPF_TRACE_RAW_TP, BPF_TRACE_FENTRY, BPF_TRACE_FEXIT, + BPF_MODIFY_RETURN, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 787140095e58..30841fb8b3c0 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3710,7 +3710,8 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, nr_args--; } - if (prog->expected_attach_type == BPF_TRACE_FEXIT && + if ((prog->expected_attach_type == BPF_TRACE_FEXIT || + prog->expected_attach_type == BPF_MODIFY_RETURN) && arg == nr_args) { if (!t) /* Default prog with 5 args. 6th arg is retval. */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 13de65363ba2..7ce0815793dd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2324,6 +2324,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) if (prog->expected_attach_type != BPF_TRACE_FENTRY && prog->expected_attach_type != BPF_TRACE_FEXIT && + prog->expected_attach_type != BPF_MODIFY_RETURN && prog->type != BPF_PROG_TYPE_EXT) { err = -EINVAL; goto out_put_prog; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 546198f6f307..221a17af1f81 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -232,7 +232,8 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) goto out; } - if (tprogs[BPF_TRAMP_FEXIT].nr_progs) + if (tprogs[BPF_TRAMP_FEXIT].nr_progs || + tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; /* Though the second half of trampoline page is unused a task could be @@ -269,6 +270,8 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t) switch (t) { case BPF_TRACE_FENTRY: return BPF_TRAMP_FENTRY; + case BPF_MODIFY_RETURN: + return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; default: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 289383edfc8c..2460c8e6b5be 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9950,6 +9950,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (!prog_extension) return -EINVAL; /* fallthrough */ + case BPF_MODIFY_RETURN: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: if (!btf_type_is_func(t)) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d6b33ea27bcc..40b2d9476268 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -210,6 +210,7 @@ enum bpf_attach_type { BPF_TRACE_RAW_TP, BPF_TRACE_FENTRY, BPF_TRACE_FEXIT, + BPF_MODIFY_RETURN, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From da00d2f117a08fbca262db5ea422c80a568b112b Mon Sep 17 00:00:00 2001 From: KP Singh Date: Wed, 4 Mar 2020 20:18:52 +0100 Subject: bpf: Add test ops for BPF_PROG_TYPE_TRACING The current fexit and fentry tests rely on a different program to exercise the functions they attach to. Instead of doing this, implement the test operations for tracing which will also be used for BPF_MODIFY_RETURN in a subsequent patch. Also, clean up the fexit test to use the generated skeleton. Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200304191853.1529-7-kpsingh@chromium.org --- include/linux/bpf.h | 10 ++++ kernel/trace/bpf_trace.c | 1 + net/bpf/test_run.c | 37 +++++++++--- .../selftests/bpf/prog_tests/fentry_fexit.c | 12 +--- .../testing/selftests/bpf/prog_tests/fentry_test.c | 14 ++--- .../testing/selftests/bpf/prog_tests/fexit_test.c | 69 +++++++--------------- 6 files changed, 67 insertions(+), 76 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f748b31e5888..40c53924571d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1156,6 +1156,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +int bpf_prog_test_run_tracing(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr); int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); @@ -1313,6 +1316,13 @@ static inline int bpf_prog_test_run_skb(struct bpf_prog *prog, return -ENOTSUPP; } +static inline int bpf_prog_test_run_tracing(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} + static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 07764c761073..363e0a2c75cf 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1266,6 +1266,7 @@ const struct bpf_verifier_ops tracing_verifier_ops = { }; const struct bpf_prog_ops tracing_prog_ops = { + .test_run = bpf_prog_test_run_tracing, }; static bool raw_tp_writable_prog_is_valid_access(int off, int size, diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 1cd7a1c2f8b2..3600f098e7c6 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -160,18 +160,37 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size, kfree(data); return ERR_PTR(-EFAULT); } - if (bpf_fentry_test1(1) != 2 || - bpf_fentry_test2(2, 3) != 5 || - bpf_fentry_test3(4, 5, 6) != 15 || - bpf_fentry_test4((void *)7, 8, 9, 10) != 34 || - bpf_fentry_test5(11, (void *)12, 13, 14, 15) != 65 || - bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111) { - kfree(data); - return ERR_PTR(-EFAULT); - } + return data; } +int bpf_prog_test_run_tracing(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + int err = -EFAULT; + + switch (prog->expected_attach_type) { + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + if (bpf_fentry_test1(1) != 2 || + bpf_fentry_test2(2, 3) != 5 || + bpf_fentry_test3(4, 5, 6) != 15 || + bpf_fentry_test4((void *)7, 8, 9, 10) != 34 || + bpf_fentry_test5(11, (void *)12, 13, 14, 15) != 65 || + bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111) + goto out; + break; + default: + goto out; + } + + err = 0; +out: + trace_bpf_test_finish(&err); + return err; +} + static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size) { void __user *data_in = u64_to_user_ptr(kattr->test.ctx_in); diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c index 235ac4f67f5b..83493bd5745c 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c @@ -1,22 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include -#include "test_pkt_access.skel.h" #include "fentry_test.skel.h" #include "fexit_test.skel.h" void test_fentry_fexit(void) { - struct test_pkt_access *pkt_skel = NULL; struct fentry_test *fentry_skel = NULL; struct fexit_test *fexit_skel = NULL; __u64 *fentry_res, *fexit_res; __u32 duration = 0, retval; - int err, pkt_fd, i; + int err, prog_fd, i; - pkt_skel = test_pkt_access__open_and_load(); - if (CHECK(!pkt_skel, "pkt_skel_load", "pkt_access skeleton failed\n")) - return; fentry_skel = fentry_test__open_and_load(); if (CHECK(!fentry_skel, "fentry_skel_load", "fentry skeleton failed\n")) goto close_prog; @@ -31,8 +26,8 @@ void test_fentry_fexit(void) if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err)) goto close_prog; - pkt_fd = bpf_program__fd(pkt_skel->progs.test_pkt_access); - err = bpf_prog_test_run(pkt_fd, 1, &pkt_v6, sizeof(pkt_v6), + prog_fd = bpf_program__fd(fexit_skel->progs.test1); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); CHECK(err || retval, "ipv6", "err %d errno %d retval %d duration %d\n", @@ -49,7 +44,6 @@ void test_fentry_fexit(void) } close_prog: - test_pkt_access__destroy(pkt_skel); fentry_test__destroy(fentry_skel); fexit_test__destroy(fexit_skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_test.c b/tools/testing/selftests/bpf/prog_tests/fentry_test.c index 5cc06021f27d..04ebbf1cb390 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_test.c @@ -1,20 +1,15 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include -#include "test_pkt_access.skel.h" #include "fentry_test.skel.h" void test_fentry_test(void) { - struct test_pkt_access *pkt_skel = NULL; struct fentry_test *fentry_skel = NULL; - int err, pkt_fd, i; + int err, prog_fd, i; __u32 duration = 0, retval; __u64 *result; - pkt_skel = test_pkt_access__open_and_load(); - if (CHECK(!pkt_skel, "pkt_skel_load", "pkt_access skeleton failed\n")) - return; fentry_skel = fentry_test__open_and_load(); if (CHECK(!fentry_skel, "fentry_skel_load", "fentry skeleton failed\n")) goto cleanup; @@ -23,10 +18,10 @@ void test_fentry_test(void) if (CHECK(err, "fentry_attach", "fentry attach failed: %d\n", err)) goto cleanup; - pkt_fd = bpf_program__fd(pkt_skel->progs.test_pkt_access); - err = bpf_prog_test_run(pkt_fd, 1, &pkt_v6, sizeof(pkt_v6), + prog_fd = bpf_program__fd(fentry_skel->progs.test1); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); - CHECK(err || retval, "ipv6", + CHECK(err || retval, "test_run", "err %d errno %d retval %d duration %d\n", err, errno, retval, duration); @@ -39,5 +34,4 @@ void test_fentry_test(void) cleanup: fentry_test__destroy(fentry_skel); - test_pkt_access__destroy(pkt_skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_test.c b/tools/testing/selftests/bpf/prog_tests/fexit_test.c index d2c3655dd7a3..78d7a2765c27 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_test.c @@ -1,64 +1,37 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include +#include "fexit_test.skel.h" void test_fexit_test(void) { - struct bpf_prog_load_attr attr = { - .file = "./fexit_test.o", - }; - - char prog_name[] = "fexit/bpf_fentry_testX"; - struct bpf_object *obj = NULL, *pkt_obj; - int err, pkt_fd, kfree_skb_fd, i; - struct bpf_link *link[6] = {}; - struct bpf_program *prog[6]; + struct fexit_test *fexit_skel = NULL; + int err, prog_fd, i; __u32 duration = 0, retval; - struct bpf_map *data_map; - const int zero = 0; - u64 result[6]; + __u64 *result; - err = bpf_prog_load("./test_pkt_access.o", BPF_PROG_TYPE_SCHED_CLS, - &pkt_obj, &pkt_fd); - if (CHECK(err, "prog_load sched cls", "err %d errno %d\n", err, errno)) - return; - err = bpf_prog_load_xattr(&attr, &obj, &kfree_skb_fd); - if (CHECK(err, "prog_load fail", "err %d errno %d\n", err, errno)) - goto close_prog; + fexit_skel = fexit_test__open_and_load(); + if (CHECK(!fexit_skel, "fexit_skel_load", "fexit skeleton failed\n")) + goto cleanup; - for (i = 0; i < 6; i++) { - prog_name[sizeof(prog_name) - 2] = '1' + i; - prog[i] = bpf_object__find_program_by_title(obj, prog_name); - if (CHECK(!prog[i], "find_prog", "prog %s not found\n", prog_name)) - goto close_prog; - link[i] = bpf_program__attach_trace(prog[i]); - if (CHECK(IS_ERR(link[i]), "attach_trace", "failed to link\n")) - goto close_prog; - } - data_map = bpf_object__find_map_by_name(obj, "fexit_te.bss"); - if (CHECK(!data_map, "find_data_map", "data map not found\n")) - goto close_prog; + err = fexit_test__attach(fexit_skel); + if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err)) + goto cleanup; - err = bpf_prog_test_run(pkt_fd, 1, &pkt_v6, sizeof(pkt_v6), + prog_fd = bpf_program__fd(fexit_skel->progs.test1); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); - CHECK(err || retval, "ipv6", + CHECK(err || retval, "test_run", "err %d errno %d retval %d duration %d\n", err, errno, retval, duration); - err = bpf_map_lookup_elem(bpf_map__fd(data_map), &zero, &result); - if (CHECK(err, "get_result", - "failed to get output data: %d\n", err)) - goto close_prog; - - for (i = 0; i < 6; i++) - if (CHECK(result[i] != 1, "result", "bpf_fentry_test%d failed err %ld\n", - i + 1, result[i])) - goto close_prog; + result = (__u64 *)fexit_skel->bss; + for (i = 0; i < 6; i++) { + if (CHECK(result[i] != 1, "result", + "fexit_test%d failed err %lld\n", i + 1, result[i])) + goto cleanup; + } -close_prog: - for (i = 0; i < 6; i++) - if (!IS_ERR_OR_NULL(link[i])) - bpf_link__destroy(link[i]); - bpf_object__close(obj); - bpf_object__close(pkt_obj); +cleanup: + fexit_test__destroy(fexit_skel); } -- cgit v1.2.3 From 7b70973d7edb2f005511102d5a2e0116464a46a1 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 9 Mar 2020 11:12:32 +0000 Subject: bpf: sockmap: Only check ULP for TCP sockets The sock map code checks that a socket does not have an active upper layer protocol before inserting it into the map. This requires casting via inet_csk, which isn't valid for UDP sockets. Guard checks for ULP by checking inet_sk(sk)->is_icsk first. Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200309111243.6982-2-lmb@cloudflare.com --- include/linux/skmsg.h | 8 +++++++- include/net/inet_connection_sock.h | 6 ++++++ net/core/sock_map.c | 6 ++---- net/ipv4/tcp_ulp.c | 7 ------- 4 files changed, 15 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 112765bd146d..4d3d75d63066 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -360,7 +360,13 @@ static inline void sk_psock_restore_proto(struct sock *sk, struct sk_psock *psock) { sk->sk_prot->unhash = psock->saved_unhash; - tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); + if (inet_csk_has_ulp(sk)) { + tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); + } else { + sk->sk_write_space = psock->saved_write_space; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + } } static inline void sk_psock_set_state(struct sk_psock *psock, diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 895546058a20..a3f076befa4f 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -335,4 +335,10 @@ static inline void inet_csk_inc_pingpong_cnt(struct sock *sk) if (icsk->icsk_ack.pingpong < U8_MAX) icsk->icsk_ack.pingpong++; } + +static inline bool inet_csk_has_ulp(struct sock *sk) +{ + return inet_sk(sk)->is_icsk && !!inet_csk(sk)->icsk_ulp_ops; +} + #endif /* _INET_CONNECTION_SOCK_H */ diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 2e0f465295c3..cb8f740f7949 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -384,7 +384,6 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx, struct sock *sk, u64 flags) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - struct inet_connection_sock *icsk = inet_csk(sk); struct sk_psock_link *link; struct sk_psock *psock; struct sock *osk; @@ -395,7 +394,7 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx, return -EINVAL; if (unlikely(idx >= map->max_entries)) return -E2BIG; - if (unlikely(rcu_access_pointer(icsk->icsk_ulp_data))) + if (inet_csk_has_ulp(sk)) return -EINVAL; link = sk_psock_init_link(); @@ -738,7 +737,6 @@ static int sock_hash_update_common(struct bpf_map *map, void *key, struct sock *sk, u64 flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct inet_connection_sock *icsk = inet_csk(sk); u32 key_size = map->key_size, hash; struct bpf_htab_elem *elem, *elem_new; struct bpf_htab_bucket *bucket; @@ -749,7 +747,7 @@ static int sock_hash_update_common(struct bpf_map *map, void *key, WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(flags > BPF_EXIST)) return -EINVAL; - if (unlikely(icsk->icsk_ulp_data)) + if (inet_csk_has_ulp(sk)) return -EINVAL; link = sk_psock_init_link(); diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 2703f24c5d1a..7c27aa629af1 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -105,13 +105,6 @@ void tcp_update_ulp(struct sock *sk, struct proto *proto, { struct inet_connection_sock *icsk = inet_csk(sk); - if (!icsk->icsk_ulp_ops) { - sk->sk_write_space = write_space; - /* Pairs with lockless read in sk_clone_lock() */ - WRITE_ONCE(sk->sk_prot, proto); - return; - } - if (icsk->icsk_ulp_ops->update) icsk->icsk_ulp_ops->update(sk, proto, write_space); } -- cgit v1.2.3 From 1a2e20132db7bb76dd4f97b8364bd167227dd15f Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 9 Mar 2020 11:12:33 +0000 Subject: skmsg: Update saved hooks only once Only update psock->saved_* if psock->sk_proto has not been initialized yet. This allows us to get rid of tcp_bpf_reinit_sk_prot. Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200309111243.6982-3-lmb@cloudflare.com --- include/linux/skmsg.h | 20 ++++++++++++++++---- net/ipv4/tcp_bpf.c | 16 +--------------- 2 files changed, 17 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 4d3d75d63066..2be51b7a5800 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -347,11 +347,23 @@ static inline void sk_psock_update_proto(struct sock *sk, struct sk_psock *psock, struct proto *ops) { - psock->saved_unhash = sk->sk_prot->unhash; - psock->saved_close = sk->sk_prot->close; - psock->saved_write_space = sk->sk_write_space; + /* Initialize saved callbacks and original proto only once, since this + * function may be called multiple times for a psock, e.g. when + * psock->progs.msg_parser is updated. + * + * Since we've not installed the new proto, psock is not yet in use and + * we can initialize it without synchronization. + */ + if (!psock->sk_proto) { + struct proto *orig = READ_ONCE(sk->sk_prot); + + psock->saved_unhash = orig->unhash; + psock->saved_close = orig->close; + psock->saved_write_space = sk->sk_write_space; + + psock->sk_proto = orig; + } - psock->sk_proto = sk->sk_prot; /* Pairs with lockless read in sk_clone_lock() */ WRITE_ONCE(sk->sk_prot, ops); } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 7d6e1b75d4d4..3327afa05c3d 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -637,20 +637,6 @@ static void tcp_bpf_update_sk_prot(struct sock *sk, struct sk_psock *psock) sk_psock_update_proto(sk, psock, &tcp_bpf_prots[family][config]); } -static void tcp_bpf_reinit_sk_prot(struct sock *sk, struct sk_psock *psock) -{ - int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; - int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; - - /* Reinit occurs when program types change e.g. TCP_BPF_TX is removed - * or added requiring sk_prot hook updates. We keep original saved - * hooks in this case. - * - * Pairs with lockless read in sk_clone_lock(). - */ - WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]); -} - static int tcp_bpf_assert_proto_ops(struct proto *ops) { /* In order to avoid retpoline, we make assumptions when we call @@ -670,7 +656,7 @@ void tcp_bpf_reinit(struct sock *sk) rcu_read_lock(); psock = sk_psock(sk); - tcp_bpf_reinit_sk_prot(sk, psock); + tcp_bpf_update_sk_prot(sk, psock); rcu_read_unlock(); } -- cgit v1.2.3 From d19da360ee0f3e6c1375391db1a724b66fd43312 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 9 Mar 2020 11:12:34 +0000 Subject: bpf: tcp: Move assertions into tcp_bpf_get_proto We need to ensure that sk->sk_prot uses certain callbacks, so that code that directly calls e.g. tcp_sendmsg in certain corner cases works. To avoid spurious asserts, we must to do this only if sk_psock_update_proto has not yet been called. The same invariants apply for tcp_bpf_check_v6_needs_rebuild, so move the call as well. Doing so allows us to merge tcp_bpf_init and tcp_bpf_reinit. Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200309111243.6982-4-lmb@cloudflare.com --- include/net/tcp.h | 1 - net/core/sock_map.c | 25 +++++++++---------------- net/ipv4/tcp_bpf.c | 42 ++++++++++++++++++++++-------------------- 3 files changed, 31 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 07f947cc80e6..ccf39d80b695 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2196,7 +2196,6 @@ struct sk_msg; struct sk_psock; int tcp_bpf_init(struct sock *sk); -void tcp_bpf_reinit(struct sock *sk); int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, int flags); int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, diff --git a/net/core/sock_map.c b/net/core/sock_map.c index cb8f740f7949..fafcbd22ecba 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -145,8 +145,8 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, struct sock *sk) { struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; - bool skb_progs, sk_psock_is_new = false; struct sk_psock *psock; + bool skb_progs; int ret; skb_verdict = READ_ONCE(progs->skb_verdict); @@ -191,18 +191,14 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, ret = -ENOMEM; goto out_progs; } - sk_psock_is_new = true; } if (msg_parser) psock_set_prog(&psock->progs.msg_parser, msg_parser); - if (sk_psock_is_new) { - ret = tcp_bpf_init(sk); - if (ret < 0) - goto out_drop; - } else { - tcp_bpf_reinit(sk); - } + + ret = tcp_bpf_init(sk); + if (ret < 0) + goto out_drop; write_lock_bh(&sk->sk_callback_lock); if (skb_progs && !psock->parser.enabled) { @@ -239,15 +235,12 @@ static int sock_map_link_no_progs(struct bpf_map *map, struct sock *sk) if (IS_ERR(psock)) return PTR_ERR(psock); - if (psock) { - tcp_bpf_reinit(sk); - return 0; + if (!psock) { + psock = sk_psock_init(sk, map->numa_node); + if (!psock) + return -ENOMEM; } - psock = sk_psock_init(sk, map->numa_node); - if (!psock) - return -ENOMEM; - ret = tcp_bpf_init(sk); if (ret < 0) sk_psock_put(sk, psock); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 3327afa05c3d..ed8a8f3c9afe 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -629,14 +629,6 @@ static int __init tcp_bpf_v4_build_proto(void) } core_initcall(tcp_bpf_v4_build_proto); -static void tcp_bpf_update_sk_prot(struct sock *sk, struct sk_psock *psock) -{ - int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; - int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; - - sk_psock_update_proto(sk, psock, &tcp_bpf_prots[family][config]); -} - static int tcp_bpf_assert_proto_ops(struct proto *ops) { /* In order to avoid retpoline, we make assumptions when we call @@ -648,34 +640,44 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; } -void tcp_bpf_reinit(struct sock *sk) +static struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) { - struct sk_psock *psock; + int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; + int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; - sock_owned_by_me(sk); + if (!psock->sk_proto) { + struct proto *ops = READ_ONCE(sk->sk_prot); - rcu_read_lock(); - psock = sk_psock(sk); - tcp_bpf_update_sk_prot(sk, psock); - rcu_read_unlock(); + if (tcp_bpf_assert_proto_ops(ops)) + return ERR_PTR(-EINVAL); + + tcp_bpf_check_v6_needs_rebuild(sk, ops); + } + + return &tcp_bpf_prots[family][config]; } int tcp_bpf_init(struct sock *sk) { - struct proto *ops = READ_ONCE(sk->sk_prot); struct sk_psock *psock; + struct proto *prot; sock_owned_by_me(sk); rcu_read_lock(); psock = sk_psock(sk); - if (unlikely(!psock || psock->sk_proto || - tcp_bpf_assert_proto_ops(ops))) { + if (unlikely(!psock)) { rcu_read_unlock(); return -EINVAL; } - tcp_bpf_check_v6_needs_rebuild(sk, ops); - tcp_bpf_update_sk_prot(sk, psock); + + prot = tcp_bpf_get_proto(sk, psock); + if (IS_ERR(prot)) { + rcu_read_unlock(); + return PTR_ERR(prot); + } + + sk_psock_update_proto(sk, psock, prot); rcu_read_unlock(); return 0; } -- cgit v1.2.3 From 5da0040442312a2b696748f8240243ce543a4970 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 9 Mar 2020 11:12:35 +0000 Subject: bpf: tcp: Guard declarations with CONFIG_NET_SOCK_MSG tcp_bpf.c is only included in the build if CONFIG_NET_SOCK_MSG is selected. The declaration should therefore be guarded as such. Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200309111243.6982-5-lmb@cloudflare.com --- include/net/tcp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index ccf39d80b695..ad3abeaa703e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2195,6 +2195,7 @@ void tcp_update_ulp(struct sock *sk, struct proto *p, struct sk_msg; struct sk_psock; +#ifdef CONFIG_NET_SOCK_MSG int tcp_bpf_init(struct sock *sk); int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, int flags); @@ -2202,13 +2203,12 @@ int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); -#ifdef CONFIG_NET_SOCK_MSG void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); #else static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) { } -#endif +#endif /* CONFIG_NET_SOCK_MSG */ /* Call BPF_SOCK_OPS program that returns an int. If the return value * is < 0, then the BPF op failed (for example if the loaded BPF -- cgit v1.2.3 From f747632b608f90217a4e9ebb1deba8a37612aa32 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 9 Mar 2020 11:12:36 +0000 Subject: bpf: sockmap: Move generic sockmap hooks from BPF TCP The init, close and unhash handlers from TCP sockmap are generic, and can be reused by UDP sockmap. Move the helpers into the sockmap code base and expose them. This requires tcp_bpf_get_proto and tcp_bpf_clone to be conditional on BPF_STREAM_PARSER. The moved functions are unmodified, except that sk_psock_unlink is renamed to sock_map_unlink to better match its behaviour. Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200309111243.6982-6-lmb@cloudflare.com --- include/linux/bpf.h | 4 +- include/linux/skmsg.h | 28 ------------- include/net/tcp.h | 15 ++++--- net/core/sock_map.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++--- net/ipv4/tcp_bpf.c | 84 +++------------------------------------ 5 files changed, 118 insertions(+), 119 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 40c53924571d..94a329b9da81 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1419,6 +1419,8 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) #if defined(CONFIG_BPF_STREAM_PARSER) int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, u32 which); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); +void sock_map_unhash(struct sock *sk); +void sock_map_close(struct sock *sk, long timeout); #else static inline int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, u32 which) @@ -1431,7 +1433,7 @@ static inline int sock_map_get_from_fd(const union bpf_attr *attr, { return -EINVAL; } -#endif +#endif /* CONFIG_BPF_STREAM_PARSER */ #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) void bpf_sk_reuseport_detach(struct sock *sk); diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 2be51b7a5800..8a709f63c5e5 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -323,14 +323,6 @@ static inline void sk_psock_free_link(struct sk_psock_link *link) } struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock); -#if defined(CONFIG_BPF_STREAM_PARSER) -void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link); -#else -static inline void sk_psock_unlink(struct sock *sk, - struct sk_psock_link *link) -{ -} -#endif void __sk_psock_purge_ingress_msg(struct sk_psock *psock); @@ -399,26 +391,6 @@ static inline bool sk_psock_test_state(const struct sk_psock *psock, return test_bit(bit, &psock->state); } -static inline struct sk_psock *sk_psock_get_checked(struct sock *sk) -{ - struct sk_psock *psock; - - rcu_read_lock(); - psock = sk_psock(sk); - if (psock) { - if (sk->sk_prot->recvmsg != tcp_bpf_recvmsg) { - psock = ERR_PTR(-EBUSY); - goto out; - } - - if (!refcount_inc_not_zero(&psock->refcnt)) - psock = ERR_PTR(-EBUSY); - } -out: - rcu_read_unlock(); - return psock; -} - static inline struct sk_psock *sk_psock_get(struct sock *sk) { struct sk_psock *psock; diff --git a/include/net/tcp.h b/include/net/tcp.h index ad3abeaa703e..43fa07a36fa6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2195,19 +2195,22 @@ void tcp_update_ulp(struct sock *sk, struct proto *p, struct sk_msg; struct sk_psock; +#ifdef CONFIG_BPF_STREAM_PARSER +struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); +void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); +#else +static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) +{ +} +#endif /* CONFIG_BPF_STREAM_PARSER */ + #ifdef CONFIG_NET_SOCK_MSG -int tcp_bpf_init(struct sock *sk); int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, int flags); int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); -void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); -#else -static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) -{ -} #endif /* CONFIG_NET_SOCK_MSG */ /* Call BPF_SOCK_OPS program that returns an int. If the return value diff --git a/net/core/sock_map.c b/net/core/sock_map.c index fafcbd22ecba..cb240d87e068 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -141,6 +141,51 @@ static void sock_map_unref(struct sock *sk, void *link_raw) } } +static int sock_map_init_proto(struct sock *sk) +{ + struct sk_psock *psock; + struct proto *prot; + + sock_owned_by_me(sk); + + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + return -EINVAL; + } + + prot = tcp_bpf_get_proto(sk, psock); + if (IS_ERR(prot)) { + rcu_read_unlock(); + return PTR_ERR(prot); + } + + sk_psock_update_proto(sk, psock, prot); + rcu_read_unlock(); + return 0; +} + +static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) +{ + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (psock) { + if (sk->sk_prot->recvmsg != tcp_bpf_recvmsg) { + psock = ERR_PTR(-EBUSY); + goto out; + } + + if (!refcount_inc_not_zero(&psock->refcnt)) + psock = ERR_PTR(-EBUSY); + } +out: + rcu_read_unlock(); + return psock; +} + static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, struct sock *sk) { @@ -172,7 +217,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, } } - psock = sk_psock_get_checked(sk); + psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { ret = PTR_ERR(psock); goto out_progs; @@ -196,7 +241,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, if (msg_parser) psock_set_prog(&psock->progs.msg_parser, msg_parser); - ret = tcp_bpf_init(sk); + ret = sock_map_init_proto(sk); if (ret < 0) goto out_drop; @@ -231,7 +276,7 @@ static int sock_map_link_no_progs(struct bpf_map *map, struct sock *sk) struct sk_psock *psock; int ret; - psock = sk_psock_get_checked(sk); + psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) return PTR_ERR(psock); @@ -241,7 +286,7 @@ static int sock_map_link_no_progs(struct bpf_map *map, struct sock *sk) return -ENOMEM; } - ret = tcp_bpf_init(sk); + ret = sock_map_init_proto(sk); if (ret < 0) sk_psock_put(sk, psock); return ret; @@ -1120,7 +1165,7 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, return 0; } -void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link) +static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link) { switch (link->map->map_type) { case BPF_MAP_TYPE_SOCKMAP: @@ -1133,3 +1178,54 @@ void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link) break; } } + +static void sock_map_remove_links(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_link *link; + + while ((link = sk_psock_link_pop(psock))) { + sock_map_unlink(sk, link); + sk_psock_free_link(link); + } +} + +void sock_map_unhash(struct sock *sk) +{ + void (*saved_unhash)(struct sock *sk); + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + if (sk->sk_prot->unhash) + sk->sk_prot->unhash(sk); + return; + } + + saved_unhash = psock->saved_unhash; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); + saved_unhash(sk); +} + +void sock_map_close(struct sock *sk, long timeout) +{ + void (*saved_close)(struct sock *sk, long timeout); + struct sk_psock *psock; + + lock_sock(sk); + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + release_sock(sk); + return sk->sk_prot->close(sk, timeout); + } + + saved_close = psock->saved_close; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); + release_sock(sk); + saved_close(sk, timeout); +} diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ed8a8f3c9afe..fe7b4fbc31c1 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -528,57 +528,7 @@ out_err: return copied ? copied : err; } -static void tcp_bpf_remove(struct sock *sk, struct sk_psock *psock) -{ - struct sk_psock_link *link; - - while ((link = sk_psock_link_pop(psock))) { - sk_psock_unlink(sk, link); - sk_psock_free_link(link); - } -} - -static void tcp_bpf_unhash(struct sock *sk) -{ - void (*saved_unhash)(struct sock *sk); - struct sk_psock *psock; - - rcu_read_lock(); - psock = sk_psock(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - if (sk->sk_prot->unhash) - sk->sk_prot->unhash(sk); - return; - } - - saved_unhash = psock->saved_unhash; - tcp_bpf_remove(sk, psock); - rcu_read_unlock(); - saved_unhash(sk); -} - -static void tcp_bpf_close(struct sock *sk, long timeout) -{ - void (*saved_close)(struct sock *sk, long timeout); - struct sk_psock *psock; - - lock_sock(sk); - rcu_read_lock(); - psock = sk_psock(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - release_sock(sk); - return sk->sk_prot->close(sk, timeout); - } - - saved_close = psock->saved_close; - tcp_bpf_remove(sk, psock); - rcu_read_unlock(); - release_sock(sk); - saved_close(sk, timeout); -} - +#ifdef CONFIG_BPF_STREAM_PARSER enum { TCP_BPF_IPV4, TCP_BPF_IPV6, @@ -599,8 +549,8 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], struct proto *base) { prot[TCP_BPF_BASE] = *base; - prot[TCP_BPF_BASE].unhash = tcp_bpf_unhash; - prot[TCP_BPF_BASE].close = tcp_bpf_close; + prot[TCP_BPF_BASE].unhash = sock_map_unhash; + prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read; @@ -640,7 +590,7 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; } -static struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) +struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) { int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; @@ -657,31 +607,6 @@ static struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) return &tcp_bpf_prots[family][config]; } -int tcp_bpf_init(struct sock *sk) -{ - struct sk_psock *psock; - struct proto *prot; - - sock_owned_by_me(sk); - - rcu_read_lock(); - psock = sk_psock(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - return -EINVAL; - } - - prot = tcp_bpf_get_proto(sk, psock); - if (IS_ERR(prot)) { - rcu_read_unlock(); - return PTR_ERR(prot); - } - - sk_psock_update_proto(sk, psock, prot); - rcu_read_unlock(); - return 0; -} - /* If a child got cloned from a listening socket that had tcp_bpf * protocol callbacks installed, we need to restore the callbacks to * the default ones because the child does not inherit the psock state @@ -695,3 +620,4 @@ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) if (prot == &tcp_bpf_prots[family][TCP_BPF_BASE]) newsk->sk_prot = sk->sk_prot_creator; } +#endif /* CONFIG_BPF_STREAM_PARSER */ -- cgit v1.2.3 From edc6741cc66059532ba621928e3f1b02a53a2f39 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 9 Mar 2020 11:12:38 +0000 Subject: bpf: Add sockmap hooks for UDP sockets Add basic psock hooks for UDP sockets. This allows adding and removing sockets, as well as automatic removal on unhash and close. Signed-off-by: Lorenz Bauer Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200309111243.6982-8-lmb@cloudflare.com --- MAINTAINERS | 1 + include/net/udp.h | 5 +++++ net/ipv4/Makefile | 1 + net/ipv4/udp_bpf.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+) create mode 100644 net/ipv4/udp_bpf.c (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index c23884e084be..14554bde1c06 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9370,6 +9370,7 @@ F: include/linux/skmsg.h F: net/core/skmsg.c F: net/core/sock_map.c F: net/ipv4/tcp_bpf.c +F: net/ipv4/udp_bpf.c LANTIQ / INTEL Ethernet drivers M: Hauke Mehrtens diff --git a/include/net/udp.h b/include/net/udp.h index e55d5f765807..a8fa6c0c6ded 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -503,4 +503,9 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, return segs; } +#ifdef CONFIG_BPF_STREAM_PARSER +struct sk_psock; +struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); +#endif /* BPF_STREAM_PARSER */ + #endif /* _UDP_H */ diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 9d97bace13c8..9e1a186a3671 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -61,6 +61,7 @@ obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o +obj-$(CONFIG_BPF_STREAM_PARSER) += udp_bpf.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c new file mode 100644 index 000000000000..eddd973e6575 --- /dev/null +++ b/net/ipv4/udp_bpf.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Cloudflare Ltd https://cloudflare.com */ + +#include +#include +#include + +enum { + UDP_BPF_IPV4, + UDP_BPF_IPV6, + UDP_BPF_NUM_PROTS, +}; + +static struct proto *udpv6_prot_saved __read_mostly; +static DEFINE_SPINLOCK(udpv6_prot_lock); +static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS]; + +static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base) +{ + *prot = *base; + prot->unhash = sock_map_unhash; + prot->close = sock_map_close; +} + +static void udp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops) +{ + if (sk->sk_family == AF_INET6 && + unlikely(ops != smp_load_acquire(&udpv6_prot_saved))) { + spin_lock_bh(&udpv6_prot_lock); + if (likely(ops != udpv6_prot_saved)) { + udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV6], ops); + smp_store_release(&udpv6_prot_saved, ops); + } + spin_unlock_bh(&udpv6_prot_lock); + } +} + +static int __init udp_bpf_v4_build_proto(void) +{ + udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV4], &udp_prot); + return 0; +} +core_initcall(udp_bpf_v4_build_proto); + +struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) +{ + int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6; + + if (!psock->sk_proto) + udp_bpf_check_v6_needs_rebuild(sk, READ_ONCE(sk->sk_prot)); + + return &udp_bpf_prots[family]; +} -- cgit v1.2.3 From babf3164095b0670435910340c2a1eec37757b57 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 9 Mar 2020 16:10:51 -0700 Subject: bpf: Add bpf_link_new_file that doesn't install FD Add bpf_link_new_file() API for cases when we need to ensure anon_inode is successfully created before we proceed with expensive BPF program attachment procedure, which will require equally (if not more so) expensive and potentially failing compensation detachment procedure just because anon_inode creation failed. This API allows to simplify code by ensuring first that anon_inode is created and after BPF program is attached proceed with fd_install() that can't fail. After anon_inode file is created, link can't be just kfree()'d anymore, because its destruction will be performed by deferred file_operations->release call. For this, bpf_link API required specifying two separate operations: release() and dealloc(), former performing detachment only, while the latter frees memory used by bpf_link itself. dealloc() needs to be specified, because struct bpf_link is frequently embedded into link type-specific container struct (e.g., struct bpf_raw_tp_link), so bpf_link itself doesn't know how to properly free the memory. In case when anon_inode file was successfully created, but subsequent BPF attachment failed, bpf_link needs to be marked as "defunct", so that file's release() callback will perform only memory deallocation, but no detachment. Convert raw tracepoint and tracing attachment to new API and eliminate detachment from error handling path. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200309231051.1270337-1-andriin@fb.com --- include/linux/bpf.h | 3 ++ kernel/bpf/syscall.c | 122 +++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 91 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 94a329b9da81..4fd91b7c95ea 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1070,13 +1070,16 @@ struct bpf_link; struct bpf_link_ops { void (*release)(struct bpf_link *link); + void (*dealloc)(struct bpf_link *link); }; void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, struct bpf_prog *prog); +void bpf_link_defunct(struct bpf_link *link); void bpf_link_inc(struct bpf_link *link); void bpf_link_put(struct bpf_link *link); int bpf_link_new_fd(struct bpf_link *link); +struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd); struct bpf_link *bpf_link_get_from_fd(u32 ufd); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7ce0815793dd..b2f73ecacced 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2188,6 +2188,11 @@ void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, link->prog = prog; } +void bpf_link_defunct(struct bpf_link *link) +{ + link->prog = NULL; +} + void bpf_link_inc(struct bpf_link *link) { atomic64_inc(&link->refcnt); @@ -2196,14 +2201,13 @@ void bpf_link_inc(struct bpf_link *link) /* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) { - struct bpf_prog *prog; - - /* remember prog locally, because release below will free link memory */ - prog = link->prog; - /* extra clean up and kfree of container link struct */ - link->ops->release(link); - /* no more accesing of link members after this point */ - bpf_prog_put(prog); + if (link->prog) { + /* detach BPF program, clean up used resources */ + link->ops->release(link); + bpf_prog_put(link->prog); + } + /* free bpf_link and its containing memory */ + link->ops->dealloc(link); } static void bpf_link_put_deferred(struct work_struct *work) @@ -2281,6 +2285,33 @@ int bpf_link_new_fd(struct bpf_link *link) return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); } +/* Similar to bpf_link_new_fd, create anon_inode for given bpf_link, but + * instead of immediately installing fd in fdtable, just reserve it and + * return. Caller then need to either install it with fd_install(fd, file) or + * release with put_unused_fd(fd). + * This is useful for cases when bpf_link attachment/detachment are + * complicated and expensive operations and should be delayed until all the fd + * reservation and anon_inode creation succeeds. + */ +struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd) +{ + struct file *file; + int fd; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + return ERR_PTR(fd); + + file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); + if (IS_ERR(file)) { + put_unused_fd(fd); + return file; + } + + *reserved_fd = fd; + return file; +} + struct bpf_link *bpf_link_get_from_fd(u32 ufd) { struct fd f = fdget(ufd); @@ -2305,21 +2336,27 @@ struct bpf_tracing_link { }; static void bpf_tracing_link_release(struct bpf_link *link) +{ + WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog)); +} + +static void bpf_tracing_link_dealloc(struct bpf_link *link) { struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link); - WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog)); kfree(tr_link); } static const struct bpf_link_ops bpf_tracing_link_lops = { .release = bpf_tracing_link_release, + .dealloc = bpf_tracing_link_dealloc, }; static int bpf_tracing_prog_attach(struct bpf_prog *prog) { struct bpf_tracing_link *link; + struct file *link_file; int link_fd, err; if (prog->expected_attach_type != BPF_TRACE_FENTRY && @@ -2337,20 +2374,24 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) } bpf_link_init(&link->link, &bpf_tracing_link_lops, prog); - err = bpf_trampoline_link_prog(prog); - if (err) - goto out_free_link; + link_file = bpf_link_new_file(&link->link, &link_fd); + if (IS_ERR(link_file)) { + kfree(link); + err = PTR_ERR(link_file); + goto out_put_prog; + } - link_fd = bpf_link_new_fd(&link->link); - if (link_fd < 0) { - WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog)); - err = link_fd; - goto out_free_link; + err = bpf_trampoline_link_prog(prog); + if (err) { + bpf_link_defunct(&link->link); + fput(link_file); + put_unused_fd(link_fd); + goto out_put_prog; } + + fd_install(link_fd, link_file); return link_fd; -out_free_link: - kfree(link); out_put_prog: bpf_prog_put(prog); return err; @@ -2368,19 +2409,28 @@ static void bpf_raw_tp_link_release(struct bpf_link *link) bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog); bpf_put_raw_tracepoint(raw_tp->btp); +} + +static void bpf_raw_tp_link_dealloc(struct bpf_link *link) +{ + struct bpf_raw_tp_link *raw_tp = + container_of(link, struct bpf_raw_tp_link, link); + kfree(raw_tp); } static const struct bpf_link_ops bpf_raw_tp_lops = { .release = bpf_raw_tp_link_release, + .dealloc = bpf_raw_tp_link_dealloc, }; #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd static int bpf_raw_tracepoint_open(const union bpf_attr *attr) { - struct bpf_raw_tp_link *raw_tp; + struct bpf_raw_tp_link *link; struct bpf_raw_event_map *btp; + struct file *link_file; struct bpf_prog *prog; const char *tp_name; char buf[128]; @@ -2431,28 +2481,32 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) goto out_put_prog; } - raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER); - if (!raw_tp) { + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { err = -ENOMEM; goto out_put_btp; } - bpf_link_init(&raw_tp->link, &bpf_raw_tp_lops, prog); - raw_tp->btp = btp; + bpf_link_init(&link->link, &bpf_raw_tp_lops, prog); + link->btp = btp; - err = bpf_probe_register(raw_tp->btp, prog); - if (err) - goto out_free_tp; + link_file = bpf_link_new_file(&link->link, &link_fd); + if (IS_ERR(link_file)) { + kfree(link); + err = PTR_ERR(link_file); + goto out_put_btp; + } - link_fd = bpf_link_new_fd(&raw_tp->link); - if (link_fd < 0) { - bpf_probe_unregister(raw_tp->btp, prog); - err = link_fd; - goto out_free_tp; + err = bpf_probe_register(link->btp, prog); + if (err) { + bpf_link_defunct(&link->link); + fput(link_file); + put_unused_fd(link_fd); + goto out_put_btp; } + + fd_install(link_fd, link_file); return link_fd; -out_free_tp: - kfree(raw_tp); out_put_btp: bpf_put_raw_tracepoint(btp); out_put_prog: -- cgit v1.2.3 From 1e2328e762548c7d17b7ba8ded9f409d05710dd1 Mon Sep 17 00:00:00 2001 From: Carlos Neira Date: Wed, 4 Mar 2020 17:41:55 -0300 Subject: fs/nsfs.c: Added ns_match ns_match returns true if the namespace inode and dev_t matches the ones provided by the caller. Signed-off-by: Carlos Neira Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200304204157.58695-2-cneirabustos@gmail.com --- fs/nsfs.c | 14 ++++++++++++++ include/linux/proc_ns.h | 2 ++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/fs/nsfs.c b/fs/nsfs.c index b13bfd406820..4f1205725cfe 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -247,6 +247,20 @@ out_invalid: return ERR_PTR(-EINVAL); } +/** + * ns_match() - Returns true if current namespace matches dev/ino provided. + * @ns_common: current ns + * @dev: dev_t from nsfs that will be matched against current nsfs + * @ino: ino_t from nsfs that will be matched against current nsfs + * + * Return: true if dev and ino matches the current nsfs. + */ +bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino) +{ + return (ns->inum == ino) && (nsfs_mnt->mnt_sb->s_dev == dev); +} + + static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry) { struct inode *inode = d_inode(dentry); diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 4626b1ac3b6c..adff08bfecf9 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -85,6 +85,8 @@ typedef struct ns_common *ns_get_path_helper_t(void *); extern int ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb, void *private_data); +extern bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino); + extern int ns_get_name(char *buf, size_t size, struct task_struct *task, const struct proc_ns_operations *ns_ops); extern void nsfs_init(void); -- cgit v1.2.3 From b4490c5c4e023f09b7d27c9a9d3e7ad7d09ea6bf Mon Sep 17 00:00:00 2001 From: Carlos Neira Date: Wed, 4 Mar 2020 17:41:56 -0300 Subject: bpf: Added new helper bpf_get_ns_current_pid_tgid New bpf helper bpf_get_ns_current_pid_tgid, This helper will return pid and tgid from current task which namespace matches dev_t and inode number provided, this will allows us to instrument a process inside a container. Signed-off-by: Carlos Neira Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200304204157.58695-3-cneirabustos@gmail.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 20 ++++++++++++++++++- kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 45 ++++++++++++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ scripts/bpf_helpers_doc.py | 1 + tools/include/uapi/linux/bpf.h | 20 ++++++++++++++++++- 7 files changed, 88 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4fd91b7c95ea..4ec835334a1f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1497,6 +1497,7 @@ extern const struct bpf_func_proto bpf_strtol_proto; extern const struct bpf_func_proto bpf_strtoul_proto; extern const struct bpf_func_proto bpf_tcp_sock_proto; extern const struct bpf_func_proto bpf_jiffies64_proto; +extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 40b2d9476268..15b239da775b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2914,6 +2914,19 @@ union bpf_attr { * of sizeof(struct perf_branch_entry). * * **-ENOENT** if architecture does not support branch records. + * + * int bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) + * Description + * Returns 0 on success, values for *pid* and *tgid* as seen from the current + * *namespace* will be returned in *nsdata*. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if dev and inum supplied don't match dev_t and inode number + * with nsfs of current task, or if dev conversion to dev_t lost high bits. + * + * **-ENOENT** if pidns does not exists for the current task. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3035,7 +3048,8 @@ union bpf_attr { FN(tcp_send_ack), \ FN(send_signal_thread), \ FN(jiffies64), \ - FN(read_branch_records), + FN(read_branch_records), \ + FN(get_ns_current_pid_tgid), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3829,4 +3843,8 @@ struct bpf_sockopt { __s32 retval; }; +struct bpf_pidns_info { + __u32 pid; + __u32 tgid; +}; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 973a20d49749..0f9ca46e1978 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2149,6 +2149,7 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; +const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index d8b7b110a1c5..01878db15eaf 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include "../../lib/kstrtox.h" @@ -499,3 +501,46 @@ const struct bpf_func_proto bpf_strtoul_proto = { .arg4_type = ARG_PTR_TO_LONG, }; #endif + +BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino, + struct bpf_pidns_info *, nsdata, u32, size) +{ + struct task_struct *task = current; + struct pid_namespace *pidns; + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_pidns_info))) + goto clear; + + if (unlikely((u64)(dev_t)dev != dev)) + goto clear; + + if (unlikely(!task)) + goto clear; + + pidns = task_active_pid_ns(task); + if (unlikely(!pidns)) { + err = -ENOENT; + goto clear; + } + + if (!ns_match(&pidns->ns, (dev_t)dev, ino)) + goto clear; + + nsdata->pid = task_pid_nr_ns(task, pidns); + nsdata->tgid = task_tgid_nr_ns(task, pidns); + return 0; +clear: + memset((void *)nsdata, 0, (size_t) size); + return err; +} + +const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = { + .func = bpf_get_ns_current_pid_tgid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6a490d8ce9de..b5071c7e93ca 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -843,6 +843,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_send_signal_thread_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; + case BPF_FUNC_get_ns_current_pid_tgid: + return &bpf_get_ns_current_pid_tgid_proto; default: return NULL; } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index cebed6fb5bbb..c1e2b5410faa 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -435,6 +435,7 @@ class PrinterHelpers(Printer): 'struct bpf_fib_lookup', 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', + 'struct bpf_pidns_info', 'struct bpf_sock', 'struct bpf_sock_addr', 'struct bpf_sock_ops', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 40b2d9476268..15b239da775b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2914,6 +2914,19 @@ union bpf_attr { * of sizeof(struct perf_branch_entry). * * **-ENOENT** if architecture does not support branch records. + * + * int bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) + * Description + * Returns 0 on success, values for *pid* and *tgid* as seen from the current + * *namespace* will be returned in *nsdata*. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if dev and inum supplied don't match dev_t and inode number + * with nsfs of current task, or if dev conversion to dev_t lost high bits. + * + * **-ENOENT** if pidns does not exists for the current task. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3035,7 +3048,8 @@ union bpf_attr { FN(tcp_send_ack), \ FN(send_signal_thread), \ FN(jiffies64), \ - FN(read_branch_records), + FN(read_branch_records), \ + FN(get_ns_current_pid_tgid), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3829,4 +3843,8 @@ struct bpf_sockopt { __s32 retval; }; +struct bpf_pidns_info { + __u32 pid; + __u32 tgid; +}; #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From d831ee84bfc9173eecf30dbbc2553ae81b996c60 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Fri, 6 Mar 2020 08:59:23 +0000 Subject: bpf: Add bpf_xdp_output() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce new helper that reuses existing xdp perf_event output implementation, but can be called from raw_tracepoint programs that receive 'struct xdp_buff *' as a tracepoint argument. Signed-off-by: Eelco Chaudron Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/158348514556.2239.11050972434793741444.stgit@xdp-tutorial --- include/uapi/linux/bpf.h | 26 ++++++++++- kernel/bpf/verifier.c | 4 +- kernel/trace/bpf_trace.c | 3 ++ net/core/filter.c | 16 ++++++- tools/include/uapi/linux/bpf.h | 26 ++++++++++- .../testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c | 53 ++++++++++++++++++++++ .../testing/selftests/bpf/progs/test_xdp_bpf2bpf.c | 24 ++++++++++ 7 files changed, 148 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 15b239da775b..5d01c5c7e598 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2927,6 +2927,29 @@ union bpf_attr { * * **-ENOENT** if pidns does not exists for the current task. * + * int bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct xdp_buff. + * + * This helper is similar to **bpf_perf_eventoutput**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3049,7 +3072,8 @@ union bpf_attr { FN(send_signal_thread), \ FN(jiffies64), \ FN(read_branch_records), \ - FN(get_ns_current_pid_tgid), + FN(get_ns_current_pid_tgid), \ + FN(xdp_output), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 55d376c53f7d..745f3cfdf3b2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3650,7 +3650,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_perf_event_read && func_id != BPF_FUNC_perf_event_output && func_id != BPF_FUNC_skb_output && - func_id != BPF_FUNC_perf_event_read_value) + func_id != BPF_FUNC_perf_event_read_value && + func_id != BPF_FUNC_xdp_output) goto error; break; case BPF_MAP_TYPE_STACK_TRACE: @@ -3740,6 +3741,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_perf_event_output: case BPF_FUNC_perf_event_read_value: case BPF_FUNC_skb_output: + case BPF_FUNC_xdp_output: if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) goto error; break; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b5071c7e93ca..e619eedb5919 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1145,6 +1145,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { }; extern const struct bpf_func_proto bpf_skb_output_proto; +extern const struct bpf_func_proto bpf_xdp_output_proto; BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags) @@ -1220,6 +1221,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_NET case BPF_FUNC_skb_output: return &bpf_skb_output_proto; + case BPF_FUNC_xdp_output: + return &bpf_xdp_output_proto; #endif default: return raw_tp_prog_func_proto(func_id, prog); diff --git a/net/core/filter.c b/net/core/filter.c index cd0a532db4e7..22219544410f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4061,7 +4061,8 @@ BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; - if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data))) + if (unlikely(!xdp || + xdp_size > (unsigned long)(xdp->data_end - xdp->data))) return -EFAULT; return bpf_event_output(map, flags, meta, meta_size, xdp->data, @@ -4079,6 +4080,19 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +static int bpf_xdp_output_btf_ids[5]; +const struct bpf_func_proto bpf_xdp_output_proto = { + .func = bpf_xdp_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_xdp_output_btf_ids, +}; + BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) { return skb->sk ? sock_gen_cookie(skb->sk) : 0; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 15b239da775b..5d01c5c7e598 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2927,6 +2927,29 @@ union bpf_attr { * * **-ENOENT** if pidns does not exists for the current task. * + * int bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct xdp_buff. + * + * This helper is similar to **bpf_perf_eventoutput**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3049,7 +3072,8 @@ union bpf_attr { FN(send_signal_thread), \ FN(jiffies64), \ FN(read_branch_records), \ - FN(get_ns_current_pid_tgid), + FN(get_ns_current_pid_tgid), \ + FN(xdp_output), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c index 4ba011031d4c..a0f688c37023 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c @@ -4,17 +4,51 @@ #include "test_xdp.skel.h" #include "test_xdp_bpf2bpf.skel.h" +struct meta { + int ifindex; + int pkt_len; +}; + +static void on_sample(void *ctx, int cpu, void *data, __u32 size) +{ + int duration = 0; + struct meta *meta = (struct meta *)data; + struct ipv4_packet *trace_pkt_v4 = data + sizeof(*meta); + + if (CHECK(size < sizeof(pkt_v4) + sizeof(*meta), + "check_size", "size %u < %zu\n", + size, sizeof(pkt_v4) + sizeof(*meta))) + return; + + if (CHECK(meta->ifindex != if_nametoindex("lo"), "check_meta_ifindex", + "meta->ifindex = %d\n", meta->ifindex)) + return; + + if (CHECK(meta->pkt_len != sizeof(pkt_v4), "check_meta_pkt_len", + "meta->pkt_len = %zd\n", sizeof(pkt_v4))) + return; + + if (CHECK(memcmp(trace_pkt_v4, &pkt_v4, sizeof(pkt_v4)), + "check_packet_content", "content not the same\n")) + return; + + *(bool *)ctx = true; +} + void test_xdp_bpf2bpf(void) { __u32 duration = 0, retval, size; char buf[128]; int err, pkt_fd, map_fd; + bool passed = false; struct iphdr *iph = (void *)buf + sizeof(struct ethhdr); struct iptnl_info value4 = {.family = AF_INET}; struct test_xdp *pkt_skel = NULL; struct test_xdp_bpf2bpf *ftrace_skel = NULL; struct vip key4 = {.protocol = 6, .family = AF_INET}; struct bpf_program *prog; + struct perf_buffer *pb = NULL; + struct perf_buffer_opts pb_opts = {}; /* Load XDP program to introspect */ pkt_skel = test_xdp__open_and_load(); @@ -50,6 +84,14 @@ void test_xdp_bpf2bpf(void) if (CHECK(err, "ftrace_attach", "ftrace attach failed: %d\n", err)) goto out; + /* Set up perf buffer */ + pb_opts.sample_cb = on_sample; + pb_opts.ctx = &passed; + pb = perf_buffer__new(bpf_map__fd(ftrace_skel->maps.perf_buf_map), + 1, &pb_opts); + if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb))) + goto out; + /* Run test program */ err = bpf_prog_test_run(pkt_fd, 1, &pkt_v4, sizeof(pkt_v4), buf, &size, &retval, &duration); @@ -60,6 +102,15 @@ void test_xdp_bpf2bpf(void) err, errno, retval, size)) goto out; + /* Make sure bpf_xdp_output() was triggered and it sent the expected + * data to the perf ring buffer. + */ + err = perf_buffer__poll(pb, 100); + if (CHECK(err < 0, "perf_buffer__poll", "err %d\n", err)) + goto out; + + CHECK_FAIL(!passed); + /* Verify test results */ if (CHECK(ftrace_skel->bss->test_result_fentry != if_nametoindex("lo"), "result", "fentry failed err %llu\n", @@ -70,6 +121,8 @@ void test_xdp_bpf2bpf(void) "fexit failed err %llu\n", ftrace_skel->bss->test_result_fexit); out: + if (pb) + perf_buffer__free(pb); test_xdp__destroy(pkt_skel); test_xdp_bpf2bpf__destroy(ftrace_skel); } diff --git a/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c b/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c index 42dd2fedd588..a038e827f850 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c @@ -3,6 +3,8 @@ #include #include +char _license[] SEC("license") = "GPL"; + struct net_device { /* Structure does not need to contain all entries, * as "preserve_access_index" will use BTF to fix this... @@ -27,10 +29,32 @@ struct xdp_buff { struct xdp_rxq_info *rxq; } __attribute__((preserve_access_index)); +struct meta { + int ifindex; + int pkt_len; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); +} perf_buf_map SEC(".maps"); + __u64 test_result_fentry = 0; SEC("fentry/FUNC") int BPF_PROG(trace_on_entry, struct xdp_buff *xdp) { + struct meta meta; + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + + meta.ifindex = xdp->rxq->dev->ifindex; + meta.pkt_len = data_end - data; + bpf_xdp_output(xdp, &perf_buf_map, + ((__u64) meta.pkt_len << 32) | + BPF_F_CURRENT_CPU, + &meta, sizeof(meta)); + test_result_fentry = xdp->rxq->dev->ifindex; return 0; } -- cgit v1.2.3 From 98868668367b24487c0b0b3298d7ca98409baf07 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 12 Mar 2020 17:21:28 -0700 Subject: bpf: Abstract away entire bpf_link clean up procedure Instead of requiring users to do three steps for cleaning up bpf_link, its anon_inode file, and unused fd, abstract that away into bpf_link_cleanup() helper. bpf_link_defunct() is removed, as it shouldn't be needed as an individual operation anymore. v1->v2: - keep bpf_link_cleanup() static for now (Daniel). Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200313002128.2028680-1-andriin@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 - kernel/bpf/syscall.c | 18 +++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4ec835334a1f..c2f815e9f7d0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1075,7 +1075,6 @@ struct bpf_link_ops { void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, struct bpf_prog *prog); -void bpf_link_defunct(struct bpf_link *link); void bpf_link_inc(struct bpf_link *link); void bpf_link_put(struct bpf_link *link); int bpf_link_new_fd(struct bpf_link *link); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b2f73ecacced..85567a6ea5f9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2188,9 +2188,17 @@ void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, link->prog = prog; } -void bpf_link_defunct(struct bpf_link *link) +/* Clean up bpf_link and corresponding anon_inode file and FD. After + * anon_inode is created, bpf_link can't be just kfree()'d due to deferred + * anon_inode's release() call. This helper manages marking bpf_link as + * defunct, releases anon_inode file and puts reserved FD. + */ +static void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, + int link_fd) { link->prog = NULL; + fput(link_file); + put_unused_fd(link_fd); } void bpf_link_inc(struct bpf_link *link) @@ -2383,9 +2391,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) err = bpf_trampoline_link_prog(prog); if (err) { - bpf_link_defunct(&link->link); - fput(link_file); - put_unused_fd(link_fd); + bpf_link_cleanup(&link->link, link_file, link_fd); goto out_put_prog; } @@ -2498,9 +2504,7 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) err = bpf_probe_register(link->btp, prog); if (err) { - bpf_link_defunct(&link->link); - fput(link_file); - put_unused_fd(link_fd); + bpf_link_cleanup(&link->link, link_file, link_fd); goto out_put_btp; } -- cgit v1.2.3 From 6a64037d4bf252bb8cf13917320c8e001da8997a Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Thu, 12 Mar 2020 20:55:57 +0100 Subject: bpf: Add bpf_trampoline_ name prefix for DECLARE_BPF_DISPATCHER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding bpf_trampoline_ name prefix for DECLARE_BPF_DISPATCHER, so all the dispatchers have the common name prefix. And also a small '_' cleanup for bpf_dispatcher_nopfunc function name. Signed-off-by: Björn Töpel Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200312195610.346362-3-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 21 +++++++++++---------- include/linux/filter.h | 9 ++++----- net/core/filter.c | 5 ++--- 3 files changed, 17 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c2f815e9f7d0..fe1f8b075378 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -522,7 +522,7 @@ struct bpf_dispatcher { u32 image_off; }; -static __always_inline unsigned int bpf_dispatcher_nopfunc( +static __always_inline unsigned int bpf_dispatcher_nop_func( const void *ctx, const struct bpf_insn *insnsi, unsigned int (*bpf_func)(const void *, @@ -537,7 +537,7 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog); void bpf_trampoline_put(struct bpf_trampoline *tr); #define BPF_DISPATCHER_INIT(name) { \ .mutex = __MUTEX_INITIALIZER(name.mutex), \ - .func = &name##func, \ + .func = &name##_func, \ .progs = {}, \ .num_progs = 0, \ .image = NULL, \ @@ -545,7 +545,7 @@ void bpf_trampoline_put(struct bpf_trampoline *tr); } #define DEFINE_BPF_DISPATCHER(name) \ - noinline unsigned int name##func( \ + noinline unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ unsigned int (*bpf_func)(const void *, \ @@ -553,17 +553,18 @@ void bpf_trampoline_put(struct bpf_trampoline *tr); { \ return bpf_func(ctx, insnsi); \ } \ - EXPORT_SYMBOL(name##func); \ - struct bpf_dispatcher name = BPF_DISPATCHER_INIT(name); + EXPORT_SYMBOL(bpf_dispatcher_##name##_func); \ + struct bpf_dispatcher bpf_dispatcher_##name = \ + BPF_DISPATCHER_INIT(bpf_dispatcher_##name); #define DECLARE_BPF_DISPATCHER(name) \ - unsigned int name##func( \ + unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ unsigned int (*bpf_func)(const void *, \ const struct bpf_insn *)); \ - extern struct bpf_dispatcher name; -#define BPF_DISPATCHER_FUNC(name) name##func -#define BPF_DISPATCHER_PTR(name) (&name) + extern struct bpf_dispatcher bpf_dispatcher_##name; +#define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_##name##_func +#define BPF_DISPATCHER_PTR(name) (&bpf_dispatcher_##name) void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to); struct bpf_image { @@ -589,7 +590,7 @@ static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog) static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {} #define DEFINE_BPF_DISPATCHER(name) #define DECLARE_BPF_DISPATCHER(name) -#define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_nopfunc +#define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_nop_func #define BPF_DISPATCHER_PTR(name) NULL static inline void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, diff --git a/include/linux/filter.h b/include/linux/filter.h index 43b5e455d2f5..6249679275b3 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -577,7 +577,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); ret; }) #define BPF_PROG_RUN(prog, ctx) \ - __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nopfunc) + __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func) /* * Use in preemptible and therefore migratable context to make sure that @@ -596,7 +596,7 @@ static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog, u32 ret; migrate_disable(); - ret = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nopfunc); + ret = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func); migrate_enable(); return ret; } @@ -722,7 +722,7 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog, return res; } -DECLARE_BPF_DISPATCHER(bpf_dispatcher_xdp) +DECLARE_BPF_DISPATCHER(xdp) static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, struct xdp_buff *xdp) @@ -733,8 +733,7 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, * already takes rcu_read_lock() when fetching the program, so * it's not necessary here anymore. */ - return __BPF_PROG_RUN(prog, xdp, - BPF_DISPATCHER_FUNC(bpf_dispatcher_xdp)); + return __BPF_PROG_RUN(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); } void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog); diff --git a/net/core/filter.c b/net/core/filter.c index 22219544410f..96350a743539 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8859,10 +8859,9 @@ const struct bpf_prog_ops sk_reuseport_prog_ops = { }; #endif /* CONFIG_INET */ -DEFINE_BPF_DISPATCHER(bpf_dispatcher_xdp) +DEFINE_BPF_DISPATCHER(xdp) void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) { - bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(bpf_dispatcher_xdp), - prev_prog, prog); + bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog); } -- cgit v1.2.3 From 535911c80ad4f5801700e9d827a1985bbff41519 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 Mar 2020 20:55:58 +0100 Subject: bpf: Add struct bpf_ksym Adding 'struct bpf_ksym' object that will carry the kallsym information for bpf symbol. Adding the start and end address to begin with. It will be used by bpf_prog, bpf_trampoline, bpf_dispatcher objects. The symbol_start/symbol_end values were originally used to sort bpf_prog objects. For the address displayed in /proc/kallsyms we are using prog->bpf_func value. I'm using the bpf_func value for program symbol start instead of the symbol_start, because it makes no difference for sorting bpf_prog objects and we can use it directly as an address to display it in /proc/kallsyms. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200312195610.346362-4-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ kernel/bpf/core.c | 28 ++++++++++++---------------- 2 files changed, 18 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fe1f8b075378..6ca3d5c8ccf3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -471,6 +471,11 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, u64 notrace __bpf_prog_enter(void); void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); +struct bpf_ksym { + unsigned long start; + unsigned long end; +}; + enum bpf_tramp_prog_type { BPF_TRAMP_FENTRY, BPF_TRAMP_FEXIT, @@ -653,6 +658,7 @@ struct bpf_prog_aux { u32 size_poke_tab; struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; + struct bpf_ksym ksym; const struct bpf_prog_ops *ops; struct bpf_map **used_maps; struct bpf_prog *prog; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0f9ca46e1978..e587d6306d7c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -523,18 +523,16 @@ int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_harden __read_mostly; long bpf_jit_limit __read_mostly; -static __always_inline void -bpf_get_prog_addr_region(const struct bpf_prog *prog, - unsigned long *symbol_start, - unsigned long *symbol_end) +static void +bpf_prog_ksym_set_addr(struct bpf_prog *prog) { const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog); unsigned long addr = (unsigned long)hdr; WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); - *symbol_start = addr; - *symbol_end = addr + hdr->pages * PAGE_SIZE; + prog->aux->ksym.start = (unsigned long) prog->bpf_func; + prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE; } void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) @@ -575,13 +573,10 @@ void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) static __always_inline unsigned long bpf_get_prog_addr_start(struct latch_tree_node *n) { - unsigned long symbol_start, symbol_end; const struct bpf_prog_aux *aux; aux = container_of(n, struct bpf_prog_aux, ksym_tnode); - bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); - - return symbol_start; + return aux->ksym.start; } static __always_inline bool bpf_tree_less(struct latch_tree_node *a, @@ -593,15 +588,13 @@ static __always_inline bool bpf_tree_less(struct latch_tree_node *a, static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n) { unsigned long val = (unsigned long)key; - unsigned long symbol_start, symbol_end; const struct bpf_prog_aux *aux; aux = container_of(n, struct bpf_prog_aux, ksym_tnode); - bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); - if (val < symbol_start) + if (val < aux->ksym.start) return -1; - if (val >= symbol_end) + if (val >= aux->ksym.end) return 1; return 0; @@ -649,6 +642,8 @@ void bpf_prog_kallsyms_add(struct bpf_prog *fp) !capable(CAP_SYS_ADMIN)) return; + bpf_prog_ksym_set_addr(fp); + spin_lock_bh(&bpf_lock); bpf_prog_ksym_node_add(fp->aux); spin_unlock_bh(&bpf_lock); @@ -677,14 +672,15 @@ static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { - unsigned long symbol_start, symbol_end; struct bpf_prog *prog; char *ret = NULL; rcu_read_lock(); prog = bpf_prog_kallsyms_find(addr); if (prog) { - bpf_get_prog_addr_region(prog, &symbol_start, &symbol_end); + unsigned long symbol_start = prog->aux->ksym.start; + unsigned long symbol_end = prog->aux->ksym.end; + bpf_get_prog_name(prog, sym); ret = sym; -- cgit v1.2.3 From bfea9a8574f34597581f74f792d044d38497b775 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 Mar 2020 20:55:59 +0100 Subject: bpf: Add name to struct bpf_ksym Adding name to 'struct bpf_ksym' object to carry the name of the symbol for bpf_prog, bpf_trampoline, bpf_dispatcher objects. The current benefit is that name is now generated only when the symbol is added to the list, so we don't need to generate it every time it's accessed. The future benefit is that we will have all the bpf objects symbols represented by struct bpf_ksym. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200312195610.346362-5-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ include/linux/filter.h | 6 ------ kernel/bpf/core.c | 9 ++++++--- kernel/events/core.c | 9 ++++----- 4 files changed, 12 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6ca3d5c8ccf3..047b44deb3c5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -18,6 +18,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -474,6 +475,7 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); struct bpf_ksym { unsigned long start; unsigned long end; + char name[KSYM_NAME_LEN]; }; enum bpf_tramp_prog_type { diff --git a/include/linux/filter.h b/include/linux/filter.h index 6249679275b3..9b5aa5c483cc 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1083,7 +1083,6 @@ bpf_address_lookup(unsigned long addr, unsigned long *size, void bpf_prog_kallsyms_add(struct bpf_prog *fp); void bpf_prog_kallsyms_del(struct bpf_prog *fp); -void bpf_get_prog_name(const struct bpf_prog *prog, char *sym); #else /* CONFIG_BPF_JIT */ @@ -1152,11 +1151,6 @@ static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp) { } -static inline void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) -{ - sym[0] = '\0'; -} - #endif /* CONFIG_BPF_JIT */ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e587d6306d7c..f6800c2d4b01 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -535,8 +535,10 @@ bpf_prog_ksym_set_addr(struct bpf_prog *prog) prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE; } -void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +static void +bpf_prog_ksym_set_name(struct bpf_prog *prog) { + char *sym = prog->aux->ksym.name; const char *end = sym + KSYM_NAME_LEN; const struct btf_type *type; const char *func_name; @@ -643,6 +645,7 @@ void bpf_prog_kallsyms_add(struct bpf_prog *fp) return; bpf_prog_ksym_set_addr(fp); + bpf_prog_ksym_set_name(fp); spin_lock_bh(&bpf_lock); bpf_prog_ksym_node_add(fp->aux); @@ -681,7 +684,7 @@ const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long symbol_start = prog->aux->ksym.start; unsigned long symbol_end = prog->aux->ksym.end; - bpf_get_prog_name(prog, sym); + strncpy(sym, prog->aux->ksym.name, KSYM_NAME_LEN); ret = sym; if (size) @@ -738,7 +741,7 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, if (it++ != symnum) continue; - bpf_get_prog_name(aux->prog, sym); + strncpy(sym, aux->ksym.name, KSYM_NAME_LEN); *value = (unsigned long)aux->prog->bpf_func; *type = BPF_SYM_ELF_TYPE; diff --git a/kernel/events/core.c b/kernel/events/core.c index bbdfac0182f4..9b89ef176247 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8255,23 +8255,22 @@ static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, enum perf_bpf_event_type type) { bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; - char sym[KSYM_NAME_LEN]; int i; if (prog->aux->func_cnt == 0) { - bpf_get_prog_name(prog, sym); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)prog->bpf_func, - prog->jited_len, unregister, sym); + prog->jited_len, unregister, + prog->aux->ksym.name); } else { for (i = 0; i < prog->aux->func_cnt; i++) { struct bpf_prog *subprog = prog->aux->func[i]; - bpf_get_prog_name(subprog, sym); perf_event_ksymbol( PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)subprog->bpf_func, - subprog->jited_len, unregister, sym); + subprog->jited_len, unregister, + prog->aux->ksym.name); } } } -- cgit v1.2.3 From ecb60d1c670e9b205197d8e4381b19e77bc2d834 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 Mar 2020 20:56:00 +0100 Subject: bpf: Move lnode list node to struct bpf_ksym Adding lnode list node to 'struct bpf_ksym' object, so the struct bpf_ksym itself can be chained and used in other objects like bpf_trampoline and bpf_dispatcher. Changing iterator to bpf_ksym in bpf_get_kallsym function. The ksym->start is holding the prog->bpf_func value, so it's ok to use it as value in bpf_get_kallsym. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200312195610.346362-6-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- kernel/bpf/core.c | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 047b44deb3c5..4fad2fa4135c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -476,6 +476,7 @@ struct bpf_ksym { unsigned long start; unsigned long end; char name[KSYM_NAME_LEN]; + struct list_head lnode; }; enum bpf_tramp_prog_type { @@ -659,7 +660,6 @@ struct bpf_prog_aux { struct bpf_jit_poke_descriptor *poke_tab; u32 size_poke_tab; struct latch_tree_node ksym_tnode; - struct list_head ksym_lnode; struct bpf_ksym ksym; const struct bpf_prog_ops *ops; struct bpf_map **used_maps; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f6800c2d4b01..5eb5d5bb7a95 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -97,7 +97,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); - INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode); + INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); return fp; } @@ -613,18 +613,18 @@ static struct latch_tree_root bpf_tree __cacheline_aligned; static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) { - WARN_ON_ONCE(!list_empty(&aux->ksym_lnode)); - list_add_tail_rcu(&aux->ksym_lnode, &bpf_kallsyms); + WARN_ON_ONCE(!list_empty(&aux->ksym.lnode)); + list_add_tail_rcu(&aux->ksym.lnode, &bpf_kallsyms); latch_tree_insert(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); } static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux) { - if (list_empty(&aux->ksym_lnode)) + if (list_empty(&aux->ksym.lnode)) return; latch_tree_erase(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); - list_del_rcu(&aux->ksym_lnode); + list_del_rcu(&aux->ksym.lnode); } static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) @@ -634,8 +634,8 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { - return list_empty(&fp->aux->ksym_lnode) || - fp->aux->ksym_lnode.prev == LIST_POISON2; + return list_empty(&fp->aux->ksym.lnode) || + fp->aux->ksym.lnode.prev == LIST_POISON2; } void bpf_prog_kallsyms_add(struct bpf_prog *fp) @@ -729,7 +729,7 @@ out: int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { - struct bpf_prog_aux *aux; + struct bpf_ksym *ksym; unsigned int it = 0; int ret = -ERANGE; @@ -737,13 +737,13 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, return ret; rcu_read_lock(); - list_for_each_entry_rcu(aux, &bpf_kallsyms, ksym_lnode) { + list_for_each_entry_rcu(ksym, &bpf_kallsyms, lnode) { if (it++ != symnum) continue; - strncpy(sym, aux->ksym.name, KSYM_NAME_LEN); + strncpy(sym, ksym->name, KSYM_NAME_LEN); - *value = (unsigned long)aux->prog->bpf_func; + *value = ksym->start; *type = BPF_SYM_ELF_TYPE; ret = 0; -- cgit v1.2.3 From ca4424c920f574b7246ff1b6d83cfdfd709e42c8 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 Mar 2020 20:56:01 +0100 Subject: bpf: Move ksym_tnode to bpf_ksym Moving ksym_tnode list node to 'struct bpf_ksym' object, so the symbol itself can be chained and used in other objects like bpf_trampoline and bpf_dispatcher. We need bpf_ksym object to be linked both in bpf_kallsyms via lnode for /proc/kallsyms and in bpf_tree via tnode for bpf address lookup functions like __bpf_address_lookup or bpf_prog_kallsyms_find. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200312195610.346362-7-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- kernel/bpf/core.c | 24 ++++++++++-------------- 2 files changed, 11 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4fad2fa4135c..68d66b0078df 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -477,6 +477,7 @@ struct bpf_ksym { unsigned long end; char name[KSYM_NAME_LEN]; struct list_head lnode; + struct latch_tree_node tnode; }; enum bpf_tramp_prog_type { @@ -659,7 +660,6 @@ struct bpf_prog_aux { void *jit_data; /* JIT specific data. arch dependent */ struct bpf_jit_poke_descriptor *poke_tab; u32 size_poke_tab; - struct latch_tree_node ksym_tnode; struct bpf_ksym ksym; const struct bpf_prog_ops *ops; struct bpf_map **used_maps; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5eb5d5bb7a95..ab1846c34167 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -572,31 +572,27 @@ bpf_prog_ksym_set_name(struct bpf_prog *prog) *sym = 0; } -static __always_inline unsigned long -bpf_get_prog_addr_start(struct latch_tree_node *n) +static unsigned long bpf_get_ksym_start(struct latch_tree_node *n) { - const struct bpf_prog_aux *aux; - - aux = container_of(n, struct bpf_prog_aux, ksym_tnode); - return aux->ksym.start; + return container_of(n, struct bpf_ksym, tnode)->start; } static __always_inline bool bpf_tree_less(struct latch_tree_node *a, struct latch_tree_node *b) { - return bpf_get_prog_addr_start(a) < bpf_get_prog_addr_start(b); + return bpf_get_ksym_start(a) < bpf_get_ksym_start(b); } static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n) { unsigned long val = (unsigned long)key; - const struct bpf_prog_aux *aux; + const struct bpf_ksym *ksym; - aux = container_of(n, struct bpf_prog_aux, ksym_tnode); + ksym = container_of(n, struct bpf_ksym, tnode); - if (val < aux->ksym.start) + if (val < ksym->start) return -1; - if (val >= aux->ksym.end) + if (val >= ksym->end) return 1; return 0; @@ -615,7 +611,7 @@ static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) { WARN_ON_ONCE(!list_empty(&aux->ksym.lnode)); list_add_tail_rcu(&aux->ksym.lnode, &bpf_kallsyms); - latch_tree_insert(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); + latch_tree_insert(&aux->ksym.tnode, &bpf_tree, &bpf_tree_ops); } static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux) @@ -623,7 +619,7 @@ static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux) if (list_empty(&aux->ksym.lnode)) return; - latch_tree_erase(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); + latch_tree_erase(&aux->ksym.tnode, &bpf_tree, &bpf_tree_ops); list_del_rcu(&aux->ksym.lnode); } @@ -668,7 +664,7 @@ static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); return n ? - container_of(n, struct bpf_prog_aux, ksym_tnode)->prog : + container_of(n, struct bpf_prog_aux, ksym.tnode)->prog : NULL; } -- cgit v1.2.3 From cbd76f8d5ac9c4e99c4ffe5e39a1e907cdf5a76f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 Mar 2020 20:56:03 +0100 Subject: bpf: Add prog flag to struct bpf_ksym object Adding 'prog' bool flag to 'struct bpf_ksym' to mark that this object belongs to bpf_prog object. This change allows having bpf_prog objects together with other types (trampolines and dispatchers) in the single bpf_tree. It's used when searching for bpf_prog exception tables by the bpf_prog_ksym_find function, where we need to get the bpf_prog pointer. >From now we can safely add bpf_ksym support for trampoline or dispatcher objects, because we can differentiate them from bpf_prog objects. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200312195610.346362-9-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/core.c | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 68d66b0078df..a0cef664c1a9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -478,6 +478,7 @@ struct bpf_ksym { char name[KSYM_NAME_LEN]; struct list_head lnode; struct latch_tree_node tnode; + bool prog; }; enum bpf_tramp_prog_type { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index cd380f7f015c..7516cbc65996 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -642,6 +642,7 @@ void bpf_prog_kallsyms_add(struct bpf_prog *fp) bpf_prog_ksym_set_addr(fp); bpf_prog_ksym_set_name(fp); + fp->aux->ksym.prog = true; spin_lock_bh(&bpf_lock); bpf_prog_ksym_node_add(fp->aux); @@ -658,16 +659,6 @@ void bpf_prog_kallsyms_del(struct bpf_prog *fp) spin_unlock_bh(&bpf_lock); } -static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) -{ - struct latch_tree_node *n; - - n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); - return n ? - container_of(n, struct bpf_prog_aux, ksym.tnode)->prog : - NULL; -} - static struct bpf_ksym *bpf_ksym_find(unsigned long addr) { struct latch_tree_node *n; @@ -712,13 +703,22 @@ bool is_bpf_text_address(unsigned long addr) return ret; } +static struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) +{ + struct bpf_ksym *ksym = bpf_ksym_find(addr); + + return ksym && ksym->prog ? + container_of(ksym, struct bpf_prog_aux, ksym)->prog : + NULL; +} + const struct exception_table_entry *search_bpf_extables(unsigned long addr) { const struct exception_table_entry *e = NULL; struct bpf_prog *prog; rcu_read_lock(); - prog = bpf_prog_kallsyms_find(addr); + prog = bpf_prog_ksym_find(addr); if (!prog) goto out; if (!prog->aux->num_exentries) -- cgit v1.2.3 From dba122fb5e122e8e07e2f11cdebc10ba4f425cf7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 Mar 2020 20:56:04 +0100 Subject: bpf: Add bpf_ksym_add/del functions Separating /proc/kallsyms add/del code and adding bpf_ksym_add/del functions for that. Moving bpf_prog_ksym_node_add/del functions to __bpf_ksym_add/del and changing their argument to 'struct bpf_ksym' object. This way we can call them for other bpf objects types like trampoline and dispatcher. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200312195610.346362-10-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ kernel/bpf/core.c | 33 +++++++++++++++++++-------------- 2 files changed, 22 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a0cef664c1a9..ec1de88b8487 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -584,6 +584,9 @@ struct bpf_image { #define BPF_IMAGE_SIZE (PAGE_SIZE - sizeof(struct bpf_image)) bool is_bpf_image_address(unsigned long address); void *bpf_image_alloc(void); +/* Called only from JIT-enabled code, so there's no need for stubs. */ +void bpf_ksym_add(struct bpf_ksym *ksym); +void bpf_ksym_del(struct bpf_ksym *ksym); #else static inline struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7516cbc65996..914f3463aa41 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -607,20 +607,29 @@ static DEFINE_SPINLOCK(bpf_lock); static LIST_HEAD(bpf_kallsyms); static struct latch_tree_root bpf_tree __cacheline_aligned; -static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) +void bpf_ksym_add(struct bpf_ksym *ksym) { - WARN_ON_ONCE(!list_empty(&aux->ksym.lnode)); - list_add_tail_rcu(&aux->ksym.lnode, &bpf_kallsyms); - latch_tree_insert(&aux->ksym.tnode, &bpf_tree, &bpf_tree_ops); + spin_lock_bh(&bpf_lock); + WARN_ON_ONCE(!list_empty(&ksym->lnode)); + list_add_tail_rcu(&ksym->lnode, &bpf_kallsyms); + latch_tree_insert(&ksym->tnode, &bpf_tree, &bpf_tree_ops); + spin_unlock_bh(&bpf_lock); } -static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux) +static void __bpf_ksym_del(struct bpf_ksym *ksym) { - if (list_empty(&aux->ksym.lnode)) + if (list_empty(&ksym->lnode)) return; - latch_tree_erase(&aux->ksym.tnode, &bpf_tree, &bpf_tree_ops); - list_del_rcu(&aux->ksym.lnode); + latch_tree_erase(&ksym->tnode, &bpf_tree, &bpf_tree_ops); + list_del_rcu(&ksym->lnode); +} + +void bpf_ksym_del(struct bpf_ksym *ksym) +{ + spin_lock_bh(&bpf_lock); + __bpf_ksym_del(ksym); + spin_unlock_bh(&bpf_lock); } static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) @@ -644,9 +653,7 @@ void bpf_prog_kallsyms_add(struct bpf_prog *fp) bpf_prog_ksym_set_name(fp); fp->aux->ksym.prog = true; - spin_lock_bh(&bpf_lock); - bpf_prog_ksym_node_add(fp->aux); - spin_unlock_bh(&bpf_lock); + bpf_ksym_add(&fp->aux->ksym); } void bpf_prog_kallsyms_del(struct bpf_prog *fp) @@ -654,9 +661,7 @@ void bpf_prog_kallsyms_del(struct bpf_prog *fp) if (!bpf_prog_kallsyms_candidate(fp)) return; - spin_lock_bh(&bpf_lock); - bpf_prog_ksym_node_del(fp->aux); - spin_unlock_bh(&bpf_lock); + bpf_ksym_del(&fp->aux->ksym); } static struct bpf_ksym *bpf_ksym_find(unsigned long addr) -- cgit v1.2.3 From a108f7dcfa010e3da825af90d77ac0a6a0240992 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 Mar 2020 20:56:05 +0100 Subject: bpf: Add trampolines to kallsyms Adding trampolines to kallsyms. It's displayed as bpf_trampoline_ [bpf] where ID is the BTF id of the trampoline function. Adding bpf_image_ksym_add/del functions that setup the start/end values and call KSYMBOL perf events handlers. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200312195610.346362-11-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ kernel/bpf/trampoline.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ec1de88b8487..083860be1944 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -513,6 +513,7 @@ struct bpf_trampoline { /* Executable image of trampoline */ void *image; u64 selector; + struct bpf_ksym ksym; }; #define BPF_DISPATCHER_MAX 48 /* Fits in 2048B */ @@ -585,6 +586,8 @@ struct bpf_image { bool is_bpf_image_address(unsigned long address); void *bpf_image_alloc(void); /* Called only from JIT-enabled code, so there's no need for stubs. */ +void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); +void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); #else diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 221a17af1f81..36549c9afec4 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -5,6 +5,7 @@ #include #include #include +#include /* dummy _ops. The verifier will operate on target program's ops. */ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -96,6 +97,30 @@ bool is_bpf_image_address(unsigned long addr) return ret; } +void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym) +{ + ksym->start = (unsigned long) data; + ksym->end = ksym->start + BPF_IMAGE_SIZE; + bpf_ksym_add(ksym); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, + BPF_IMAGE_SIZE, false, ksym->name); +} + +void bpf_image_ksym_del(struct bpf_ksym *ksym) +{ + bpf_ksym_del(ksym); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, + BPF_IMAGE_SIZE, true, ksym->name); +} + +static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr) +{ + struct bpf_ksym *ksym = &tr->ksym; + + snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", tr->key); + bpf_image_ksym_add(tr->image, ksym); +} + struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { struct bpf_trampoline *tr; @@ -131,6 +156,8 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) for (i = 0; i < BPF_TRAMP_MAX; i++) INIT_HLIST_HEAD(&tr->progs_hlist[i]); tr->image = image; + INIT_LIST_HEAD_RCU(&tr->ksym.lnode); + bpf_trampoline_ksym_add(tr); out: mutex_unlock(&trampoline_mutex); return tr; @@ -368,6 +395,7 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) goto out; if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) goto out; + bpf_image_ksym_del(&tr->ksym); image = container_of(tr->image, struct bpf_image, data); latch_tree_erase(&image->tnode, &image_tree, &image_tree_ops); /* wait for tasks to get out of trampoline before freeing it */ -- cgit v1.2.3 From 517b75e44c7be9c776aa5f7beaa85baff3868f80 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 Mar 2020 20:56:06 +0100 Subject: bpf: Add dispatchers to kallsyms Adding dispatchers to kallsyms. It's displayed as bpf_dispatcher_ where NAME is the name of dispatcher. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200312195610.346362-12-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 19 ++++++++++++------- kernel/bpf/dispatcher.c | 1 + 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 083860be1944..86cacb54ba23 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -531,6 +531,7 @@ struct bpf_dispatcher { int num_progs; void *image; u32 image_off; + struct bpf_ksym ksym; }; static __always_inline unsigned int bpf_dispatcher_nop_func( @@ -546,13 +547,17 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key); int bpf_trampoline_link_prog(struct bpf_prog *prog); int bpf_trampoline_unlink_prog(struct bpf_prog *prog); void bpf_trampoline_put(struct bpf_trampoline *tr); -#define BPF_DISPATCHER_INIT(name) { \ - .mutex = __MUTEX_INITIALIZER(name.mutex), \ - .func = &name##_func, \ - .progs = {}, \ - .num_progs = 0, \ - .image = NULL, \ - .image_off = 0 \ +#define BPF_DISPATCHER_INIT(_name) { \ + .mutex = __MUTEX_INITIALIZER(_name.mutex), \ + .func = &_name##_func, \ + .progs = {}, \ + .num_progs = 0, \ + .image = NULL, \ + .image_off = 0, \ + .ksym = { \ + .name = #_name, \ + .lnode = LIST_HEAD_INIT(_name.ksym.lnode), \ + }, \ } #define DEFINE_BPF_DISPATCHER(name) \ diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index b3e5b214fed8..a2679bae9e73 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -143,6 +143,7 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, d->image = bpf_image_alloc(); if (!d->image) goto out; + bpf_image_ksym_add(d->image, &d->ksym); } prev_num_progs = d->num_progs; -- cgit v1.2.3 From 7ac88eba185b4d0e06a71678e54bc092edcd3af3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 Mar 2020 20:56:07 +0100 Subject: bpf: Remove bpf_image tree Now that we have all the objects (bpf_prog, bpf_trampoline, bpf_dispatcher) linked in bpf_tree, there's no need to have separate bpf_image tree for images. Reverting the bpf_image tree together with struct bpf_image, because it's no longer needed. Also removing bpf_image_alloc function and adding the original bpf_jit_alloc_exec_page interface instead. The kernel_text_address function can now rely only on is_bpf_text_address, because it checks the bpf_tree that contains all the objects. Keeping bpf_image_ksym_add and bpf_image_ksym_del because they are useful wrappers with perf's ksymbol interface calls. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200312195610.346362-13-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 +---- kernel/bpf/dispatcher.c | 4 +-- kernel/bpf/trampoline.c | 83 ++++++------------------------------------------- kernel/extable.c | 2 -- 4 files changed, 13 insertions(+), 84 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 86cacb54ba23..bdb981c204fa 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -583,14 +583,8 @@ void bpf_trampoline_put(struct bpf_trampoline *tr); #define BPF_DISPATCHER_PTR(name) (&bpf_dispatcher_##name) void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to); -struct bpf_image { - struct latch_tree_node tnode; - unsigned char data[]; -}; -#define BPF_IMAGE_SIZE (PAGE_SIZE - sizeof(struct bpf_image)) -bool is_bpf_image_address(unsigned long address); -void *bpf_image_alloc(void); /* Called only from JIT-enabled code, so there's no need for stubs. */ +void *bpf_jit_alloc_exec_page(void); void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index a2679bae9e73..2444bd15cc2d 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -113,7 +113,7 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) noff = 0; } else { old = d->image + d->image_off; - noff = d->image_off ^ (BPF_IMAGE_SIZE / 2); + noff = d->image_off ^ (PAGE_SIZE / 2); } new = d->num_progs ? d->image + noff : NULL; @@ -140,7 +140,7 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, mutex_lock(&d->mutex); if (!d->image) { - d->image = bpf_image_alloc(); + d->image = bpf_jit_alloc_exec_page(); if (!d->image) goto out; bpf_image_ksym_add(d->image, &d->ksym); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 36549c9afec4..f42f700c1d28 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -18,12 +18,11 @@ const struct bpf_prog_ops bpf_extension_prog_ops = { #define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS) static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; -static struct latch_tree_root image_tree __cacheline_aligned; -/* serializes access to trampoline_table and image_tree */ +/* serializes access to trampoline_table */ static DEFINE_MUTEX(trampoline_mutex); -static void *bpf_jit_alloc_exec_page(void) +void *bpf_jit_alloc_exec_page(void) { void *image; @@ -39,78 +38,20 @@ static void *bpf_jit_alloc_exec_page(void) return image; } -static __always_inline bool image_tree_less(struct latch_tree_node *a, - struct latch_tree_node *b) -{ - struct bpf_image *ia = container_of(a, struct bpf_image, tnode); - struct bpf_image *ib = container_of(b, struct bpf_image, tnode); - - return ia < ib; -} - -static __always_inline int image_tree_comp(void *addr, struct latch_tree_node *n) -{ - void *image = container_of(n, struct bpf_image, tnode); - - if (addr < image) - return -1; - if (addr >= image + PAGE_SIZE) - return 1; - - return 0; -} - -static const struct latch_tree_ops image_tree_ops = { - .less = image_tree_less, - .comp = image_tree_comp, -}; - -static void *__bpf_image_alloc(bool lock) -{ - struct bpf_image *image; - - image = bpf_jit_alloc_exec_page(); - if (!image) - return NULL; - - if (lock) - mutex_lock(&trampoline_mutex); - latch_tree_insert(&image->tnode, &image_tree, &image_tree_ops); - if (lock) - mutex_unlock(&trampoline_mutex); - return image->data; -} - -void *bpf_image_alloc(void) -{ - return __bpf_image_alloc(true); -} - -bool is_bpf_image_address(unsigned long addr) -{ - bool ret; - - rcu_read_lock(); - ret = latch_tree_find((void *) addr, &image_tree, &image_tree_ops) != NULL; - rcu_read_unlock(); - - return ret; -} - void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym) { ksym->start = (unsigned long) data; - ksym->end = ksym->start + BPF_IMAGE_SIZE; + ksym->end = ksym->start + PAGE_SIZE; bpf_ksym_add(ksym); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, - BPF_IMAGE_SIZE, false, ksym->name); + PAGE_SIZE, false, ksym->name); } void bpf_image_ksym_del(struct bpf_ksym *ksym) { bpf_ksym_del(ksym); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, - BPF_IMAGE_SIZE, true, ksym->name); + PAGE_SIZE, true, ksym->name); } static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr) @@ -141,7 +82,7 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) goto out; /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */ - image = __bpf_image_alloc(false); + image = bpf_jit_alloc_exec_page(); if (!image) { kfree(tr); tr = NULL; @@ -243,8 +184,8 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total) static int bpf_trampoline_update(struct bpf_trampoline *tr) { - void *old_image = tr->image + ((tr->selector + 1) & 1) * BPF_IMAGE_SIZE/2; - void *new_image = tr->image + (tr->selector & 1) * BPF_IMAGE_SIZE/2; + void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2; + void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2; struct bpf_tramp_progs *tprogs; u32 flags = BPF_TRAMP_F_RESTORE_REGS; int err, total; @@ -272,7 +213,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) synchronize_rcu_tasks(); - err = arch_prepare_bpf_trampoline(new_image, new_image + BPF_IMAGE_SIZE / 2, + err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, &tr->func.model, flags, tprogs, tr->func.addr); if (err < 0) @@ -383,8 +324,6 @@ out: void bpf_trampoline_put(struct bpf_trampoline *tr) { - struct bpf_image *image; - if (!tr) return; mutex_lock(&trampoline_mutex); @@ -396,11 +335,9 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) goto out; bpf_image_ksym_del(&tr->ksym); - image = container_of(tr->image, struct bpf_image, data); - latch_tree_erase(&image->tnode, &image_tree, &image_tree_ops); /* wait for tasks to get out of trampoline before freeing it */ synchronize_rcu_tasks(); - bpf_jit_free_exec(image); + bpf_jit_free_exec(tr->image); hlist_del(&tr->hlist); kfree(tr); out: diff --git a/kernel/extable.c b/kernel/extable.c index a0024f27d3a1..7681f87e89dd 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -149,8 +149,6 @@ int kernel_text_address(unsigned long addr) goto out; if (is_bpf_text_address(addr)) goto out; - if (is_bpf_image_address(addr)) - goto out; ret = 0; out: if (no_rcu) -- cgit v1.2.3