author    David S. Miller <davem@davemloft.net>  2019-04-22 21:35:55 -0700
committer David S. Miller <davem@davemloft.net>  2019-04-22 21:35:55 -0700
commit    2843ba2ec75948e274d2c4f0a9390980e68a6461 (patch)
tree      b2fde70246956609ac43ad62026a4fdb48e33fbc /kernel
parent    be659b8d3c79afc54e087ebf8d849685d7b0d395 (diff)
parent    f79b464fd6b56b6256de43bc4c7d5968c0e52968 (diff)
download  linux-2843ba2ec75948e274d2c4f0a9390980e68a6461.tar.bz2
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Alexei Starovoitov says:
====================
pull-request: bpf-next 2019-04-22
The following pull-request contains BPF updates for your *net-next* tree.
The main changes are:
1) allow stack/queue helpers from more bpf program types, from Alban.
2) allow parallel verification of root bpf programs, from Alexei.
3) introduce bpf sysctl hook for trusted root cases, from Andrey.
4) recognize var/datasec in btf deduplication, from Andrii.
5) cpumap performance optimizations, from Jesper.
6) verifier prep for alu32 optimization, from Jiong.
7) libbpf xsk cleanup, from Magnus.
8) other various fixes and cleanups.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
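
Item 3 above is the most visible new feature in this pull: BPF_PROG_TYPE_CGROUP_SYSCTL programs attach to a cgroup and decide, per access, whether a sysctl read or write is allowed. To make the kernel/bpf/cgroup.c changes below easier to follow, here is a minimal sketch of such a program. It assumes the UAPI added by this series (struct bpf_sysctl, bpf_sysctl_get_name(), bpf_sysctl_get_new_value(), bpf_strtoul(), BPF_F_SYSCTL_BASE_NAME) plus the bpf_helpers.h wrappers from tools/testing/selftests/bpf; the program and its policy are illustrative, not part of this merge.

    /* Illustrative only: reject writes of out-of-range values to tcp_*
     * sysctls.  Returning 1 allows the access; anything else makes
     * __cgroup_bpf_run_filter_sysctl() fail it with -EPERM.
     */
    #include <linux/bpf.h>
    #include "bpf_helpers.h"

    SEC("cgroup/sysctl")
    int sysctl_guard(struct bpf_sysctl *ctx)
    {
        char name[32];
        char value[16];
        unsigned long new_val;

        /* Base name only, e.g. "tcp_mem" instead of "net/ipv4/tcp_mem" */
        if (bpf_sysctl_get_name(ctx, name, sizeof(name),
                                BPF_F_SYSCTL_BASE_NAME) < 0)
            return 0;

        /* Hypothetical policy: only police "tcp_*" knobs. */
        if (name[0] != 't' || name[1] != 'c' || name[2] != 'p' ||
            name[3] != '_')
            return 1;

        if (!ctx->write)
            return 1; /* never restrict reads */

        /* The value being written arrives as text; parse it with the
         * bpf_strtoul() helper that this series adds alongside the hook.
         */
        if (bpf_sysctl_get_new_value(ctx, value, sizeof(value)) < 0)
            return 0;
        if (bpf_strtoul(value, sizeof(value), 0, &new_val) < 0)
            return 0;

        return new_val <= 4096 ? 1 : 0; /* made-up upper bound */
    }

    char _license[] SEC("license") = "GPL";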
Diffstat (limited to 'kernel')
 kernel/bpf/cgroup.c      | 370
 kernel/bpf/cpumap.c      |  53
 kernel/bpf/helpers.c     | 131
 kernel/bpf/syscall.c     |   7
 kernel/bpf/verifier.c    | 134
 kernel/trace/bpf_trace.c |   6
 6 files changed, 641 insertions(+), 60 deletions(-)
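
Attachment reuses the existing cgroup-bpf plumbing: the kernel/bpf/syscall.c hunk below simply teaches BPF_PROG_ATTACH, BPF_PROG_DETACH and BPF_PROG_QUERY about the new BPF_CGROUP_SYSCTL attach type. From user space that looks like any other cgroup attachment; a sketch using libbpf's bpf_prog_attach() wrapper (the helper function and path are ours, and prog_fd is assumed to come from loading a BPF_PROG_TYPE_CGROUP_SYSCTL program):

    /* Illustrative only: attach an already-loaded cgroup-sysctl program
     * to a cgroup-v2 directory, e.g. "/sys/fs/cgroup/unified/mygroup".
     */
    #include <fcntl.h>
    #include <unistd.h>
    #include <bpf/bpf.h>

    int attach_sysctl_prog(int prog_fd, const char *cgroup_path)
    {
        int cg_fd, err;

        cg_fd = open(cgroup_path, O_RDONLY | O_DIRECTORY);
        if (cg_fd < 0)
            return -1;

        /* flags == 0: plain attach, no BPF_F_ALLOW_MULTI/OVERRIDE */
        err = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_SYSCTL, 0);
        close(cg_fd);
        return err;
    }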
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 4e807973aa80..fcde0f7b2585 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -11,7 +11,10 @@
 #include <linux/kernel.h>
 #include <linux/atomic.h>
 #include <linux/cgroup.h>
+#include <linux/filter.h>
 #include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/string.h>
 #include <linux/bpf.h>
 #include <linux/bpf-cgroup.h>
 #include <net/sock.h>
@@ -701,7 +704,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
 
 static const struct bpf_func_proto *
-cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
     switch (func_id) {
     case BPF_FUNC_map_lookup_elem:
@@ -710,6 +713,12 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
         return &bpf_map_update_elem_proto;
     case BPF_FUNC_map_delete_elem:
         return &bpf_map_delete_elem_proto;
+    case BPF_FUNC_map_push_elem:
+        return &bpf_map_push_elem_proto;
+    case BPF_FUNC_map_pop_elem:
+        return &bpf_map_pop_elem_proto;
+    case BPF_FUNC_map_peek_elem:
+        return &bpf_map_peek_elem_proto;
     case BPF_FUNC_get_current_uid_gid:
         return &bpf_get_current_uid_gid_proto;
     case BPF_FUNC_get_local_storage:
@@ -725,6 +734,12 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
     }
 }
 
+static const struct bpf_func_proto *
+cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+    return cgroup_base_func_proto(func_id, prog);
+}
+
 static bool cgroup_dev_is_valid_access(int off, int size,
                        enum bpf_access_type type,
                        const struct bpf_prog *prog,
@@ -762,3 +777,356 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
     .get_func_proto     = cgroup_dev_func_proto,
     .is_valid_access    = cgroup_dev_is_valid_access,
 };
+
+/**
+ * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
+ *
+ * @head: sysctl table header
+ * @table: sysctl table
+ * @write: sysctl is being read (= 0) or written (= 1)
+ * @buf: pointer to buffer passed by user space
+ * @pcount: value-result argument: value is size of buffer pointed to by @buf,
+ *    result is size of @new_buf if program set new value, initial value
+ *    otherwise
+ * @ppos: value-result argument: value is position at which read from or write
+ *    to sysctl is happening, result is new position if program overrode it,
+ *    initial value otherwise
+ * @new_buf: pointer to pointer to new buffer that will be allocated if program
+ *    overrides new value provided by user space on sysctl write
+ *    NOTE: it's caller responsibility to free *new_buf if it was set
+ * @type: type of program to be executed
+ *
+ * Program is run when sysctl is being accessed, either read or written, and
+ * can allow or deny such access.
+ *
+ * This function will return %-EPERM if an attached program is found and
+ * returned value != 1 during execution. In all other cases 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
+                   struct ctl_table *table, int write,
+                   void __user *buf, size_t *pcount,
+                   loff_t *ppos, void **new_buf,
+                   enum bpf_attach_type type)
+{
+    struct bpf_sysctl_kern ctx = {
+        .head = head,
+        .table = table,
+        .write = write,
+        .ppos = ppos,
+        .cur_val = NULL,
+        .cur_len = PAGE_SIZE,
+        .new_val = NULL,
+        .new_len = 0,
+        .new_updated = 0,
+    };
+    struct cgroup *cgrp;
+    int ret;
+
+    ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
+    if (ctx.cur_val) {
+        mm_segment_t old_fs;
+        loff_t pos = 0;
+
+        old_fs = get_fs();
+        set_fs(KERNEL_DS);
+        if (table->proc_handler(table, 0, (void __user *)ctx.cur_val,
+                    &ctx.cur_len, &pos)) {
+            /* Let BPF program decide how to proceed. */
+            ctx.cur_len = 0;
+        }
+        set_fs(old_fs);
+    } else {
+        /* Let BPF program decide how to proceed. */
+        ctx.cur_len = 0;
+    }
+
+    if (write && buf && *pcount) {
+        /* BPF program should be able to override new value with a
+         * buffer bigger than provided by user.
+         */
+        ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
+        ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
+        if (!ctx.new_val ||
+            copy_from_user(ctx.new_val, buf, ctx.new_len))
+            /* Let BPF program decide how to proceed. */
+            ctx.new_len = 0;
+    }
+
+    rcu_read_lock();
+    cgrp = task_dfl_cgroup(current);
+    ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
+    rcu_read_unlock();
+
+    kfree(ctx.cur_val);
+
+    if (ret == 1 && ctx.new_updated) {
+        *new_buf = ctx.new_val;
+        *pcount = ctx.new_len;
+    } else {
+        kfree(ctx.new_val);
+    }
+
+    return ret == 1 ? 0 : -EPERM;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
+
+static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
+                  size_t *lenp)
+{
+    ssize_t tmp_ret = 0, ret;
+
+    if (dir->header.parent) {
+        tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
+        if (tmp_ret < 0)
+            return tmp_ret;
+    }
+
+    ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
+    if (ret < 0)
+        return ret;
+    *bufp += ret;
+    *lenp -= ret;
+    ret += tmp_ret;
+
+    /* Avoid leading slash. */
+    if (!ret)
+        return ret;
+
+    tmp_ret = strscpy(*bufp, "/", *lenp);
+    if (tmp_ret < 0)
+        return tmp_ret;
+    *bufp += tmp_ret;
+    *lenp -= tmp_ret;
+
+    return ret + tmp_ret;
+}
+
+BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
+       size_t, buf_len, u64, flags)
+{
+    ssize_t tmp_ret = 0, ret;
+
+    if (!buf)
+        return -EINVAL;
+
+    if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
+        if (!ctx->head)
+            return -EINVAL;
+        tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
+        if (tmp_ret < 0)
+            return tmp_ret;
+    }
+
+    ret = strscpy(buf, ctx->table->procname, buf_len);
+
+    return ret < 0 ? ret : tmp_ret + ret;
+}
+
+static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
+    .func       = bpf_sysctl_get_name,
+    .gpl_only   = false,
+    .ret_type   = RET_INTEGER,
+    .arg1_type  = ARG_PTR_TO_CTX,
+    .arg2_type  = ARG_PTR_TO_MEM,
+    .arg3_type  = ARG_CONST_SIZE,
+    .arg4_type  = ARG_ANYTHING,
+};
+
+static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
+                 size_t src_len)
+{
+    if (!dst)
+        return -EINVAL;
+
+    if (!dst_len)
+        return -E2BIG;
+
+    if (!src || !src_len) {
+        memset(dst, 0, dst_len);
+        return -EINVAL;
+    }
+
+    memcpy(dst, src, min(dst_len, src_len));
+
+    if (dst_len > src_len) {
+        memset(dst + src_len, '\0', dst_len - src_len);
+        return src_len;
+    }
+
+    dst[dst_len - 1] = '\0';
+
+    return -E2BIG;
+}
+
+BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
+       char *, buf, size_t, buf_len)
+{
+    return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
+}
+
+static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
+    .func       = bpf_sysctl_get_current_value,
+    .gpl_only   = false,
+    .ret_type   = RET_INTEGER,
+    .arg1_type  = ARG_PTR_TO_CTX,
+    .arg2_type  = ARG_PTR_TO_UNINIT_MEM,
+    .arg3_type  = ARG_CONST_SIZE,
+};
+
+BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
+       size_t, buf_len)
+{
+    if (!ctx->write) {
+        if (buf && buf_len)
+            memset(buf, '\0', buf_len);
+        return -EINVAL;
+    }
+    return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
+}
+
+static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
+    .func       = bpf_sysctl_get_new_value,
+    .gpl_only   = false,
+    .ret_type   = RET_INTEGER,
+    .arg1_type  = ARG_PTR_TO_CTX,
+    .arg2_type  = ARG_PTR_TO_UNINIT_MEM,
+    .arg3_type  = ARG_CONST_SIZE,
+};
+
+BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
+       const char *, buf, size_t, buf_len)
+{
+    if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
+        return -EINVAL;
+
+    if (buf_len > PAGE_SIZE - 1)
+        return -E2BIG;
+
+    memcpy(ctx->new_val, buf, buf_len);
+    ctx->new_len = buf_len;
+    ctx->new_updated = 1;
+
+    return 0;
+}
+
+static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
+    .func       = bpf_sysctl_set_new_value,
+    .gpl_only   = false,
+    .ret_type   = RET_INTEGER,
+    .arg1_type  = ARG_PTR_TO_CTX,
+    .arg2_type  = ARG_PTR_TO_MEM,
+    .arg3_type  = ARG_CONST_SIZE,
+};
+
+static const struct bpf_func_proto *
+sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+    switch (func_id) {
+    case BPF_FUNC_strtol:
+        return &bpf_strtol_proto;
+    case BPF_FUNC_strtoul:
+        return &bpf_strtoul_proto;
+    case BPF_FUNC_sysctl_get_name:
+        return &bpf_sysctl_get_name_proto;
+    case BPF_FUNC_sysctl_get_current_value:
+        return &bpf_sysctl_get_current_value_proto;
+    case BPF_FUNC_sysctl_get_new_value:
+        return &bpf_sysctl_get_new_value_proto;
+    case BPF_FUNC_sysctl_set_new_value:
+        return &bpf_sysctl_set_new_value_proto;
+    default:
+        return cgroup_base_func_proto(func_id, prog);
+    }
+}
+
+static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
+                   const struct bpf_prog *prog,
+                   struct bpf_insn_access_aux *info)
+{
+    const int size_default = sizeof(__u32);
+
+    if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
+        return false;
+
+    switch (off) {
+    case offsetof(struct bpf_sysctl, write):
+        if (type != BPF_READ)
+            return false;
+        bpf_ctx_record_field_size(info, size_default);
+        return bpf_ctx_narrow_access_ok(off, size, size_default);
+    case offsetof(struct bpf_sysctl, file_pos):
+        if (type == BPF_READ) {
+            bpf_ctx_record_field_size(info, size_default);
+            return bpf_ctx_narrow_access_ok(off, size, size_default);
+        } else {
+            return size == size_default;
+        }
+    default:
+        return false;
+    }
+}
+
+static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
+                     const struct bpf_insn *si,
+                     struct bpf_insn *insn_buf,
+                     struct bpf_prog *prog, u32 *target_size)
+{
+    struct bpf_insn *insn = insn_buf;
+
+    switch (si->off) {
+    case offsetof(struct bpf_sysctl, write):
+        *insn++ = BPF_LDX_MEM(
+            BPF_SIZE(si->code), si->dst_reg, si->src_reg,
+            bpf_target_off(struct bpf_sysctl_kern, write,
+                       FIELD_SIZEOF(struct bpf_sysctl_kern,
+                            write),
+                       target_size));
+        break;
+    case offsetof(struct bpf_sysctl, file_pos):
+        /* ppos is a pointer so it should be accessed via indirect
+         * loads and stores. Also for stores additional temporary
+         * register is used since neither src_reg nor dst_reg can be
+         * overridden.
+         */
+        if (type == BPF_WRITE) {
+            int treg = BPF_REG_9;
+
+            if (si->src_reg == treg || si->dst_reg == treg)
+                --treg;
+            if (si->src_reg == treg || si->dst_reg == treg)
+                --treg;
+            *insn++ = BPF_STX_MEM(
+                BPF_DW, si->dst_reg, treg,
+                offsetof(struct bpf_sysctl_kern, tmp_reg));
+            *insn++ = BPF_LDX_MEM(
+                BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
+                treg, si->dst_reg,
+                offsetof(struct bpf_sysctl_kern, ppos));
+            *insn++ = BPF_STX_MEM(
+                BPF_SIZEOF(u32), treg, si->src_reg, 0);
+            *insn++ = BPF_LDX_MEM(
+                BPF_DW, treg, si->dst_reg,
+                offsetof(struct bpf_sysctl_kern, tmp_reg));
+        } else {
+            *insn++ = BPF_LDX_MEM(
+                BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
+                si->dst_reg, si->src_reg,
+                offsetof(struct bpf_sysctl_kern, ppos));
+            *insn++ = BPF_LDX_MEM(
+                BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0);
+        }
+        *target_size = sizeof(u32);
+        break;
+    }
+
+    return insn - insn_buf;
+}
+
+const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
+    .get_func_proto     = sysctl_func_proto,
+    .is_valid_access    = sysctl_is_valid_access,
+    .convert_ctx_access = sysctl_convert_ctx_access,
+};
+
+const struct bpf_prog_ops cg_sysctl_prog_ops = {
+};
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 3c18260403dd..cf727d77c6c6 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -160,12 +160,12 @@ static void cpu_map_kthread_stop(struct work_struct *work)
 }
 
 static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
-                     struct xdp_frame *xdpf)
+                     struct xdp_frame *xdpf,
+                     struct sk_buff *skb)
 {
     unsigned int hard_start_headroom;
     unsigned int frame_size;
     void *pkt_data_start;
-    struct sk_buff *skb;
 
     /* Part of headroom was reserved to xdpf */
     hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom;
@@ -191,8 +191,8 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
         SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
     pkt_data_start = xdpf->data - hard_start_headroom;
-    skb = build_skb(pkt_data_start, frame_size);
-    if (!skb)
+    skb = build_skb_around(skb, pkt_data_start, frame_size);
+    if (unlikely(!skb))
         return NULL;
 
     skb_reserve(skb, hard_start_headroom);
@@ -240,6 +240,8 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
     }
 }
 
+#define CPUMAP_BATCH 8
+
 static int cpu_map_kthread_run(void *data)
 {
     struct bpf_cpu_map_entry *rcpu = data;
@@ -252,8 +254,11 @@ static int cpu_map_kthread_run(void *data)
      * kthread_stop signal until queue is empty.
      */
     while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
-        unsigned int processed = 0, drops = 0, sched = 0;
-        struct xdp_frame *xdpf;
+        unsigned int drops = 0, sched = 0;
+        void *frames[CPUMAP_BATCH];
+        void *skbs[CPUMAP_BATCH];
+        gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
+        int i, n, m;
 
         /* Release CPU reschedule checks */
         if (__ptr_ring_empty(rcpu->queue)) {
@@ -269,18 +274,38 @@ static int cpu_map_kthread_run(void *data)
             sched = cond_resched();
         }
 
-        /* Process packets in rcpu->queue */
-        local_bh_disable();
         /*
          * The bpf_cpu_map_entry is single consumer, with this
          * kthread CPU pinned. Lockless access to ptr_ring
          * consume side valid as no-resize allowed of queue.
          */
-        while ((xdpf = __ptr_ring_consume(rcpu->queue))) {
-            struct sk_buff *skb;
+        n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH);
+
+        for (i = 0; i < n; i++) {
+            void *f = frames[i];
+            struct page *page = virt_to_page(f);
+
+            /* Bring struct page memory area to curr CPU. Read by
+             * build_skb_around via page_is_pfmemalloc(), and when
+             * freed written by page_frag_free call.
+             */
+            prefetchw(page);
+        }
+
+        m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs);
+        if (unlikely(m == 0)) {
+            for (i = 0; i < n; i++)
+                skbs[i] = NULL; /* effect: xdp_return_frame */
+            drops = n;
+        }
+
+        local_bh_disable();
+        for (i = 0; i < n; i++) {
+            struct xdp_frame *xdpf = frames[i];
+            struct sk_buff *skb = skbs[i];
             int ret;
 
-            skb = cpu_map_build_skb(rcpu, xdpf);
+            skb = cpu_map_build_skb(rcpu, xdpf, skb);
             if (!skb) {
                 xdp_return_frame(xdpf);
                 continue;
@@ -290,13 +315,9 @@ static int cpu_map_kthread_run(void *data)
             ret = netif_receive_skb_core(skb);
             if (ret == NET_RX_DROP)
                 drops++;
-
-            /* Limit BH-disable period */
-            if (++processed == 8)
-                break;
         }
 
         /* Feedback loop via tracepoint */
-        trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched);
+        trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched);
 
         local_bh_enable(); /* resched point, may call do_softirq() */
     }
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a411fc17d265..4266ffde07ca 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -18,6 +18,9 @@
 #include <linux/sched.h>
 #include <linux/uidgid.h>
 #include <linux/filter.h>
+#include <linux/ctype.h>
+
+#include "../../lib/kstrtox.h"
 
 /* If kernel subsystem is allowing eBPF programs to call this function,
  * inside its own verifier_ops->get_func_proto() callback it should return
@@ -363,4 +366,132 @@ const struct bpf_func_proto bpf_get_local_storage_proto = {
     .arg2_type  = ARG_ANYTHING,
 };
 #endif
+
+#define BPF_STRTOX_BASE_MASK 0x1F
+
+static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags,
+              unsigned long long *res, bool *is_negative)
+{
+    unsigned int base = flags & BPF_STRTOX_BASE_MASK;
+    const char *cur_buf = buf;
+    size_t cur_len = buf_len;
+    unsigned int consumed;
+    size_t val_len;
+    char str[64];
+
+    if (!buf || !buf_len || !res || !is_negative)
+        return -EINVAL;
+
+    if (base != 0 && base != 8 && base != 10 && base != 16)
+        return -EINVAL;
+
+    if (flags & ~BPF_STRTOX_BASE_MASK)
+        return -EINVAL;
+
+    while (cur_buf < buf + buf_len && isspace(*cur_buf))
+        ++cur_buf;
+
+    *is_negative = (cur_buf < buf + buf_len && *cur_buf == '-');
+    if (*is_negative)
+        ++cur_buf;
+
+    consumed = cur_buf - buf;
+    cur_len -= consumed;
+    if (!cur_len)
+        return -EINVAL;
+
+    cur_len = min(cur_len, sizeof(str) - 1);
+    memcpy(str, cur_buf, cur_len);
+    str[cur_len] = '\0';
+    cur_buf = str;
+
+    cur_buf = _parse_integer_fixup_radix(cur_buf, &base);
+    val_len = _parse_integer(cur_buf, base, res);
+
+    if (val_len & KSTRTOX_OVERFLOW)
+        return -ERANGE;
+
+    if (val_len == 0)
+        return -EINVAL;
+
+    cur_buf += val_len;
+    consumed += cur_buf - str;
+
+    return consumed;
+}
+
+static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags,
+             long long *res)
+{
+    unsigned long long _res;
+    bool is_negative;
+    int err;
+
+    err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
+    if (err < 0)
+        return err;
+    if (is_negative) {
+        if ((long long)-_res > 0)
+            return -ERANGE;
+        *res = -_res;
+    } else {
+        if ((long long)_res < 0)
+            return -ERANGE;
+        *res = _res;
+    }
+    return err;
+}
+
+BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags,
+       long *, res)
+{
+    long long _res;
+    int err;
+
+    err = __bpf_strtoll(buf, buf_len, flags, &_res);
+    if (err < 0)
+        return err;
+    if (_res != (long)_res)
+        return -ERANGE;
+    *res = _res;
+    return err;
+}
+
+const struct bpf_func_proto bpf_strtol_proto = {
+    .func       = bpf_strtol,
+    .gpl_only   = false,
+    .ret_type   = RET_INTEGER,
+    .arg1_type  = ARG_PTR_TO_MEM,
+    .arg2_type  = ARG_CONST_SIZE,
+    .arg3_type  = ARG_ANYTHING,
+    .arg4_type  = ARG_PTR_TO_LONG,
+};
+
+BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags,
+       unsigned long *, res)
+{
+    unsigned long long _res;
+    bool is_negative;
+    int err;
+
+    err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
+    if (err < 0)
+        return err;
+    if (is_negative)
+        return -EINVAL;
+    if (_res != (unsigned long)_res)
+        return -ERANGE;
+    *res = _res;
+    return err;
+}
+
+const struct bpf_func_proto bpf_strtoul_proto = {
+    .func       = bpf_strtoul,
+    .gpl_only   = false,
+    .ret_type   = RET_INTEGER,
+    .arg1_type  = ARG_PTR_TO_MEM,
+    .arg2_type  = ARG_CONST_SIZE,
+    .arg3_type  = ARG_ANYTHING,
+    .arg4_type  = ARG_PTR_TO_LONG,
+};
 #endif
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d995eedfdd16..92c9b8a32b50 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1888,6 +1888,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
     case BPF_FLOW_DISSECTOR:
         ptype = BPF_PROG_TYPE_FLOW_DISSECTOR;
         break;
+    case BPF_CGROUP_SYSCTL:
+        ptype = BPF_PROG_TYPE_CGROUP_SYSCTL;
+        break;
     default:
         return -EINVAL;
     }
@@ -1966,6 +1969,9 @@ static int bpf_prog_detach(const union bpf_attr *attr)
         return lirc_prog_detach(attr);
     case BPF_FLOW_DISSECTOR:
         return skb_flow_dissector_bpf_prog_detach(attr);
+    case BPF_CGROUP_SYSCTL:
+        ptype = BPF_PROG_TYPE_CGROUP_SYSCTL;
+        break;
     default:
         return -EINVAL;
     }
@@ -1999,6 +2005,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
     case BPF_CGROUP_UDP6_SENDMSG:
     case BPF_CGROUP_SOCK_OPS:
     case BPF_CGROUP_DEVICE:
+    case BPF_CGROUP_SYSCTL:
         break;
     case BPF_LIRC_MODE2:
         return lirc_prog_query(attr, uattr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f25b7c9c20ba..423f242a5efb 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1177,30 +1177,32 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 {
     struct bpf_verifier_state *vstate = env->cur_state;
     struct bpf_func_state *state = vstate->frame[vstate->curframe];
-    struct bpf_reg_state *regs = state->regs;
+    struct bpf_reg_state *reg, *regs = state->regs;
 
     if (regno >= MAX_BPF_REG) {
         verbose(env, "R%d is invalid\n", regno);
         return -EINVAL;
     }
 
+    reg = &regs[regno];
     if (t == SRC_OP) {
         /* check whether register used as source operand can be read */
-        if (regs[regno].type == NOT_INIT) {
+        if (reg->type == NOT_INIT) {
             verbose(env, "R%d !read_ok\n", regno);
             return -EACCES;
         }
         /* We don't need to worry about FP liveness because it's read-only */
-        if (regno != BPF_REG_FP)
-            return mark_reg_read(env, &regs[regno],
-                         regs[regno].parent);
+        if (regno == BPF_REG_FP)
+            return 0;
+
+        return mark_reg_read(env, reg, reg->parent);
     } else {
         /* check whether register used as dest operand can be written to */
         if (regno == BPF_REG_FP) {
             verbose(env, "frame pointer is read only\n");
             return -EACCES;
         }
-        regs[regno].live |= REG_LIVE_WRITTEN;
+        reg->live |= REG_LIVE_WRITTEN;
         if (t == DST_OP)
             mark_reg_unknown(env, regs, regno);
     }
@@ -2462,6 +2464,22 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type)
            type == ARG_CONST_SIZE_OR_ZERO;
 }
 
+static bool arg_type_is_int_ptr(enum bpf_arg_type type)
+{
+    return type == ARG_PTR_TO_INT ||
+           type == ARG_PTR_TO_LONG;
+}
+
+static int int_ptr_type_to_size(enum bpf_arg_type type)
+{
+    if (type == ARG_PTR_TO_INT)
+        return sizeof(u32);
+    else if (type == ARG_PTR_TO_LONG)
+        return sizeof(u64);
+
+    return -EINVAL;
+}
+
 static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
               enum bpf_arg_type arg_type,
               struct bpf_call_arg_meta *meta)
@@ -2554,6 +2572,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
              type != expected_type)
             goto err_type;
         meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
+    } else if (arg_type_is_int_ptr(arg_type)) {
+        expected_type = PTR_TO_STACK;
+        if (!type_is_pkt_pointer(type) &&
+            type != PTR_TO_MAP_VALUE &&
+            type != expected_type)
+            goto err_type;
     } else {
         verbose(env, "unsupported arg_type %d\n", arg_type);
         return -EFAULT;
@@ -2635,6 +2659,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
         err = check_helper_mem_access(env, regno - 1,
                           reg->umax_value,
                           zero_size_allowed, meta);
+    } else if (arg_type_is_int_ptr(arg_type)) {
+        int size = int_ptr_type_to_size(arg_type);
+
+        err = check_helper_mem_access(env, regno, size, false, meta);
+        if (err)
+            return err;
+        err = check_ptr_alignment(env, reg, 0, size, true);
     }
 
     return err;
@@ -5267,6 +5298,7 @@ static int check_return_code(struct bpf_verifier_env *env)
     case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
     case BPF_PROG_TYPE_SOCK_OPS:
     case BPF_PROG_TYPE_CGROUP_DEVICE:
+    case BPF_PROG_TYPE_CGROUP_SYSCTL:
         break;
     default:
         return 0;
@@ -5337,10 +5369,6 @@ enum {
 
 #define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L)
 
-static int *insn_stack; /* stack of insns to process */
-static int cur_stack;   /* current stack index */
-static int *insn_state;
-
 /* t, w, e - match pseudo-code above:
  *  t - index of current instruction
  *  w - next instruction
@@ -5348,6 +5376,9 @@ static int *insn_state;
  */
 static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
 {
+    int *insn_stack = env->cfg.insn_stack;
+    int *insn_state = env->cfg.insn_state;
+
     if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
         return 0;
 
@@ -5368,9 +5399,9 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
         /* tree-edge */
         insn_state[t] = DISCOVERED | e;
         insn_state[w] = DISCOVERED;
-        if (cur_stack >= env->prog->len)
+        if (env->cfg.cur_stack >= env->prog->len)
             return -E2BIG;
-        insn_stack[cur_stack++] = w;
+        insn_stack[env->cfg.cur_stack++] = w;
         return 1;
     } else if ((insn_state[w] & 0xF0) == DISCOVERED) {
         verbose_linfo(env, t, "%d: ", t);
@@ -5394,14 +5425,15 @@ static int check_cfg(struct bpf_verifier_env *env)
 {
     struct bpf_insn *insns = env->prog->insnsi;
     int insn_cnt = env->prog->len;
+    int *insn_stack, *insn_state;
     int ret = 0;
     int i, t;
 
-    insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+    insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
     if (!insn_state)
         return -ENOMEM;
 
-    insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+    insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
     if (!insn_stack) {
         kvfree(insn_state);
         return -ENOMEM;
@@ -5409,12 +5441,12 @@ static int check_cfg(struct bpf_verifier_env *env)
 
     insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
     insn_stack[0] = 0; /* 0 is the first instruction */
-    cur_stack = 1;
+    env->cfg.cur_stack = 1;
 
 peek_stack:
-    if (cur_stack == 0)
+    if (env->cfg.cur_stack == 0)
         goto check_state;
-    t = insn_stack[cur_stack - 1];
+    t = insn_stack[env->cfg.cur_stack - 1];
 
     if (BPF_CLASS(insns[t].code) == BPF_JMP ||
         BPF_CLASS(insns[t].code) == BPF_JMP32) {
@@ -5483,7 +5515,7 @@ peek_stack:
 
 mark_explored:
     insn_state[t] = EXPLORED;
-    if (cur_stack-- <= 0) {
+    if (env->cfg.cur_stack-- <= 0) {
         verbose(env, "pop stack internal bug\n");
         ret = -EFAULT;
         goto err_free;
@@ -5503,6 +5535,7 @@ check_state:
 err_free:
     kvfree(insn_state);
     kvfree(insn_stack);
+    env->cfg.insn_state = env->cfg.insn_stack = NULL;
     return ret;
 }
 
@@ -6191,6 +6224,22 @@ static bool states_equal(struct bpf_verifier_env *env,
     return true;
 }
 
+static int propagate_liveness_reg(struct bpf_verifier_env *env,
+                  struct bpf_reg_state *reg,
+                  struct bpf_reg_state *parent_reg)
+{
+    int err;
+
+    if (parent_reg->live & REG_LIVE_READ || !(reg->live & REG_LIVE_READ))
+        return 0;
+
+    err = mark_reg_read(env, reg, parent_reg);
+    if (err)
+        return err;
+
+    return 0;
+}
+
 /* A write screens off any subsequent reads; but write marks come from the
  * straight-line code between a state and its parent. When we arrive at an
  * equivalent state (jump target or such) we didn't arrive by the straight-line
@@ -6202,8 +6251,9 @@ static int propagate_liveness(struct bpf_verifier_env *env,
                   const struct bpf_verifier_state *vstate,
                   struct bpf_verifier_state *vparent)
 {
-    int i, frame, err = 0;
+    struct bpf_reg_state *state_reg, *parent_reg;
     struct bpf_func_state *state, *parent;
+    int i, frame, err = 0;
 
     if (vparent->curframe != vstate->curframe) {
         WARN(1, "propagate_live: parent frame %d current frame %d\n",
@@ -6213,30 +6263,27 @@ static int propagate_liveness(struct bpf_verifier_env *env,
     /* Propagate read liveness of registers... */
     BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
     for (frame = 0; frame <= vstate->curframe; frame++) {
+        parent = vparent->frame[frame];
+        state = vstate->frame[frame];
+        parent_reg = parent->regs;
+        state_reg = state->regs;
         /* We don't need to worry about FP liveness, it's read-only */
         for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) {
-            if (vparent->frame[frame]->regs[i].live & REG_LIVE_READ)
-                continue;
-            if (vstate->frame[frame]->regs[i].live & REG_LIVE_READ) {
-                err = mark_reg_read(env, &vstate->frame[frame]->regs[i],
-                            &vparent->frame[frame]->regs[i]);
-                if (err)
-                    return err;
-            }
+            err = propagate_liveness_reg(env, &state_reg[i],
+                             &parent_reg[i]);
+            if (err)
+                return err;
         }
-    }
 
-    /* ... and stack slots */
-    for (frame = 0; frame <= vstate->curframe; frame++) {
-        state = vstate->frame[frame];
-        parent = vparent->frame[frame];
+        /* Propagate stack slots. */
         for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
                 i < parent->allocated_stack / BPF_REG_SIZE; i++) {
-            if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
-                continue;
-            if (state->stack[i].spilled_ptr.live & REG_LIVE_READ)
-                mark_reg_read(env, &state->stack[i].spilled_ptr,
-                          &parent->stack[i].spilled_ptr);
+            parent_reg = &parent->stack[i].spilled_ptr;
+            state_reg = &state->stack[i].spilled_ptr;
+            err = propagate_liveness_reg(env, state_reg,
+                             parent_reg);
+            if (err)
+                return err;
         }
     }
     return err;
@@ -7601,9 +7648,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
             insn->src_reg != BPF_PSEUDO_CALL)
             continue;
         subprog = insn->off;
-        insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
-            func[subprog]->bpf_func -
-            __bpf_call_base;
+        insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) -
+                __bpf_call_base;
     }
 
     /* we use the aux data to keep a list of the start addresses
@@ -8086,9 +8132,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
         env->insn_aux_data[i].orig_idx = i;
     env->prog = *prog;
     env->ops = bpf_verifier_ops[env->prog->type];
+    is_priv = capable(CAP_SYS_ADMIN);
 
     /* grab the mutex to protect few globals used by verifier */
-    mutex_lock(&bpf_verifier_lock);
+    if (!is_priv)
+        mutex_lock(&bpf_verifier_lock);
 
     if (attr->log_level || attr->log_buf || attr->log_size) {
         /* user requested verbose verifier output
@@ -8111,7 +8159,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
     if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
         env->strict_alignment = false;
 
-    is_priv = capable(CAP_SYS_ADMIN);
     env->allow_ptr_leaks = is_priv;
 
     ret = replace_map_fd_with_map_ptr(env);
@@ -8224,7 +8271,8 @@ err_release_maps:
         release_maps(env);
     *prog = env->prog;
 err_unlock:
-    mutex_unlock(&bpf_verifier_lock);
+    if (!is_priv)
+        mutex_unlock(&bpf_verifier_lock);
     vfree(env->insn_aux_data);
 err_free_env:
     kfree(env);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d64c00afceb5..91800be0c8eb 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -569,6 +569,12 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
         return &bpf_map_update_elem_proto;
     case BPF_FUNC_map_delete_elem:
         return &bpf_map_delete_elem_proto;
+    case BPF_FUNC_map_push_elem:
+        return &bpf_map_push_elem_proto;
+    case BPF_FUNC_map_pop_elem:
+        return &bpf_map_pop_elem_proto;
+    case BPF_FUNC_map_peek_elem:
+        return &bpf_map_peek_elem_proto;
     case BPF_FUNC_probe_read:
         return &bpf_probe_read_proto;
     case BPF_FUNC_ktime_get_ns: