diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-06-03 16:27:18 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-06-03 16:27:18 -0700 |
commit | cb8e59cc87201af93dfbb6c3dccc8fcad72a09c2 (patch) | |
tree | a334db9022f89654b777bbce8c4c6632e65b9031 /kernel | |
parent | 2e63f6ce7ed2c4ff83ba30ad9ccad422289a6c63 (diff) | |
parent | 065fcfd49763ec71ae345bb5c5a74f961031e70e (diff) | |
download | linux-cb8e59cc87201af93dfbb6c3dccc8fcad72a09c2.tar.bz2 |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from David Miller:
1) Allow setting bluetooth L2CAP modes via socket option, from Luiz
Augusto von Dentz.
2) Add GSO partial support to igc, from Sasha Neftin.
3) Several cleanups and improvements to r8169 from Heiner Kallweit.
4) Add IF_OPER_TESTING link state and use it when ethtool triggers a
device self-test. From Andrew Lunn.
5) Start moving away from custom driver versions, use the globally
defined kernel version instead, from Leon Romanovsky.
6) Support GRO vis gro_cells in DSA layer, from Alexander Lobakin.
7) Allow hard IRQ deferral during NAPI, from Eric Dumazet.
8) Add sriov and vf support to hinic, from Luo bin.
9) Support Media Redundancy Protocol (MRP) in the bridging code, from
Horatiu Vultur.
10) Support netmap in the nft_nat code, from Pablo Neira Ayuso.
11) Allow UDPv6 encapsulation of ESP in the ipsec code, from Sabrina
Dubroca. Also add ipv6 support for espintcp.
12) Lots of ReST conversions of the networking documentation, from Mauro
Carvalho Chehab.
13) Support configuration of ethtool rxnfc flows in bcmgenet driver,
from Doug Berger.
14) Allow to dump cgroup id and filter by it in inet_diag code, from
Dmitry Yakunin.
15) Add infrastructure to export netlink attribute policies to
userspace, from Johannes Berg.
16) Several optimizations to sch_fq scheduler, from Eric Dumazet.
17) Fallback to the default qdisc if qdisc init fails because otherwise
a packet scheduler init failure will make a device inoperative. From
Jesper Dangaard Brouer.
18) Several RISCV bpf jit optimizations, from Luke Nelson.
19) Correct the return type of the ->ndo_start_xmit() method in several
drivers, it's netdev_tx_t but many drivers were using
'int'. From Yunjian Wang.
20) Add an ethtool interface for PHY master/slave config, from Oleksij
Rempel.
21) Add BPF iterators, from Yonghang Song.
22) Add cable test infrastructure, including ethool interfaces, from
Andrew Lunn. Marvell PHY driver is the first to support this
facility.
23) Remove zero-length arrays all over, from Gustavo A. R. Silva.
24) Calculate and maintain an explicit frame size in XDP, from Jesper
Dangaard Brouer.
25) Add CAP_BPF, from Alexei Starovoitov.
26) Support terse dumps in the packet scheduler, from Vlad Buslov.
27) Support XDP_TX bulking in dpaa2 driver, from Ioana Ciornei.
28) Add devm_register_netdev(), from Bartosz Golaszewski.
29) Minimize qdisc resets, from Cong Wang.
30) Get rid of kernel_getsockopt and kernel_setsockopt in order to
eliminate set_fs/get_fs calls. From Christoph Hellwig.
* git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2517 commits)
selftests: net: ip_defrag: ignore EPERM
net_failover: fixed rollback in net_failover_open()
Revert "tipc: Fix potential tipc_aead refcnt leak in tipc_crypto_rcv"
Revert "tipc: Fix potential tipc_node refcnt leak in tipc_rcv"
vmxnet3: allow rx flow hash ops only when rss is enabled
hinic: add set_channels ethtool_ops support
selftests/bpf: Add a default $(CXX) value
tools/bpf: Don't use $(COMPILE.c)
bpf, selftests: Use bpf_probe_read_kernel
s390/bpf: Use bcr 0,%0 as tail call nop filler
s390/bpf: Maintain 8-byte stack alignment
selftests/bpf: Fix verifier test
selftests/bpf: Fix sample_cnt shared between two threads
bpf, selftests: Adapt cls_redirect to call csum_level helper
bpf: Add csum_level helper for fixing up csum levels
bpf: Fix up bpf_skb_adjust_room helper's skb csum setting
sfc: add missing annotation for efx_ef10_try_update_nic_stats_vf()
crypto/chtls: IPv6 support for inline TLS
Crypto/chcr: Fixes a coccinile check error
Crypto/chcr: Fixes compilations warnings
...
Diffstat (limited to 'kernel')
45 files changed, 4826 insertions, 2215 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index f2d7be596966..1131a921e1a6 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,9 +2,9 @@ obj-y := core.o CFLAGS_core.o += $(call cc-disable-warning, override-init) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o @@ -12,10 +12,8 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o -ifeq ($(CONFIG_XDP_SOCKETS),y) -obj-$(CONFIG_BPF_SYSCALL) += xskmap.o -endif obj-$(CONFIG_BPF_SYSCALL) += offload.o +obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1d6120fd5ba6..11584618e861 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -77,7 +77,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int ret, numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; - bool unpriv = !capable(CAP_SYS_ADMIN); + bool bypass_spec_v1 = bpf_bypass_spec_v1(); u64 cost, array_size, mask64; struct bpf_map_memory mem; struct bpf_array *array; @@ -95,7 +95,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) mask64 -= 1; index_mask = mask64; - if (unpriv) { + if (!bypass_spec_v1) { /* round up array size to nearest power of 2, * since cpu will speculate within index_mask limits */ @@ -149,7 +149,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); } array->index_mask = index_mask; - array->map.unpriv_array = unpriv; + array->map.bypass_spec_v1 = bypass_spec_v1; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); @@ -219,7 +219,7 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - if (map->unpriv_array) { + if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { @@ -1058,7 +1058,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - if (map->unpriv_array) { + if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c new file mode 100644 index 000000000000..dd612b80b9fe --- /dev/null +++ b/kernel/bpf/bpf_iter.c @@ -0,0 +1,539 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ + +#include <linux/fs.h> +#include <linux/anon_inodes.h> +#include <linux/filter.h> +#include <linux/bpf.h> + +struct bpf_iter_target_info { + struct list_head list; + const struct bpf_iter_reg *reg_info; + u32 btf_id; /* cached value */ +}; + +struct bpf_iter_link { + struct bpf_link link; + struct bpf_iter_target_info *tinfo; +}; + +struct bpf_iter_priv_data { + struct bpf_iter_target_info *tinfo; + struct bpf_prog *prog; + u64 session_id; + u64 seq_num; + bool done_stop; + u8 target_private[] __aligned(8); +}; + +static struct list_head targets = LIST_HEAD_INIT(targets); +static DEFINE_MUTEX(targets_mutex); + +/* protect bpf_iter_link changes */ +static DEFINE_MUTEX(link_mutex); + +/* incremented on every opened seq_file */ +static atomic64_t session_id; + +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link); + +static void bpf_iter_inc_seq_num(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + iter_priv->seq_num++; +} + +static void bpf_iter_dec_seq_num(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + iter_priv->seq_num--; +} + +static void bpf_iter_done_stop(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + iter_priv->done_stop = true; +} + +/* bpf_seq_read, a customized and simpler version for bpf iterator. + * no_llseek is assumed for this file. + * The following are differences from seq_read(): + * . fixed buffer size (PAGE_SIZE) + * . assuming no_llseek + * . stop() may call bpf program, handling potential overflow there + */ +static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, + loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + size_t n, offs, copied = 0; + int err = 0; + void *p; + + mutex_lock(&seq->lock); + + if (!seq->buf) { + seq->size = PAGE_SIZE; + seq->buf = kmalloc(seq->size, GFP_KERNEL); + if (!seq->buf) { + err = -ENOMEM; + goto done; + } + } + + if (seq->count) { + n = min(seq->count, size); + err = copy_to_user(buf, seq->buf + seq->from, n); + if (err) { + err = -EFAULT; + goto done; + } + seq->count -= n; + seq->from += n; + copied = n; + goto done; + } + + seq->from = 0; + p = seq->op->start(seq, &seq->index); + if (!p) + goto stop; + if (IS_ERR(p)) { + err = PTR_ERR(p); + seq->op->stop(seq, p); + seq->count = 0; + goto done; + } + + err = seq->op->show(seq, p); + if (err > 0) { + /* object is skipped, decrease seq_num, so next + * valid object can reuse the same seq_num. + */ + bpf_iter_dec_seq_num(seq); + seq->count = 0; + } else if (err < 0 || seq_has_overflowed(seq)) { + if (!err) + err = -E2BIG; + seq->op->stop(seq, p); + seq->count = 0; + goto done; + } + + while (1) { + loff_t pos = seq->index; + + offs = seq->count; + p = seq->op->next(seq, p, &seq->index); + if (pos == seq->index) { + pr_info_ratelimited("buggy seq_file .next function %ps " + "did not updated position index\n", + seq->op->next); + seq->index++; + } + + if (IS_ERR_OR_NULL(p)) + break; + + /* got a valid next object, increase seq_num */ + bpf_iter_inc_seq_num(seq); + + if (seq->count >= size) + break; + + err = seq->op->show(seq, p); + if (err > 0) { + bpf_iter_dec_seq_num(seq); + seq->count = offs; + } else if (err < 0 || seq_has_overflowed(seq)) { + seq->count = offs; + if (offs == 0) { + if (!err) + err = -E2BIG; + seq->op->stop(seq, p); + goto done; + } + break; + } + } +stop: + offs = seq->count; + /* bpf program called if !p */ + seq->op->stop(seq, p); + if (!p) { + if (!seq_has_overflowed(seq)) { + bpf_iter_done_stop(seq); + } else { + seq->count = offs; + if (offs == 0) { + err = -E2BIG; + goto done; + } + } + } + + n = min(seq->count, size); + err = copy_to_user(buf, seq->buf, n); + if (err) { + err = -EFAULT; + goto done; + } + copied = n; + seq->count -= n; + seq->from = n; +done: + if (!copied) + copied = err; + else + *ppos += copied; + mutex_unlock(&seq->lock); + return copied; +} + +static int iter_open(struct inode *inode, struct file *file) +{ + struct bpf_iter_link *link = inode->i_private; + + return prepare_seq_file(file, link); +} + +static int iter_release(struct inode *inode, struct file *file) +{ + struct bpf_iter_priv_data *iter_priv; + struct seq_file *seq; + + seq = file->private_data; + if (!seq) + return 0; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + + if (iter_priv->tinfo->reg_info->fini_seq_private) + iter_priv->tinfo->reg_info->fini_seq_private(seq->private); + + bpf_prog_put(iter_priv->prog); + seq->private = iter_priv; + + return seq_release_private(inode, file); +} + +const struct file_operations bpf_iter_fops = { + .open = iter_open, + .llseek = no_llseek, + .read = bpf_seq_read, + .release = iter_release, +}; + +/* The argument reg_info will be cached in bpf_iter_target_info. + * The common practice is to declare target reg_info as + * a const static variable and passed as an argument to + * bpf_iter_reg_target(). + */ +int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) +{ + struct bpf_iter_target_info *tinfo; + + tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + if (!tinfo) + return -ENOMEM; + + tinfo->reg_info = reg_info; + INIT_LIST_HEAD(&tinfo->list); + + mutex_lock(&targets_mutex); + list_add(&tinfo->list, &targets); + mutex_unlock(&targets_mutex); + + return 0; +} + +void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info) +{ + struct bpf_iter_target_info *tinfo; + bool found = false; + + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (reg_info == tinfo->reg_info) { + list_del(&tinfo->list); + kfree(tinfo); + found = true; + break; + } + } + mutex_unlock(&targets_mutex); + + WARN_ON(found == false); +} + +static void cache_btf_id(struct bpf_iter_target_info *tinfo, + struct bpf_prog *prog) +{ + tinfo->btf_id = prog->aux->attach_btf_id; +} + +bool bpf_iter_prog_supported(struct bpf_prog *prog) +{ + const char *attach_fname = prog->aux->attach_func_name; + u32 prog_btf_id = prog->aux->attach_btf_id; + const char *prefix = BPF_ITER_FUNC_PREFIX; + struct bpf_iter_target_info *tinfo; + int prefix_len = strlen(prefix); + bool supported = false; + + if (strncmp(attach_fname, prefix, prefix_len)) + return false; + + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) { + supported = true; + break; + } + if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) { + cache_btf_id(tinfo, prog); + supported = true; + break; + } + } + mutex_unlock(&targets_mutex); + + if (supported) { + prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; + prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; + } + + return supported; +} + +static void bpf_iter_link_release(struct bpf_link *link) +{ +} + +static void bpf_iter_link_dealloc(struct bpf_link *link) +{ + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + + kfree(iter_link); +} + +static int bpf_iter_link_replace(struct bpf_link *link, + struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + int ret = 0; + + mutex_lock(&link_mutex); + if (old_prog && link->prog != old_prog) { + ret = -EPERM; + goto out_unlock; + } + + if (link->prog->type != new_prog->type || + link->prog->expected_attach_type != new_prog->expected_attach_type || + link->prog->aux->attach_btf_id != new_prog->aux->attach_btf_id) { + ret = -EINVAL; + goto out_unlock; + } + + old_prog = xchg(&link->prog, new_prog); + bpf_prog_put(old_prog); + +out_unlock: + mutex_unlock(&link_mutex); + return ret; +} + +static const struct bpf_link_ops bpf_iter_link_lops = { + .release = bpf_iter_link_release, + .dealloc = bpf_iter_link_dealloc, + .update_prog = bpf_iter_link_replace, +}; + +bool bpf_link_is_iter(struct bpf_link *link) +{ + return link->ops == &bpf_iter_link_lops; +} + +int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct bpf_iter_target_info *tinfo; + struct bpf_iter_link *link; + bool existed = false; + u32 prog_btf_id; + int err; + + if (attr->link_create.target_fd || attr->link_create.flags) + return -EINVAL; + + prog_btf_id = prog->aux->attach_btf_id; + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (tinfo->btf_id == prog_btf_id) { + existed = true; + break; + } + } + mutex_unlock(&targets_mutex); + if (!existed) + return -ENOENT; + + link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); + if (!link) + return -ENOMEM; + + bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog); + link->tinfo = tinfo; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + return err; + } + + return bpf_link_settle(&link_primer); +} + +static void init_seq_meta(struct bpf_iter_priv_data *priv_data, + struct bpf_iter_target_info *tinfo, + struct bpf_prog *prog) +{ + priv_data->tinfo = tinfo; + priv_data->prog = prog; + priv_data->session_id = atomic64_inc_return(&session_id); + priv_data->seq_num = 0; + priv_data->done_stop = false; +} + +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) +{ + struct bpf_iter_priv_data *priv_data; + struct bpf_iter_target_info *tinfo; + struct bpf_prog *prog; + u32 total_priv_dsize; + struct seq_file *seq; + int err = 0; + + mutex_lock(&link_mutex); + prog = link->link.prog; + bpf_prog_inc(prog); + mutex_unlock(&link_mutex); + + tinfo = link->tinfo; + total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + + tinfo->reg_info->seq_priv_size; + priv_data = __seq_open_private(file, tinfo->reg_info->seq_ops, + total_priv_dsize); + if (!priv_data) { + err = -ENOMEM; + goto release_prog; + } + + if (tinfo->reg_info->init_seq_private) { + err = tinfo->reg_info->init_seq_private(priv_data->target_private); + if (err) + goto release_seq_file; + } + + init_seq_meta(priv_data, tinfo, prog); + seq = file->private_data; + seq->private = priv_data->target_private; + + return 0; + +release_seq_file: + seq_release_private(file->f_inode, file); + file->private_data = NULL; +release_prog: + bpf_prog_put(prog); + return err; +} + +int bpf_iter_new_fd(struct bpf_link *link) +{ + struct file *file; + unsigned int flags; + int err, fd; + + if (link->ops != &bpf_iter_link_lops) + return -EINVAL; + + flags = O_RDONLY | O_CLOEXEC; + fd = get_unused_fd_flags(flags); + if (fd < 0) + return fd; + + file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto free_fd; + } + + err = prepare_seq_file(file, + container_of(link, struct bpf_iter_link, link)); + if (err) + goto free_file; + + fd_install(fd, file); + return fd; + +free_file: + fput(file); +free_fd: + put_unused_fd(fd); + return err; +} + +struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop) +{ + struct bpf_iter_priv_data *iter_priv; + struct seq_file *seq; + void *seq_priv; + + seq = meta->seq; + if (seq->file->f_op != &bpf_iter_fops) + return NULL; + + seq_priv = seq->private; + iter_priv = container_of(seq_priv, struct bpf_iter_priv_data, + target_private); + + if (in_stop && iter_priv->done_stop) + return NULL; + + meta->session_id = iter_priv->session_id; + meta->seq_num = iter_priv->seq_num; + + return iter_priv->prog; +} + +int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) +{ + int ret; + + rcu_read_lock(); + migrate_disable(); + ret = BPF_PROG_RUN(prog, ctx); + migrate_enable(); + rcu_read_unlock(); + + /* bpf program can only return 0 or 1: + * 0 : okay + * 1 : retry the same object + * The bpf_iter_run_prog() return value + * will be seq_ops->show() return value. + */ + return ret == 0 ? 0 : -EAGAIN; +} diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 19636703b24e..fb278144e9fd 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -49,6 +49,6 @@ const struct bpf_prog_ops lsm_prog_ops = { }; const struct bpf_verifier_ops lsm_verifier_ops = { - .get_func_proto = bpf_tracing_func_proto, + .get_func_proto = tracing_prog_func_proto, .is_valid_access = btf_ctx_access, }; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 26cb51f2db72..c6b0decaa46a 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -557,7 +557,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) struct bpf_map *map; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d65c6912bdaf..58c9af1d4808 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3482,6 +3482,7 @@ extern char __weak __stop_BTF[]; extern struct btf *btf_vmlinux; #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) static union { struct bpf_ctx_convert { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ @@ -3508,6 +3509,7 @@ static u8 bpf_ctx_convert_map[] = { 0, /* avoid empty array */ }; #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE static const struct btf_member * btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, @@ -3692,7 +3694,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, struct bpf_verifier_log *log = info->log; const struct btf_param *args; u32 nr_args, arg; - int ret; + int i, ret; if (off % 8) { bpf_log(log, "func '%s' offset %d is not multiple of 8\n", @@ -3789,6 +3791,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* this is a pointer to another type */ info->reg_type = PTR_TO_BTF_ID; + for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { + const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; + + if (ctx_arg_info->offset == off) { + info->reg_type = ctx_arg_info->reg_type; + break; + } + } if (tgt_prog) { ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); @@ -3828,6 +3838,7 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf_type *mtype, *elem_type = NULL; const struct btf_member *member; const char *tname, *mname; + u32 vlen; again: tname = __btf_name_by_offset(btf_vmlinux, t->name_off); @@ -3836,7 +3847,43 @@ again: return -EINVAL; } + vlen = btf_type_vlen(t); if (off + size > t->size) { + /* If the last element is a variable size array, we may + * need to relax the rule. + */ + struct btf_array *array_elem; + + if (vlen == 0) + goto error; + + member = btf_type_member(t) + vlen - 1; + mtype = btf_type_skip_modifiers(btf_vmlinux, member->type, + NULL); + if (!btf_type_is_array(mtype)) + goto error; + + array_elem = (struct btf_array *)(mtype + 1); + if (array_elem->nelems != 0) + goto error; + + moff = btf_member_bit_offset(t, member) / 8; + if (off < moff) + goto error; + + /* Only allow structure for now, can be relaxed for + * other types later. + */ + elem_type = btf_type_skip_modifiers(btf_vmlinux, + array_elem->type, NULL); + if (!btf_type_is_struct(elem_type)) + goto error; + + off = (off - moff) % elem_type->size; + return btf_struct_access(log, elem_type, off, size, atype, + next_btf_id); + +error: bpf_log(log, "access beyond struct %s at off %u size %u\n", tname, off, size); return -EACCES; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index cb305e71e7de..fdf7836750a3 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -557,8 +557,9 @@ found: * * Must be called with cgroup_mutex held. */ -int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, - struct bpf_prog *new_prog) +static int __cgroup_bpf_replace(struct cgroup *cgrp, + struct bpf_cgroup_link *link, + struct bpf_prog *new_prog) { struct list_head *progs = &cgrp->bpf.progs[link->type]; struct bpf_prog *old_prog; @@ -583,6 +584,30 @@ int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, return 0; } +static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + struct bpf_cgroup_link *cg_link; + int ret; + + cg_link = container_of(link, struct bpf_cgroup_link, link); + + mutex_lock(&cgroup_mutex); + /* link might have been auto-released by dying cgroup, so fail */ + if (!cg_link->cgroup) { + ret = -ENOLINK; + goto out_unlock; + } + if (old_prog && link->prog != old_prog) { + ret = -EPERM; + goto out_unlock; + } + ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog); +out_unlock: + mutex_unlock(&cgroup_mutex); + return ret; +} + static struct bpf_prog_list *find_detach_entry(struct list_head *progs, struct bpf_prog *prog, struct bpf_cgroup_link *link, @@ -808,17 +833,56 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link) kfree(cg_link); } -const struct bpf_link_ops bpf_cgroup_link_lops = { +static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + u64 cg_id = 0; + + mutex_lock(&cgroup_mutex); + if (cg_link->cgroup) + cg_id = cgroup_id(cg_link->cgroup); + mutex_unlock(&cgroup_mutex); + + seq_printf(seq, + "cgroup_id:\t%llu\n" + "attach_type:\t%d\n", + cg_id, + cg_link->type); +} + +static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + u64 cg_id = 0; + + mutex_lock(&cgroup_mutex); + if (cg_link->cgroup) + cg_id = cgroup_id(cg_link->cgroup); + mutex_unlock(&cgroup_mutex); + + info->cgroup.cgroup_id = cg_id; + info->cgroup.attach_type = cg_link->type; + return 0; +} + +static const struct bpf_link_ops bpf_cgroup_link_lops = { .release = bpf_cgroup_link_release, .dealloc = bpf_cgroup_link_dealloc, + .update_prog = cgroup_bpf_replace, + .show_fdinfo = bpf_cgroup_link_show_fdinfo, + .fill_link_info = bpf_cgroup_link_fill_link_info, }; int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + struct bpf_link_primer link_primer; struct bpf_cgroup_link *link; - struct file *link_file; struct cgroup *cgrp; - int err, link_fd; + int err; if (attr->link_create.flags) return -EINVAL; @@ -832,26 +896,25 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) err = -ENOMEM; goto out_put_cgroup; } - bpf_link_init(&link->link, &bpf_cgroup_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops, + prog); link->cgroup = cgrp; link->type = attr->link_create.attach_type; - link_file = bpf_link_new_file(&link->link, &link_fd); - if (IS_ERR(link_file)) { + err = bpf_link_prime(&link->link, &link_primer); + if (err) { kfree(link); - err = PTR_ERR(link_file); goto out_put_cgroup; } err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type, BPF_F_ALLOW_MULTI); if (err) { - bpf_link_cleanup(&link->link, link_file, link_fd); + bpf_link_cleanup(&link_primer); goto out_put_cgroup; } - fd_install(link_fd, link_file); - return link_fd; + return bpf_link_settle(&link_primer); out_put_cgroup: cgroup_put(cgrp); @@ -1054,36 +1117,21 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, return !allow; } -EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); static const struct bpf_func_proto * cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return &bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_trace_printk: - if (capable(CAP_SYS_ADMIN)) - return bpf_get_trace_printk_proto(); - /* fall through */ + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; default: - return NULL; + return bpf_base_func_proto(func_id); } } @@ -1137,16 +1185,13 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @head: sysctl table header * @table: sysctl table * @write: sysctl is being read (= 0) or written (= 1) - * @buf: pointer to buffer passed by user space + * @buf: pointer to buffer (in and out) * @pcount: value-result argument: value is size of buffer pointed to by @buf, * result is size of @new_buf if program set new value, initial value * otherwise * @ppos: value-result argument: value is position at which read from or write * to sysctl is happening, result is new position if program overrode it, * initial value otherwise - * @new_buf: pointer to pointer to new buffer that will be allocated if program - * overrides new value provided by user space on sysctl write - * NOTE: it's caller responsibility to free *new_buf if it was set * @type: type of program to be executed * * Program is run when sysctl is being accessed, either read or written, and @@ -1157,8 +1202,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { */ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - void __user *buf, size_t *pcount, - loff_t *ppos, void **new_buf, + void **buf, size_t *pcount, loff_t *ppos, enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { @@ -1173,36 +1217,28 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, .new_updated = 0, }; struct cgroup *cgrp; + loff_t pos = 0; int ret; ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); - if (ctx.cur_val) { - mm_segment_t old_fs; - loff_t pos = 0; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - if (table->proc_handler(table, 0, (void __user *)ctx.cur_val, - &ctx.cur_len, &pos)) { - /* Let BPF program decide how to proceed. */ - ctx.cur_len = 0; - } - set_fs(old_fs); - } else { + if (!ctx.cur_val || + table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) { /* Let BPF program decide how to proceed. */ ctx.cur_len = 0; } - if (write && buf && *pcount) { + if (write && *buf && *pcount) { /* BPF program should be able to override new value with a * buffer bigger than provided by user. */ ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount); - if (!ctx.new_val || - copy_from_user(ctx.new_val, buf, ctx.new_len)) + if (ctx.new_val) { + memcpy(ctx.new_val, *buf, ctx.new_len); + } else { /* Let BPF program decide how to proceed. */ ctx.new_len = 0; + } } rcu_read_lock(); @@ -1213,7 +1249,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.cur_val); if (ret == 1 && ctx.new_updated) { - *new_buf = ctx.new_val; + kfree(*buf); + *buf = ctx.new_val; *pcount = ctx.new_len; } else { kfree(ctx.new_val); @@ -1221,7 +1258,6 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, return ret == 1 ? 0 : -EPERM; } -EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); #ifdef CONFIG_NET static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, @@ -1326,7 +1362,6 @@ out: sockopt_free_buf(&ctx); return ret; } -EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt); int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, int optname, char __user *optval, @@ -1413,7 +1448,6 @@ out: sockopt_free_buf(&ctx); return ret; } -EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt); #endif static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index cf6fe9107f5c..9df4cc9a2907 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -646,7 +646,7 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) return; bpf_prog_ksym_set_addr(fp); @@ -1543,7 +1543,7 @@ select_insn: /* ARG1 at this point is guaranteed to point to CTX from * the verifier side due to the fact that the tail call is - * handeled like a helper, that is, bpf_tail_call_proto, + * handled like a helper, that is, bpf_tail_call_proto, * where arg1_type is ARG_PTR_TO_CTX. */ insn = prog->insnsi; @@ -2136,6 +2136,11 @@ BPF_CALL_0(bpf_user_rnd_u32) return res; } +BPF_CALL_0(bpf_get_raw_cpu_id) +{ + return raw_smp_processor_id(); +} + /* Weak definitions of helper functions in case we don't have bpf syscall. */ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; @@ -2151,6 +2156,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; +const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 3fe0b006d2d2..27595fc6da56 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -85,7 +85,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) u64 cost; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); /* check sanity of attributes */ @@ -162,25 +162,10 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, /* Part of headroom was reserved to xdpf */ hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; - /* build_skb need to place skb_shared_info after SKB end, and - * also want to know the memory "truesize". Thus, need to - * know the memory frame size backing xdp_buff. - * - * XDP was designed to have PAGE_SIZE frames, but this - * assumption is not longer true with ixgbe and i40e. It - * would be preferred to set frame_size to 2048 or 4096 - * depending on the driver. - * frame_size = 2048; - * frame_len = frame_size - sizeof(*xdp_frame); - * - * Instead, with info avail, skb_shared_info in placed after - * packet len. This, unfortunately fakes the truesize. - * Another disadvantage of this approach, the skb_shared_info - * is not at a fixed memory location, with mixed length - * packets, which is bad for cache-line hotness. + /* Memory size backing xdp_frame data already have reserved + * room for build_skb to place skb_shared_info in tailroom. */ - frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + frame_size = xdpf->frame_sz; pkt_data_start = xdpf->data - hard_start_headroom; skb = build_skb_around(skb, pkt_data_start, frame_size); @@ -636,7 +621,7 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, { struct xdp_frame *xdpf; - xdpf = convert_to_xdp_frame(xdp); + xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 58bdca5d978a..854b09beb16b 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -52,7 +52,6 @@ #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) -#define DEV_MAP_BULK_SIZE 16 struct xdp_dev_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; struct list_head flush_node; @@ -61,12 +60,23 @@ struct xdp_dev_bulk_queue { unsigned int count; }; +/* DEVMAP values */ +struct bpf_devmap_val { + u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + u32 id; /* prog id on map read */ + } bpf_prog; +}; + struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_dtab *dtab; + struct bpf_prog *xdp_prog; struct rcu_head rcu; unsigned int idx; + struct bpf_devmap_val val; }; struct bpf_dtab { @@ -106,12 +116,18 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { + u32 valsize = attr->value_size; u64 cost = 0; int err; - /* check sanity of attributes */ + /* check sanity of attributes. 2 value sizes supported: + * 4 bytes: ifindex + * 8 bytes: ifindex + prog fd + */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) + (valsize != offsetofend(struct bpf_devmap_val, ifindex) && + valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || + attr->map_flags & ~DEV_CREATE_FLAG_MASK) return -EINVAL; /* Lookup returns a pointer straight to dev->ifindex, so make sure the @@ -218,6 +234,8 @@ static void dev_map_free(struct bpf_map *map) hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -232,6 +250,8 @@ static void dev_map_free(struct bpf_map *map) if (!dev) continue; + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -318,6 +338,16 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } +bool dev_map_can_have_prog(struct bpf_map *map) +{ + if ((map->map_type == BPF_MAP_TYPE_DEVMAP || + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) && + map->value_size != offsetofend(struct bpf_devmap_val, ifindex)) + return true; + + return false; +} + static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; @@ -435,13 +465,40 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, if (unlikely(err)) return err; - xdpf = convert_to_xdp_frame(xdp); + xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; return bq_enqueue(dev, xdpf, dev_rx); } +static struct xdp_buff *dev_map_run_prog(struct net_device *dev, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + struct xdp_txq_info txq = { .dev = dev }; + u32 act; + + xdp->txq = &txq; + + act = bpf_prog_run_xdp(xdp_prog, xdp); + switch (act) { + case XDP_PASS: + return xdp; + case XDP_DROP: + break; + default: + bpf_warn_invalid_xdp_action(act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(dev, xdp_prog, act); + break; + } + + xdp_return_buff(xdp); + return NULL; +} + int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, struct net_device *dev_rx) { @@ -453,6 +510,11 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, { struct net_device *dev = dst->dev; + if (dst->xdp_prog) { + xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog); + if (!xdp) + return 0; + } return __xdp_enqueue(dev, xdp, dev_rx); } @@ -473,18 +535,15 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); - struct net_device *dev = obj ? obj->dev : NULL; - return dev ? &dev->ifindex : NULL; + return obj ? &obj->val : NULL; } static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map, *(u32 *)key); - struct net_device *dev = obj ? obj->dev : NULL; - - return dev ? &dev->ifindex : NULL; + return obj ? &obj->val : NULL; } static void __dev_map_entry_free(struct rcu_head *rcu) @@ -492,6 +551,8 @@ static void __dev_map_entry_free(struct rcu_head *rcu) struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -542,9 +603,10 @@ static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab *dtab, - u32 ifindex, + struct bpf_devmap_val *val, unsigned int idx) { + struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, @@ -552,24 +614,46 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, if (!dev) return ERR_PTR(-ENOMEM); - dev->dev = dev_get_by_index(net, ifindex); - if (!dev->dev) { - kfree(dev); - return ERR_PTR(-EINVAL); + dev->dev = dev_get_by_index(net, val->ifindex); + if (!dev->dev) + goto err_out; + + if (val->bpf_prog.fd >= 0) { + prog = bpf_prog_get_type_dev(val->bpf_prog.fd, + BPF_PROG_TYPE_XDP, false); + if (IS_ERR(prog)) + goto err_put_dev; + if (prog->expected_attach_type != BPF_XDP_DEVMAP) + goto err_put_prog; } dev->idx = idx; dev->dtab = dtab; + if (prog) { + dev->xdp_prog = prog; + dev->val.bpf_prog.id = prog->aux->id; + } else { + dev->xdp_prog = NULL; + dev->val.bpf_prog.id = 0; + } + dev->val.ifindex = val->ifindex; return dev; +err_put_prog: + bpf_prog_put(prog); +err_put_dev: + dev_put(dev->dev); +err_out: + kfree(dev); + return ERR_PTR(-EINVAL); } static int __dev_map_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; - u32 ifindex = *(u32 *)value; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) @@ -579,10 +663,16 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; - if (!ifindex) { + /* already verified value_size <= sizeof val */ + memcpy(&val, value, map->value_size); + + if (!val.ifindex) { dev = NULL; + /* can not specify fd if ifindex is 0 */ + if (val.bpf_prog.fd != -1) + return -EINVAL; } else { - dev = __dev_map_alloc_node(net, dtab, ifindex, i); + dev = __dev_map_alloc_node(net, dtab, &val, i); if (IS_ERR(dev)) return PTR_ERR(dev); } @@ -609,13 +699,16 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; - u32 ifindex = *(u32 *)value; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; - if (unlikely(map_flags > BPF_EXIST || !ifindex)) + /* already verified value_size <= sizeof val */ + memcpy(&val, value, map->value_size); + + if (unlikely(map_flags > BPF_EXIST || !val.ifindex)) return -EINVAL; spin_lock_irqsave(&dtab->index_lock, flags); @@ -624,7 +717,7 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, if (old_dev && (map_flags & BPF_NOEXIST)) goto out_err; - dev = __dev_map_alloc_node(net, dtab, ifindex, idx); + dev = __dev_map_alloc_node(net, dtab, &val, idx); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out_err; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d541c8486c95..b4b288a3c3c9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -359,9 +359,9 @@ static int htab_map_alloc_check(union bpf_attr *attr) BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != offsetof(struct htab_elem, hash_node.pprev)); - if (lru && !capable(CAP_SYS_ADMIN)) + if (lru && !bpf_capable()) /* LRU implementation is much complicated than other - * maps. Hence, limit to CAP_SYS_ADMIN for now. + * maps. Hence, limit to CAP_BPF. */ return -EPERM; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bafc53ddd350..be43ab3e619f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -151,7 +151,19 @@ BPF_CALL_0(bpf_ktime_get_ns) const struct bpf_func_proto bpf_ktime_get_ns_proto = { .func = bpf_ktime_get_ns, - .gpl_only = true, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +BPF_CALL_0(bpf_ktime_get_boot_ns) +{ + /* NMI safe access to clock boottime */ + return ktime_get_boot_fast_ns(); +} + +const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = { + .func = bpf_ktime_get_boot_ns, + .gpl_only = false, .ret_type = RET_INTEGER, }; @@ -562,3 +574,114 @@ const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = { .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; + +static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { + .func = bpf_get_raw_cpu_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, + u64, flags, void *, data, u64, size) +{ + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return bpf_event_output(map, flags, data, size, NULL, 0, NULL); +} + +const struct bpf_func_proto bpf_event_output_data_proto = { + .func = bpf_event_output_data, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +const struct bpf_func_proto bpf_get_current_task_proto __weak; +const struct bpf_func_proto bpf_probe_read_user_proto __weak; +const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; +const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; +const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; + +const struct bpf_func_proto * +bpf_base_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; + case BPF_FUNC_get_prandom_u32: + return &bpf_get_prandom_u32_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_raw_smp_processor_id_proto; + case BPF_FUNC_get_numa_node_id: + return &bpf_get_numa_node_id_proto; + case BPF_FUNC_tail_call: + return &bpf_tail_call_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_ringbuf_output: + return &bpf_ringbuf_output_proto; + case BPF_FUNC_ringbuf_reserve: + return &bpf_ringbuf_reserve_proto; + case BPF_FUNC_ringbuf_submit: + return &bpf_ringbuf_submit_proto; + case BPF_FUNC_ringbuf_discard: + return &bpf_ringbuf_discard_proto; + case BPF_FUNC_ringbuf_query: + return &bpf_ringbuf_query_proto; + default: + break; + } + + if (!bpf_capable()) + return NULL; + + switch (func_id) { + case BPF_FUNC_spin_lock: + return &bpf_spin_lock_proto; + case BPF_FUNC_spin_unlock: + return &bpf_spin_unlock_proto; + case BPF_FUNC_trace_printk: + if (!perfmon_capable()) + return NULL; + return bpf_get_trace_printk_proto(); + case BPF_FUNC_jiffies64: + return &bpf_jiffies64_proto; + default: + break; + } + + if (!perfmon_capable()) + return NULL; + + switch (func_id) { + case BPF_FUNC_get_current_task: + return &bpf_get_current_task_proto; + case BPF_FUNC_probe_read_user: + return &bpf_probe_read_user_proto; + case BPF_FUNC_probe_read_kernel: + return &bpf_probe_read_kernel_proto; + case BPF_FUNC_probe_read_user_str: + return &bpf_probe_read_user_str_proto; + case BPF_FUNC_probe_read_kernel_str: + return &bpf_probe_read_kernel_str_proto; + default: + return NULL; + } +} diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 95087d9f4ed3..fb878ba3f22f 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -358,8 +358,11 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg) { + struct bpf_link *link = arg; + return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops, - &bpffs_obj_fops); + bpf_link_is_iter(link) ? + &bpf_iter_fops : &bpffs_obj_fops); } static struct dentry * diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 65c236cf341e..c8cc4e4cf98d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -543,7 +543,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) u64 cost = sizeof(*trie), cost_per_node; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); /* check sanity of attributes */ diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index b3c48d1533cb..17738c93bec8 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -60,7 +60,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; if (inner_map->ops == &array_map_ops) { - inner_map_meta->unpriv_array = inner_map->unpriv_array; + inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1; container_of(inner_map_meta, struct bpf_array, map)->index_mask = container_of(inner_map, struct bpf_array, map)->index_mask; } diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c new file mode 100644 index 000000000000..c69071e334bf --- /dev/null +++ b/kernel/bpf/map_iter.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ +#include <linux/bpf.h> +#include <linux/fs.h> +#include <linux/filter.h> +#include <linux/kernel.h> + +struct bpf_iter_seq_map_info { + u32 mid; +}; + +static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_map_info *info = seq->private; + struct bpf_map *map; + + map = bpf_map_get_curr_or_next(&info->mid); + if (!map) + return NULL; + + ++*pos; + return map; +} + +static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_map_info *info = seq->private; + struct bpf_map *map; + + ++*pos; + ++info->mid; + bpf_map_put((struct bpf_map *)v); + map = bpf_map_get_curr_or_next(&info->mid); + if (!map) + return NULL; + + return map; +} + +struct bpf_iter__bpf_map { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_map *, map); +}; + +DEFINE_BPF_ITER_FUNC(bpf_map, struct bpf_iter_meta *meta, struct bpf_map *map) + +static int __bpf_map_seq_show(struct seq_file *seq, void *v, bool in_stop) +{ + struct bpf_iter__bpf_map ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + ctx.meta = &meta; + ctx.map = v; + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + return ret; +} + +static int bpf_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_map_seq_show(seq, v, false); +} + +static void bpf_map_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__bpf_map_seq_show(seq, v, true); + else + bpf_map_put((struct bpf_map *)v); +} + +static const struct seq_operations bpf_map_seq_ops = { + .start = bpf_map_seq_start, + .next = bpf_map_seq_next, + .stop = bpf_map_seq_stop, + .show = bpf_map_seq_show, +}; + +static const struct bpf_iter_reg bpf_map_reg_info = { + .target = "bpf_map", + .seq_ops = &bpf_map_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_map, map), + PTR_TO_BTF_ID_OR_NULL }, + }, +}; + +static int __init bpf_map_iter_init(void) +{ + return bpf_iter_reg_target(&bpf_map_reg_info); +} + +late_initcall(bpf_map_iter_init); diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c new file mode 100644 index 000000000000..78cf061f8179 --- /dev/null +++ b/kernel/bpf/net_namespace.c @@ -0,0 +1,373 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> +#include <linux/filter.h> +#include <net/net_namespace.h> + +/* + * Functions to manage BPF programs attached to netns + */ + +struct bpf_netns_link { + struct bpf_link link; + enum bpf_attach_type type; + enum netns_bpf_attach_type netns_type; + + /* We don't hold a ref to net in order to auto-detach the link + * when netns is going away. Instead we rely on pernet + * pre_exit callback to clear this pointer. Must be accessed + * with netns_bpf_mutex held. + */ + struct net *net; +}; + +/* Protects updates to netns_bpf */ +DEFINE_MUTEX(netns_bpf_mutex); + +/* Must be called with netns_bpf_mutex held. */ +static void __net_exit bpf_netns_link_auto_detach(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + + net_link->net = NULL; +} + +static void bpf_netns_link_release(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + enum netns_bpf_attach_type type = net_link->netns_type; + struct net *net; + + /* Link auto-detached by dying netns. */ + if (!net_link->net) + return; + + mutex_lock(&netns_bpf_mutex); + + /* Recheck after potential sleep. We can race with cleanup_net + * here, but if we see a non-NULL struct net pointer pre_exit + * has not happened yet and will block on netns_bpf_mutex. + */ + net = net_link->net; + if (!net) + goto out_unlock; + + net->bpf.links[type] = NULL; + RCU_INIT_POINTER(net->bpf.progs[type], NULL); + +out_unlock: + mutex_unlock(&netns_bpf_mutex); +} + +static void bpf_netns_link_dealloc(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + + kfree(net_link); +} + +static int bpf_netns_link_update_prog(struct bpf_link *link, + struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + enum netns_bpf_attach_type type = net_link->netns_type; + struct net *net; + int ret = 0; + + if (old_prog && old_prog != link->prog) + return -EPERM; + if (new_prog->type != link->prog->type) + return -EINVAL; + + mutex_lock(&netns_bpf_mutex); + + net = net_link->net; + if (!net || !check_net(net)) { + /* Link auto-detached or netns dying */ + ret = -ENOLINK; + goto out_unlock; + } + + old_prog = xchg(&link->prog, new_prog); + rcu_assign_pointer(net->bpf.progs[type], new_prog); + bpf_prog_put(old_prog); + +out_unlock: + mutex_unlock(&netns_bpf_mutex); + return ret; +} + +static int bpf_netns_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + unsigned int inum = 0; + struct net *net; + + mutex_lock(&netns_bpf_mutex); + net = net_link->net; + if (net && check_net(net)) + inum = net->ns.inum; + mutex_unlock(&netns_bpf_mutex); + + info->netns.netns_ino = inum; + info->netns.attach_type = net_link->type; + return 0; +} + +static void bpf_netns_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_link_info info = {}; + + bpf_netns_link_fill_info(link, &info); + seq_printf(seq, + "netns_ino:\t%u\n" + "attach_type:\t%u\n", + info.netns.netns_ino, + info.netns.attach_type); +} + +static const struct bpf_link_ops bpf_netns_link_ops = { + .release = bpf_netns_link_release, + .dealloc = bpf_netns_link_dealloc, + .update_prog = bpf_netns_link_update_prog, + .fill_link_info = bpf_netns_link_fill_info, + .show_fdinfo = bpf_netns_link_show_fdinfo, +}; + +int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + u32 prog_id, prog_cnt = 0, flags = 0; + enum netns_bpf_attach_type type; + struct bpf_prog *attached; + struct net *net; + + if (attr->query.query_flags) + return -EINVAL; + + type = to_netns_bpf_attach_type(attr->query.attach_type); + if (type < 0) + return -EINVAL; + + net = get_net_ns_by_fd(attr->query.target_fd); + if (IS_ERR(net)) + return PTR_ERR(net); + + rcu_read_lock(); + attached = rcu_dereference(net->bpf.progs[type]); + if (attached) { + prog_cnt = 1; + prog_id = attached->aux->id; + } + rcu_read_unlock(); + + put_net(net); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) + return -EFAULT; + + if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) + return 0; + + if (copy_to_user(prog_ids, &prog_id, sizeof(u32))) + return -EFAULT; + + return 0; +} + +int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + enum netns_bpf_attach_type type; + struct net *net; + int ret; + + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + net = current->nsproxy->net_ns; + mutex_lock(&netns_bpf_mutex); + + /* Attaching prog directly is not compatible with links */ + if (net->bpf.links[type]) { + ret = -EEXIST; + goto out_unlock; + } + + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + ret = flow_dissector_bpf_prog_attach(net, prog); + break; + default: + ret = -EINVAL; + break; + } +out_unlock: + mutex_unlock(&netns_bpf_mutex); + + return ret; +} + +/* Must be called with netns_bpf_mutex held. */ +static int __netns_bpf_prog_detach(struct net *net, + enum netns_bpf_attach_type type) +{ + struct bpf_prog *attached; + + /* Progs attached via links cannot be detached */ + if (net->bpf.links[type]) + return -EINVAL; + + attached = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (!attached) + return -ENOENT; + RCU_INIT_POINTER(net->bpf.progs[type], NULL); + bpf_prog_put(attached); + return 0; +} + +int netns_bpf_prog_detach(const union bpf_attr *attr) +{ + enum netns_bpf_attach_type type; + int ret; + + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + mutex_lock(&netns_bpf_mutex); + ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type); + mutex_unlock(&netns_bpf_mutex); + + return ret; +} + +static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, + enum netns_bpf_attach_type type) +{ + struct bpf_prog *prog; + int err; + + mutex_lock(&netns_bpf_mutex); + + /* Allow attaching only one prog or link for now */ + if (net->bpf.links[type]) { + err = -E2BIG; + goto out_unlock; + } + /* Links are not compatible with attaching prog directly */ + prog = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (prog) { + err = -EEXIST; + goto out_unlock; + } + + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + err = flow_dissector_bpf_prog_attach(net, link->prog); + break; + default: + err = -EINVAL; + break; + } + if (err) + goto out_unlock; + + net->bpf.links[type] = link; + +out_unlock: + mutex_unlock(&netns_bpf_mutex); + return err; +} + +int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ + enum netns_bpf_attach_type netns_type; + struct bpf_link_primer link_primer; + struct bpf_netns_link *net_link; + enum bpf_attach_type type; + struct net *net; + int err; + + if (attr->link_create.flags) + return -EINVAL; + + type = attr->link_create.attach_type; + netns_type = to_netns_bpf_attach_type(type); + if (netns_type < 0) + return -EINVAL; + + net = get_net_ns_by_fd(attr->link_create.target_fd); + if (IS_ERR(net)) + return PTR_ERR(net); + + net_link = kzalloc(sizeof(*net_link), GFP_USER); + if (!net_link) { + err = -ENOMEM; + goto out_put_net; + } + bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS, + &bpf_netns_link_ops, prog); + net_link->net = net; + net_link->type = type; + net_link->netns_type = netns_type; + + err = bpf_link_prime(&net_link->link, &link_primer); + if (err) { + kfree(net_link); + goto out_put_net; + } + + err = netns_bpf_link_attach(net, &net_link->link, netns_type); + if (err) { + bpf_link_cleanup(&link_primer); + goto out_put_net; + } + + put_net(net); + return bpf_link_settle(&link_primer); + +out_put_net: + put_net(net); + return err; +} + +static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) +{ + enum netns_bpf_attach_type type; + struct bpf_link *link; + + mutex_lock(&netns_bpf_mutex); + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { + link = net->bpf.links[type]; + if (link) + bpf_netns_link_auto_detach(link); + else + __netns_bpf_prog_detach(net, type); + } + mutex_unlock(&netns_bpf_mutex); +} + +static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { + .pre_exit = netns_bpf_pernet_pre_exit, +}; + +static int __init netns_bpf_init(void) +{ + return register_pernet_subsys(&netns_bpf_pernet_ops); +} + +subsys_initcall(netns_bpf_init); diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index f697647ceb54..05c8e043b9d2 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -19,7 +19,7 @@ struct bpf_queue_stack { u32 head, tail; u32 size; /* max_entries + 1 */ - char elements[0] __aligned(8); + char elements[] __aligned(8); }; static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map) @@ -45,7 +45,7 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; /* check sanity of attributes */ diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 01badd3eda7a..21cde24386db 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -154,7 +154,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) struct bpf_map_memory mem; u64 array_size; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); array_size = sizeof(*array); diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c new file mode 100644 index 000000000000..180414bb0d3e --- /dev/null +++ b/kernel/bpf/ringbuf.c @@ -0,0 +1,501 @@ +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/err.h> +#include <linux/irq_work.h> +#include <linux/slab.h> +#include <linux/filter.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <linux/wait.h> +#include <linux/poll.h> +#include <uapi/linux/btf.h> + +#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) + +/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */ +#define RINGBUF_PGOFF \ + (offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT) +/* consumer page and producer page */ +#define RINGBUF_POS_PAGES 2 + +#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4) + +/* Maximum size of ring buffer area is limited by 32-bit page offset within + * record header, counted in pages. Reserve 8 bits for extensibility, and take + * into account few extra pages for consumer/producer pages and + * non-mmap()'able parts. This gives 64GB limit, which seems plenty for single + * ring buffer. + */ +#define RINGBUF_MAX_DATA_SZ \ + (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE) + +struct bpf_ringbuf { + wait_queue_head_t waitq; + struct irq_work work; + u64 mask; + struct page **pages; + int nr_pages; + spinlock_t spinlock ____cacheline_aligned_in_smp; + /* Consumer and producer counters are put into separate pages to allow + * mapping consumer page as r/w, but restrict producer page to r/o. + * This protects producer position from being modified by user-space + * application and ruining in-kernel position tracking. + */ + unsigned long consumer_pos __aligned(PAGE_SIZE); + unsigned long producer_pos __aligned(PAGE_SIZE); + char data[] __aligned(PAGE_SIZE); +}; + +struct bpf_ringbuf_map { + struct bpf_map map; + struct bpf_map_memory memory; + struct bpf_ringbuf *rb; +}; + +/* 8-byte ring buffer record header structure */ +struct bpf_ringbuf_hdr { + u32 len; + u32 pg_off; +}; + +static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) +{ + const gfp_t flags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | + __GFP_ZERO; + int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES; + int nr_data_pages = data_sz >> PAGE_SHIFT; + int nr_pages = nr_meta_pages + nr_data_pages; + struct page **pages, *page; + struct bpf_ringbuf *rb; + size_t array_size; + int i; + + /* Each data page is mapped twice to allow "virtual" + * continuous read of samples wrapping around the end of ring + * buffer area: + * ------------------------------------------------------ + * | meta pages | real data pages | same data pages | + * ------------------------------------------------------ + * | | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 | + * ------------------------------------------------------ + * | | TA DA | TA DA | + * ------------------------------------------------------ + * ^^^^^^^ + * | + * Here, no need to worry about special handling of wrapped-around + * data due to double-mapped data pages. This works both in kernel and + * when mmap()'ed in user-space, simplifying both kernel and + * user-space implementations significantly. + */ + array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages); + if (array_size > PAGE_SIZE) + pages = vmalloc_node(array_size, numa_node); + else + pages = kmalloc_node(array_size, flags, numa_node); + if (!pages) + return NULL; + + for (i = 0; i < nr_pages; i++) { + page = alloc_pages_node(numa_node, flags, 0); + if (!page) { + nr_pages = i; + goto err_free_pages; + } + pages[i] = page; + if (i >= nr_meta_pages) + pages[nr_data_pages + i] = page; + } + + rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, + VM_ALLOC | VM_USERMAP, PAGE_KERNEL); + if (rb) { + rb->pages = pages; + rb->nr_pages = nr_pages; + return rb; + } + +err_free_pages: + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kvfree(pages); + return NULL; +} + +static void bpf_ringbuf_notify(struct irq_work *work) +{ + struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work); + + wake_up_all(&rb->waitq); +} + +static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) +{ + struct bpf_ringbuf *rb; + + if (!data_sz || !PAGE_ALIGNED(data_sz)) + return ERR_PTR(-EINVAL); + +#ifdef CONFIG_64BIT + /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */ + if (data_sz > RINGBUF_MAX_DATA_SZ) + return ERR_PTR(-E2BIG); +#endif + + rb = bpf_ringbuf_area_alloc(data_sz, numa_node); + if (!rb) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&rb->spinlock); + init_waitqueue_head(&rb->waitq); + init_irq_work(&rb->work, bpf_ringbuf_notify); + + rb->mask = data_sz - 1; + rb->consumer_pos = 0; + rb->producer_pos = 0; + + return rb; +} + +static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) +{ + struct bpf_ringbuf_map *rb_map; + u64 cost; + int err; + + if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + + if (attr->key_size || attr->value_size || + attr->max_entries == 0 || !PAGE_ALIGNED(attr->max_entries)) + return ERR_PTR(-EINVAL); + + rb_map = kzalloc(sizeof(*rb_map), GFP_USER); + if (!rb_map) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&rb_map->map, attr); + + cost = sizeof(struct bpf_ringbuf_map) + + sizeof(struct bpf_ringbuf) + + attr->max_entries; + err = bpf_map_charge_init(&rb_map->map.memory, cost); + if (err) + goto err_free_map; + + rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); + if (IS_ERR(rb_map->rb)) { + err = PTR_ERR(rb_map->rb); + goto err_uncharge; + } + + return &rb_map->map; + +err_uncharge: + bpf_map_charge_finish(&rb_map->map.memory); +err_free_map: + kfree(rb_map); + return ERR_PTR(err); +} + +static void bpf_ringbuf_free(struct bpf_ringbuf *rb) +{ + /* copy pages pointer and nr_pages to local variable, as we are going + * to unmap rb itself with vunmap() below + */ + struct page **pages = rb->pages; + int i, nr_pages = rb->nr_pages; + + vunmap(rb); + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kvfree(pages); +} + +static void ringbuf_map_free(struct bpf_map *map) +{ + struct bpf_ringbuf_map *rb_map; + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete + */ + synchronize_rcu(); + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + bpf_ringbuf_free(rb_map->rb); + kfree(rb_map); +} + +static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key) +{ + return ERR_PTR(-ENOTSUPP); +} + +static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 flags) +{ + return -ENOTSUPP; +} + +static int ringbuf_map_delete_elem(struct bpf_map *map, void *key) +{ + return -ENOTSUPP; +} + +static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -ENOTSUPP; +} + +static size_t bpf_ringbuf_mmap_page_cnt(const struct bpf_ringbuf *rb) +{ + size_t data_pages = (rb->mask + 1) >> PAGE_SHIFT; + + /* consumer page + producer page + 2 x data pages */ + return RINGBUF_POS_PAGES + 2 * data_pages; +} + +static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_ringbuf_map *rb_map; + size_t mmap_sz; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + mmap_sz = bpf_ringbuf_mmap_page_cnt(rb_map->rb) << PAGE_SHIFT; + + if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > mmap_sz) + return -EINVAL; + + return remap_vmalloc_range(vma, rb_map->rb, + vma->vm_pgoff + RINGBUF_PGOFF); +} + +static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) +{ + unsigned long cons_pos, prod_pos; + + cons_pos = smp_load_acquire(&rb->consumer_pos); + prod_pos = smp_load_acquire(&rb->producer_pos); + return prod_pos - cons_pos; +} + +static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + poll_wait(filp, &rb_map->rb->waitq, pts); + + if (ringbuf_avail_data_sz(rb_map->rb)) + return EPOLLIN | EPOLLRDNORM; + return 0; +} + +const struct bpf_map_ops ringbuf_map_ops = { + .map_alloc = ringbuf_map_alloc, + .map_free = ringbuf_map_free, + .map_mmap = ringbuf_map_mmap, + .map_poll = ringbuf_map_poll, + .map_lookup_elem = ringbuf_map_lookup_elem, + .map_update_elem = ringbuf_map_update_elem, + .map_delete_elem = ringbuf_map_delete_elem, + .map_get_next_key = ringbuf_map_get_next_key, +}; + +/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, + * calculate offset from record metadata to ring buffer in pages, rounded + * down. This page offset is stored as part of record metadata and allows to + * restore struct bpf_ringbuf * from record pointer. This page offset is + * stored at offset 4 of record metadata header. + */ +static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb, + struct bpf_ringbuf_hdr *hdr) +{ + return ((void *)hdr - (void *)rb) >> PAGE_SHIFT; +} + +/* Given pointer to ring buffer record header, restore pointer to struct + * bpf_ringbuf itself by using page offset stored at offset 4 + */ +static struct bpf_ringbuf * +bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) +{ + unsigned long addr = (unsigned long)(void *)hdr; + unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT; + + return (void*)((addr & PAGE_MASK) - off); +} + +static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) +{ + unsigned long cons_pos, prod_pos, new_prod_pos, flags; + u32 len, pg_off; + struct bpf_ringbuf_hdr *hdr; + + if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) + return NULL; + + len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); + cons_pos = smp_load_acquire(&rb->consumer_pos); + + if (in_nmi()) { + if (!spin_trylock_irqsave(&rb->spinlock, flags)) + return NULL; + } else { + spin_lock_irqsave(&rb->spinlock, flags); + } + + prod_pos = rb->producer_pos; + new_prod_pos = prod_pos + len; + + /* check for out of ringbuf space by ensuring producer position + * doesn't advance more than (ringbuf_size - 1) ahead + */ + if (new_prod_pos - cons_pos > rb->mask) { + spin_unlock_irqrestore(&rb->spinlock, flags); + return NULL; + } + + hdr = (void *)rb->data + (prod_pos & rb->mask); + pg_off = bpf_ringbuf_rec_pg_off(rb, hdr); + hdr->len = size | BPF_RINGBUF_BUSY_BIT; + hdr->pg_off = pg_off; + + /* pairs with consumer's smp_load_acquire() */ + smp_store_release(&rb->producer_pos, new_prod_pos); + + spin_unlock_irqrestore(&rb->spinlock, flags); + + return (void *)hdr + BPF_RINGBUF_HDR_SZ; +} + +BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags) +{ + struct bpf_ringbuf_map *rb_map; + + if (unlikely(flags)) + return 0; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size); +} + +const struct bpf_func_proto bpf_ringbuf_reserve_proto = { + .func = bpf_ringbuf_reserve, + .ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard) +{ + unsigned long rec_pos, cons_pos; + struct bpf_ringbuf_hdr *hdr; + struct bpf_ringbuf *rb; + u32 new_len; + + hdr = sample - BPF_RINGBUF_HDR_SZ; + rb = bpf_ringbuf_restore_from_rec(hdr); + new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT; + if (discard) + new_len |= BPF_RINGBUF_DISCARD_BIT; + + /* update record header with correct final size prefix */ + xchg(&hdr->len, new_len); + + /* if consumer caught up and is waiting for our record, notify about + * new data availability + */ + rec_pos = (void *)hdr - (void *)rb->data; + cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask; + + if (flags & BPF_RB_FORCE_WAKEUP) + irq_work_queue(&rb->work); + else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP)) + irq_work_queue(&rb->work); +} + +BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags) +{ + bpf_ringbuf_commit(sample, flags, false /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_submit_proto = { + .func = bpf_ringbuf_submit, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags) +{ + bpf_ringbuf_commit(sample, flags, true /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_discard_proto = { + .func = bpf_ringbuf_discard, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size, + u64, flags) +{ + struct bpf_ringbuf_map *rb_map; + void *rec; + + if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP))) + return -EINVAL; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + rec = __bpf_ringbuf_reserve(rb_map->rb, size); + if (!rec) + return -EAGAIN; + + memcpy(rec, data, size); + bpf_ringbuf_commit(rec, flags, false /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_output_proto = { + .func = bpf_ringbuf_output, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) +{ + struct bpf_ringbuf *rb; + + rb = container_of(map, struct bpf_ringbuf_map, map)->rb; + + switch (flags) { + case BPF_RB_AVAIL_DATA: + return ringbuf_avail_data_sz(rb); + case BPF_RB_RING_SIZE: + return rb->mask + 1; + case BPF_RB_CONS_POS: + return smp_load_acquire(&rb->consumer_pos); + case BPF_RB_PROD_POS: + return smp_load_acquire(&rb->producer_pos); + default: + return 0; + } +} + +const struct bpf_func_proto bpf_ringbuf_query_proto = { + .func = bpf_ringbuf_query, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index db76339fe358..7b8381ce40a0 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -93,7 +93,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) u64 cost, n_buckets; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 42c7a42fc9c8..4d530b1d5683 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -27,6 +27,8 @@ #include <uapi/linux/btf.h> #include <asm/pgtable.h> #include <linux/bpf_lsm.h> +#include <linux/poll.h> +#include <linux/bpf-netns.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -43,6 +45,8 @@ static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); static DEFINE_IDR(map_idr); static DEFINE_SPINLOCK(map_idr_lock); +static DEFINE_IDR(link_idr); +static DEFINE_SPINLOCK(link_idr_lock); int sysctl_unprivileged_bpf_disabled __read_mostly; @@ -50,9 +54,11 @@ static const struct bpf_map_ops * const bpf_map_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) #define BPF_MAP_TYPE(_id, _ops) \ [_id] = &_ops, +#define BPF_LINK_TYPE(_id, _name) #include <linux/bpf_types.h> #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; /* @@ -661,6 +667,16 @@ out: return err; } +static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts) +{ + struct bpf_map *map = filp->private_data; + + if (map->ops->map_poll) + return map->ops->map_poll(map, filp, pts); + + return EPOLLERR; +} + const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, @@ -669,6 +685,7 @@ const struct file_operations bpf_map_fops = { .read = bpf_dummy_read, .write = bpf_dummy_write, .mmap = bpf_map_mmap, + .poll = bpf_map_poll, }; int bpf_map_new_fd(struct bpf_map *map, int flags) @@ -1386,7 +1403,7 @@ int generic_map_lookup_batch(struct bpf_map *map, buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); if (!buf) { - kvfree(buf_prevkey); + kfree(buf_prevkey); return -ENOMEM; } @@ -1471,7 +1488,8 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) || + !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -1546,7 +1564,7 @@ static int map_freeze(const union bpf_attr *attr) err = -EBUSY; goto err_put; } - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { err = -EPERM; goto err_put; } @@ -1562,9 +1580,11 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _prog_ops, #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) #include <linux/bpf_types.h> #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) @@ -1986,6 +2006,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2019,6 +2043,55 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, } } +static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_LIRC_MODE2: + case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_EXT: /* extends any prog */ + return true; + case BPF_PROG_TYPE_CGROUP_SKB: + /* always unpriv */ + case BPF_PROG_TYPE_SK_REUSEPORT: + /* equivalent to SOCKET_FILTER. need CAP_BPF only */ + default: + return false; + } +} + +static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ + case BPF_PROG_TYPE_EXT: /* extends any prog */ + return true; + default: + return false; + } +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd @@ -2041,7 +2114,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) return -EPERM; /* copy eBPF program license from user space */ @@ -2054,11 +2127,16 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) is_gpl = license_is_gpl_compatible(license); if (attr->insn_cnt == 0 || - attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) + attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) return -E2BIG; if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) + return -EPERM; + + if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN)) + return -EPERM; + if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; bpf_prog_load_fixup_attach_type(attr); @@ -2197,25 +2275,39 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, - struct bpf_prog *prog) +void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog) { atomic64_set(&link->refcnt, 1); + link->type = type; + link->id = 0; link->ops = ops; link->prog = prog; } +static void bpf_link_free_id(int id) +{ + if (!id) + return; + + spin_lock_bh(&link_idr_lock); + idr_remove(&link_idr, id); + spin_unlock_bh(&link_idr_lock); +} + /* Clean up bpf_link and corresponding anon_inode file and FD. After * anon_inode is created, bpf_link can't be just kfree()'d due to deferred - * anon_inode's release() call. This helper manages marking bpf_link as - * defunct, releases anon_inode file and puts reserved FD. + * anon_inode's release() call. This helper marksbpf_link as + * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt + * is not decremented, it's the responsibility of a calling code that failed + * to complete bpf_link initialization. */ -void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, - int link_fd) +void bpf_link_cleanup(struct bpf_link_primer *primer) { - link->prog = NULL; - fput(link_file); - put_unused_fd(link_fd); + primer->link->prog = NULL; + bpf_link_free_id(primer->id); + fput(primer->file); + put_unused_fd(primer->fd); } void bpf_link_inc(struct bpf_link *link) @@ -2226,6 +2318,7 @@ void bpf_link_inc(struct bpf_link *link) /* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) { + bpf_link_free_id(link->id); if (link->prog) { /* detach BPF program, clean up used resources */ link->ops->release(link); @@ -2267,35 +2360,35 @@ static int bpf_link_release(struct inode *inode, struct file *filp) } #ifdef CONFIG_PROC_FS -static const struct bpf_link_ops bpf_raw_tp_lops; -static const struct bpf_link_ops bpf_tracing_link_lops; +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) +#define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) [_id] = #_name, +static const char *bpf_link_type_strs[] = { + [BPF_LINK_TYPE_UNSPEC] = "<invalid>", +#include <linux/bpf_types.h> +}; +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_link *link = filp->private_data; const struct bpf_prog *prog = link->prog; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; - const char *link_type; - - if (link->ops == &bpf_raw_tp_lops) - link_type = "raw_tracepoint"; - else if (link->ops == &bpf_tracing_link_lops) - link_type = "tracing"; -#ifdef CONFIG_CGROUP_BPF - else if (link->ops == &bpf_cgroup_link_lops) - link_type = "cgroup"; -#endif - else - link_type = "unknown"; bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "link_type:\t%s\n" + "link_id:\t%u\n" "prog_tag:\t%s\n" "prog_id:\t%u\n", - link_type, + bpf_link_type_strs[link->type], + link->id, prog_tag, prog->aux->id); + if (link->ops->show_fdinfo) + link->ops->show_fdinfo(link, m); } #endif @@ -2308,36 +2401,77 @@ static const struct file_operations bpf_link_fops = { .write = bpf_dummy_write, }; -int bpf_link_new_fd(struct bpf_link *link) +static int bpf_link_alloc_id(struct bpf_link *link) { - return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); -} + int id; -/* Similar to bpf_link_new_fd, create anon_inode for given bpf_link, but - * instead of immediately installing fd in fdtable, just reserve it and - * return. Caller then need to either install it with fd_install(fd, file) or - * release with put_unused_fd(fd). - * This is useful for cases when bpf_link attachment/detachment are - * complicated and expensive operations and should be delayed until all the fd - * reservation and anon_inode creation succeeds. + idr_preload(GFP_KERNEL); + spin_lock_bh(&link_idr_lock); + id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC); + spin_unlock_bh(&link_idr_lock); + idr_preload_end(); + + return id; +} + +/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file, + * reserving unused FD and allocating ID from link_idr. This is to be paired + * with bpf_link_settle() to install FD and ID and expose bpf_link to + * user-space, if bpf_link is successfully attached. If not, bpf_link and + * pre-allocated resources are to be freed with bpf_cleanup() call. All the + * transient state is passed around in struct bpf_link_primer. + * This is preferred way to create and initialize bpf_link, especially when + * there are complicated and expensive operations inbetween creating bpf_link + * itself and attaching it to BPF hook. By using bpf_link_prime() and + * bpf_link_settle() kernel code using bpf_link doesn't have to perform + * expensive (and potentially failing) roll back operations in a rare case + * that file, FD, or ID can't be allocated. */ -struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd) +int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) { struct file *file; - int fd; + int fd, id; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) - return ERR_PTR(fd); + return fd; + + + id = bpf_link_alloc_id(link); + if (id < 0) { + put_unused_fd(fd); + return id; + } file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); if (IS_ERR(file)) { + bpf_link_free_id(id); put_unused_fd(fd); - return file; + return PTR_ERR(file); } - *reserved_fd = fd; - return file; + primer->link = link; + primer->file = file; + primer->fd = fd; + primer->id = id; + return 0; +} + +int bpf_link_settle(struct bpf_link_primer *primer) +{ + /* make bpf_link fetchable by ID */ + spin_lock_bh(&link_idr_lock); + primer->link->id = primer->id; + spin_unlock_bh(&link_idr_lock); + /* make bpf_link fetchable by FD */ + fd_install(primer->fd, primer->file); + /* pass through installed FD */ + return primer->fd; +} + +int bpf_link_new_fd(struct bpf_link *link) +{ + return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); } struct bpf_link *bpf_link_get_from_fd(u32 ufd) @@ -2361,6 +2495,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) struct bpf_tracing_link { struct bpf_link link; + enum bpf_attach_type attach_type; }; static void bpf_tracing_link_release(struct bpf_link *link) @@ -2376,16 +2511,40 @@ static void bpf_tracing_link_dealloc(struct bpf_link *link) kfree(tr_link); } +static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + seq_printf(seq, + "attach_type:\t%d\n", + tr_link->attach_type); +} + +static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + info->tracing.attach_type = tr_link->attach_type; + + return 0; +} + static const struct bpf_link_ops bpf_tracing_link_lops = { .release = bpf_tracing_link_release, .dealloc = bpf_tracing_link_dealloc, + .show_fdinfo = bpf_tracing_link_show_fdinfo, + .fill_link_info = bpf_tracing_link_fill_link_info, }; static int bpf_tracing_prog_attach(struct bpf_prog *prog) { + struct bpf_link_primer link_primer; struct bpf_tracing_link *link; - struct file *link_file; - int link_fd, err; + int err; switch (prog->type) { case BPF_PROG_TYPE_TRACING: @@ -2418,24 +2577,23 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) err = -ENOMEM; goto out_put_prog; } - bpf_link_init(&link->link, &bpf_tracing_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog); + link->attach_type = prog->expected_attach_type; - link_file = bpf_link_new_file(&link->link, &link_fd); - if (IS_ERR(link_file)) { + err = bpf_link_prime(&link->link, &link_primer); + if (err) { kfree(link); - err = PTR_ERR(link_file); goto out_put_prog; } err = bpf_trampoline_link_prog(prog); if (err) { - bpf_link_cleanup(&link->link, link_file, link_fd); + bpf_link_cleanup(&link_primer); goto out_put_prog; } - fd_install(link_fd, link_file); - return link_fd; - + return bpf_link_settle(&link_primer); out_put_prog: bpf_prog_put(prog); return err; @@ -2463,22 +2621,69 @@ static void bpf_raw_tp_link_dealloc(struct bpf_link *link) kfree(raw_tp); } -static const struct bpf_link_ops bpf_raw_tp_lops = { +static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_raw_tp_link *raw_tp_link = + container_of(link, struct bpf_raw_tp_link, link); + + seq_printf(seq, + "tp_name:\t%s\n", + raw_tp_link->btp->tp->name); +} + +static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_raw_tp_link *raw_tp_link = + container_of(link, struct bpf_raw_tp_link, link); + char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name); + const char *tp_name = raw_tp_link->btp->tp->name; + u32 ulen = info->raw_tracepoint.tp_name_len; + size_t tp_len = strlen(tp_name); + + if (ulen && !ubuf) + return -EINVAL; + + info->raw_tracepoint.tp_name_len = tp_len + 1; + + if (!ubuf) + return 0; + + if (ulen >= tp_len + 1) { + if (copy_to_user(ubuf, tp_name, tp_len + 1)) + return -EFAULT; + } else { + char zero = '\0'; + + if (copy_to_user(ubuf, tp_name, ulen - 1)) + return -EFAULT; + if (put_user(zero, ubuf + ulen - 1)) + return -EFAULT; + return -ENOSPC; + } + + return 0; +} + +static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release, .dealloc = bpf_raw_tp_link_dealloc, + .show_fdinfo = bpf_raw_tp_link_show_fdinfo, + .fill_link_info = bpf_raw_tp_link_fill_link_info, }; #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd static int bpf_raw_tracepoint_open(const union bpf_attr *attr) { + struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; struct bpf_raw_event_map *btp; - struct file *link_file; struct bpf_prog *prog; const char *tp_name; char buf[128]; - int link_fd, err; + int err; if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) return -EINVAL; @@ -2531,24 +2736,23 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) err = -ENOMEM; goto out_put_btp; } - bpf_link_init(&link->link, &bpf_raw_tp_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, + &bpf_raw_tp_link_lops, prog); link->btp = btp; - link_file = bpf_link_new_file(&link->link, &link_fd); - if (IS_ERR(link_file)) { + err = bpf_link_prime(&link->link, &link_primer); + if (err) { kfree(link); - err = PTR_ERR(link_file); goto out_put_btp; } err = bpf_probe_register(link->btp, prog); if (err) { - bpf_link_cleanup(&link->link, link_file, link_fd); + bpf_link_cleanup(&link_primer); goto out_put_btp; } - fd_install(link_fd, link_file); - return link_fd; + return bpf_link_settle(&link_primer); out_put_btp: bpf_put_raw_tracepoint(btp); @@ -2566,6 +2770,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCKOPT: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: + if (!capable(CAP_NET_ADMIN)) + /* cg-skb progs can be loaded by unpriv user. + * check permissions at attach time. + */ + return -EPERM; return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? -EINVAL : 0; @@ -2590,6 +2799,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2613,6 +2826,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: return BPF_PROG_TYPE_CGROUP_SOCKOPT; + case BPF_TRACE_ITER: + return BPF_PROG_TYPE_TRACING; default: return BPF_PROG_TYPE_UNSPEC; } @@ -2629,9 +2844,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) struct bpf_prog *prog; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; @@ -2660,7 +2872,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) ret = lirc_prog_attach(attr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: - ret = skb_flow_dissector_bpf_prog_attach(attr, prog); + ret = netns_bpf_prog_attach(attr, prog); break; case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: @@ -2686,9 +2898,6 @@ static int bpf_prog_detach(const union bpf_attr *attr) { enum bpf_prog_type ptype; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_DETACH)) return -EINVAL; @@ -2701,7 +2910,9 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_PROG_TYPE_FLOW_DISSECTOR: - return skb_flow_dissector_bpf_prog_detach(attr); + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + return netns_bpf_prog_detach(attr); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: @@ -2737,6 +2948,10 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2750,7 +2965,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: - return skb_flow_dissector_prog_query(attr, uattr); + return netns_bpf_prog_query(attr, uattr); default: return -EINVAL; } @@ -2764,8 +2979,6 @@ static int bpf_prog_test_run(const union bpf_attr *attr, struct bpf_prog *prog; int ret = -ENOTSUPP; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; @@ -2816,6 +3029,25 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr, return err; } +struct bpf_map *bpf_map_get_curr_or_next(u32 *id) +{ + struct bpf_map *map; + + spin_lock_bh(&map_idr_lock); +again: + map = idr_get_next(&map_idr, id); + if (map) { + map = __bpf_map_inc_not_zero(map, false); + if (IS_ERR(map)) { + (*id)++; + goto again; + } + } + spin_unlock_bh(&map_idr_lock); + + return map; +} + #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id struct bpf_prog *bpf_prog_by_id(u32 id) @@ -3047,7 +3279,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.run_time_ns = stats.nsecs; info.run_cnt = stats.cnt; - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { info.jited_prog_len = 0; info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; @@ -3329,6 +3561,42 @@ static int bpf_btf_get_info_by_fd(struct btf *btf, return btf_get_info_by_fd(btf, attr, uattr); } +static int bpf_link_get_info_by_fd(struct bpf_link *link, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_link_info info; + u32 info_len = attr->info.info_len; + int err; + + err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); + if (err) + return err; + info_len = min_t(u32, sizeof(info), info_len); + + memset(&info, 0, sizeof(info)); + if (copy_from_user(&info, uinfo, info_len)) + return -EFAULT; + + info.type = link->type; + info.id = link->id; + info.prog_id = link->prog->aux->id; + + if (link->ops->fill_link_info) { + err = link->ops->fill_link_info(link, &info); + if (err) + return err; + } + + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + + #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, @@ -3353,6 +3621,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, uattr); else if (f.file->f_op == &btf_fops) err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr); + else if (f.file->f_op == &bpf_link_fops) + err = bpf_link_get_info_by_fd(f.file->private_data, + attr, uattr); else err = -EINVAL; @@ -3367,7 +3638,7 @@ static int bpf_btf_load(const union bpf_attr *attr) if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; return btf_new_fd(attr); @@ -3480,7 +3751,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr, if (file->f_op == &bpf_link_fops) { struct bpf_link *link = file->private_data; - if (link->ops == &bpf_raw_tp_lops) { + if (link->ops == &bpf_raw_tp_link_lops) { struct bpf_raw_tp_link *raw_tp = container_of(link, struct bpf_raw_tp_link, link); struct bpf_raw_event_map *btp = raw_tp->btp; @@ -3574,6 +3845,15 @@ err_put: return err; } +static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + if (attr->link_create.attach_type == BPF_TRACE_ITER && + prog->expected_attach_type == BPF_TRACE_ITER) + return bpf_iter_link_attach(attr, prog); + + return -EINVAL; +} + #define BPF_LINK_CREATE_LAST_FIELD link_create.flags static int link_create(union bpf_attr *attr) { @@ -3581,9 +3861,6 @@ static int link_create(union bpf_attr *attr) struct bpf_prog *prog; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; @@ -3610,6 +3887,12 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: ret = cgroup_bpf_link_attach(attr, prog); break; + case BPF_PROG_TYPE_TRACING: + ret = tracing_bpf_link_attach(attr, prog); + break; + case BPF_PROG_TYPE_FLOW_DISSECTOR: + ret = netns_bpf_link_create(attr, prog); + break; default: ret = -EINVAL; } @@ -3629,9 +3912,6 @@ static int link_update(union bpf_attr *attr) u32 flags; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_LINK_UPDATE)) return -EINVAL; @@ -3661,13 +3941,10 @@ static int link_update(union bpf_attr *attr) goto out_put_progs; } -#ifdef CONFIG_CGROUP_BPF - if (link->ops == &bpf_cgroup_link_lops) { - ret = cgroup_bpf_replace(link, old_prog, new_prog); - goto out_put_progs; - } -#endif - ret = -EINVAL; + if (link->ops->update_prog) + ret = link->ops->update_prog(link, new_prog, old_prog); + else + ret = -EINVAL; out_put_progs: if (old_prog) @@ -3679,12 +3956,131 @@ out_put_link: return ret; } +static int bpf_link_inc_not_zero(struct bpf_link *link) +{ + return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT; +} + +#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id + +static int bpf_link_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_link *link; + u32 id = attr->link_id; + int fd, err; + + if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + spin_lock_bh(&link_idr_lock); + link = idr_find(&link_idr, id); + /* before link is "settled", ID is 0, pretend it doesn't exist yet */ + if (link) { + if (link->id) + err = bpf_link_inc_not_zero(link); + else + err = -EAGAIN; + } else { + err = -ENOENT; + } + spin_unlock_bh(&link_idr_lock); + + if (err) + return err; + + fd = bpf_link_new_fd(link); + if (fd < 0) + bpf_link_put(link); + + return fd; +} + +DEFINE_MUTEX(bpf_stats_enabled_mutex); + +static int bpf_stats_release(struct inode *inode, struct file *file) +{ + mutex_lock(&bpf_stats_enabled_mutex); + static_key_slow_dec(&bpf_stats_enabled_key.key); + mutex_unlock(&bpf_stats_enabled_mutex); + return 0; +} + +static const struct file_operations bpf_stats_fops = { + .release = bpf_stats_release, +}; + +static int bpf_enable_runtime_stats(void) +{ + int fd; + + mutex_lock(&bpf_stats_enabled_mutex); + + /* Set a very high limit to avoid overflow */ + if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) { + mutex_unlock(&bpf_stats_enabled_mutex); + return -EBUSY; + } + + fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC); + if (fd >= 0) + static_key_slow_inc(&bpf_stats_enabled_key.key); + + mutex_unlock(&bpf_stats_enabled_mutex); + return fd; +} + +#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type + +static int bpf_enable_stats(union bpf_attr *attr) +{ + + if (CHECK_ATTR(BPF_ENABLE_STATS)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (attr->enable_stats.type) { + case BPF_STATS_RUN_TIME: + return bpf_enable_runtime_stats(); + default: + break; + } + return -EINVAL; +} + +#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags + +static int bpf_iter_create(union bpf_attr *attr) +{ + struct bpf_link *link; + int err; + + if (CHECK_ATTR(BPF_ITER_CREATE)) + return -EINVAL; + + if (attr->iter_create.flags) + return -EINVAL; + + link = bpf_link_get_from_fd(attr->iter_create.link_fd); + if (IS_ERR(link)) + return PTR_ERR(link); + + err = bpf_iter_new_fd(link); + bpf_link_put(link); + + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; int err; - if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) + if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) return -EPERM; err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); @@ -3796,6 +4192,19 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_LINK_UPDATE: err = link_update(&attr); break; + case BPF_LINK_GET_FD_BY_ID: + err = bpf_link_get_fd_by_id(&attr); + break; + case BPF_LINK_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &link_idr, &link_idr_lock); + break; + case BPF_ENABLE_STATS: + err = bpf_enable_stats(&attr); + break; + case BPF_ITER_CREATE: + err = bpf_iter_create(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c new file mode 100644 index 000000000000..4dbf2b6035f8 --- /dev/null +++ b/kernel/bpf/task_iter.c @@ -0,0 +1,353 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ + +#include <linux/init.h> +#include <linux/namei.h> +#include <linux/pid_namespace.h> +#include <linux/fs.h> +#include <linux/fdtable.h> +#include <linux/filter.h> + +struct bpf_iter_seq_task_common { + struct pid_namespace *ns; +}; + +struct bpf_iter_seq_task_info { + /* The first field must be struct bpf_iter_seq_task_common. + * this is assumed by {init, fini}_seq_pidns() callback functions. + */ + struct bpf_iter_seq_task_common common; + u32 tid; +}; + +static struct task_struct *task_seq_get_next(struct pid_namespace *ns, + u32 *tid) +{ + struct task_struct *task = NULL; + struct pid *pid; + + rcu_read_lock(); +retry: + pid = idr_get_next(&ns->idr, tid); + if (pid) { + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) { + ++*tid; + goto retry; + } + } + rcu_read_unlock(); + + return task; +} + +static void *task_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_task_info *info = seq->private; + struct task_struct *task; + + task = task_seq_get_next(info->common.ns, &info->tid); + if (!task) + return NULL; + + ++*pos; + return task; +} + +static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_task_info *info = seq->private; + struct task_struct *task; + + ++*pos; + ++info->tid; + put_task_struct((struct task_struct *)v); + task = task_seq_get_next(info->common.ns, &info->tid); + if (!task) + return NULL; + + return task; +} + +struct bpf_iter__task { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct task_struct *, task); +}; + +DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task) + +static int __task_seq_show(struct seq_file *seq, struct task_struct *task, + bool in_stop) +{ + struct bpf_iter_meta meta; + struct bpf_iter__task ctx; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (!prog) + return 0; + + meta.seq = seq; + ctx.meta = &meta; + ctx.task = task; + return bpf_iter_run_prog(prog, &ctx); +} + +static int task_seq_show(struct seq_file *seq, void *v) +{ + return __task_seq_show(seq, v, false); +} + +static void task_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__task_seq_show(seq, v, true); + else + put_task_struct((struct task_struct *)v); +} + +static const struct seq_operations task_seq_ops = { + .start = task_seq_start, + .next = task_seq_next, + .stop = task_seq_stop, + .show = task_seq_show, +}; + +struct bpf_iter_seq_task_file_info { + /* The first field must be struct bpf_iter_seq_task_common. + * this is assumed by {init, fini}_seq_pidns() callback functions. + */ + struct bpf_iter_seq_task_common common; + struct task_struct *task; + struct files_struct *files; + u32 tid; + u32 fd; +}; + +static struct file * +task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info, + struct task_struct **task, struct files_struct **fstruct) +{ + struct pid_namespace *ns = info->common.ns; + u32 curr_tid = info->tid, max_fds; + struct files_struct *curr_files; + struct task_struct *curr_task; + int curr_fd = info->fd; + + /* If this function returns a non-NULL file object, + * it held a reference to the task/files_struct/file. + * Otherwise, it does not hold any reference. + */ +again: + if (*task) { + curr_task = *task; + curr_files = *fstruct; + curr_fd = info->fd; + } else { + curr_task = task_seq_get_next(ns, &curr_tid); + if (!curr_task) + return NULL; + + curr_files = get_files_struct(curr_task); + if (!curr_files) { + put_task_struct(curr_task); + curr_tid = ++(info->tid); + info->fd = 0; + goto again; + } + + /* set *fstruct, *task and info->tid */ + *fstruct = curr_files; + *task = curr_task; + if (curr_tid == info->tid) { + curr_fd = info->fd; + } else { + info->tid = curr_tid; + curr_fd = 0; + } + } + + rcu_read_lock(); + max_fds = files_fdtable(curr_files)->max_fds; + for (; curr_fd < max_fds; curr_fd++) { + struct file *f; + + f = fcheck_files(curr_files, curr_fd); + if (!f) + continue; + + /* set info->fd */ + info->fd = curr_fd; + get_file(f); + rcu_read_unlock(); + return f; + } + + /* the current task is done, go to the next task */ + rcu_read_unlock(); + put_files_struct(curr_files); + put_task_struct(curr_task); + *task = NULL; + *fstruct = NULL; + info->fd = 0; + curr_tid = ++(info->tid); + goto again; +} + +static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_task_file_info *info = seq->private; + struct files_struct *files = NULL; + struct task_struct *task = NULL; + struct file *file; + + file = task_file_seq_get_next(info, &task, &files); + if (!file) { + info->files = NULL; + info->task = NULL; + return NULL; + } + + ++*pos; + info->task = task; + info->files = files; + + return file; +} + +static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_task_file_info *info = seq->private; + struct files_struct *files = info->files; + struct task_struct *task = info->task; + struct file *file; + + ++*pos; + ++info->fd; + fput((struct file *)v); + file = task_file_seq_get_next(info, &task, &files); + if (!file) { + info->files = NULL; + info->task = NULL; + return NULL; + } + + info->task = task; + info->files = files; + + return file; +} + +struct bpf_iter__task_file { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct task_struct *, task); + u32 fd __aligned(8); + __bpf_md_ptr(struct file *, file); +}; + +DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta, + struct task_struct *task, u32 fd, + struct file *file) + +static int __task_file_seq_show(struct seq_file *seq, struct file *file, + bool in_stop) +{ + struct bpf_iter_seq_task_file_info *info = seq->private; + struct bpf_iter__task_file ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.task = info->task; + ctx.fd = info->fd; + ctx.file = file; + return bpf_iter_run_prog(prog, &ctx); +} + +static int task_file_seq_show(struct seq_file *seq, void *v) +{ + return __task_file_seq_show(seq, v, false); +} + +static void task_file_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_task_file_info *info = seq->private; + + if (!v) { + (void)__task_file_seq_show(seq, v, true); + } else { + fput((struct file *)v); + put_files_struct(info->files); + put_task_struct(info->task); + info->files = NULL; + info->task = NULL; + } +} + +static int init_seq_pidns(void *priv_data) +{ + struct bpf_iter_seq_task_common *common = priv_data; + + common->ns = get_pid_ns(task_active_pid_ns(current)); + return 0; +} + +static void fini_seq_pidns(void *priv_data) +{ + struct bpf_iter_seq_task_common *common = priv_data; + + put_pid_ns(common->ns); +} + +static const struct seq_operations task_file_seq_ops = { + .start = task_file_seq_start, + .next = task_file_seq_next, + .stop = task_file_seq_stop, + .show = task_file_seq_show, +}; + +static const struct bpf_iter_reg task_reg_info = { + .target = "task", + .seq_ops = &task_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__task, task), + PTR_TO_BTF_ID_OR_NULL }, + }, +}; + +static const struct bpf_iter_reg task_file_reg_info = { + .target = "task_file", + .seq_ops = &task_file_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__task_file, task), + PTR_TO_BTF_ID_OR_NULL }, + { offsetof(struct bpf_iter__task_file, file), + PTR_TO_BTF_ID_OR_NULL }, + }, +}; + +static int __init task_iter_init(void) +{ + int ret; + + ret = bpf_iter_reg_target(&task_reg_info); + if (ret) + return ret; + + return bpf_iter_reg_target(&task_file_reg_info); +} +late_initcall(task_iter_init); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index efe14cf24bc6..5c7bbaac81ef 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -28,9 +28,11 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _verifier_ops, #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) #include <linux/bpf_types.h> #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; /* bpf_check() is a static code analyzer that walks eBPF program @@ -168,6 +170,8 @@ struct bpf_verifier_stack_elem { int insn_idx; int prev_insn_idx; struct bpf_verifier_stack_elem *next; + /* length of verifier log at the time this state was pushed on stack */ + u32 log_pos; }; #define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192 @@ -229,6 +233,7 @@ struct bpf_call_arg_meta { bool pkt_access; int regno; int access_size; + int mem_size; u64 msize_max_value; int ref_obj_id; int func_id; @@ -283,6 +288,18 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, log->ubuf = NULL; } +static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos) +{ + char zero = 0; + + if (!bpf_verifier_log_needed(log)) + return; + + log->len_used = new_pos; + if (put_user(zero, log->ubuf + new_pos)) + log->ubuf = NULL; +} + /* log_level controls verbosity level of eBPF verifier. * bpf_verifier_log_write() is used to dump the verification trace to the log, * so the user can figure out what's wrong with the program @@ -377,12 +394,23 @@ static bool type_is_sk_pointer(enum bpf_reg_type type) type == PTR_TO_XDP_SOCK; } +static bool reg_type_not_null(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET || + type == PTR_TO_TCP_SOCK || + type == PTR_TO_MAP_VALUE || + type == PTR_TO_SOCK_COMMON || + type == PTR_TO_BTF_ID; +} + static bool reg_type_may_be_null(enum bpf_reg_type type) { return type == PTR_TO_MAP_VALUE_OR_NULL || type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_SOCK_COMMON_OR_NULL || - type == PTR_TO_TCP_SOCK_OR_NULL; + type == PTR_TO_TCP_SOCK_OR_NULL || + type == PTR_TO_BTF_ID_OR_NULL || + type == PTR_TO_MEM_OR_NULL; } static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) @@ -396,7 +424,9 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_TCP_SOCK || - type == PTR_TO_TCP_SOCK_OR_NULL; + type == PTR_TO_TCP_SOCK_OR_NULL || + type == PTR_TO_MEM || + type == PTR_TO_MEM_OR_NULL; } static bool arg_type_may_be_refcounted(enum bpf_arg_type type) @@ -410,14 +440,37 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) */ static bool is_release_function(enum bpf_func_id func_id) { - return func_id == BPF_FUNC_sk_release; + return func_id == BPF_FUNC_sk_release || + func_id == BPF_FUNC_ringbuf_submit || + func_id == BPF_FUNC_ringbuf_discard; } -static bool is_acquire_function(enum bpf_func_id func_id) +static bool may_be_acquire_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || - func_id == BPF_FUNC_skc_lookup_tcp; + func_id == BPF_FUNC_skc_lookup_tcp || + func_id == BPF_FUNC_map_lookup_elem || + func_id == BPF_FUNC_ringbuf_reserve; +} + +static bool is_acquire_function(enum bpf_func_id func_id, + const struct bpf_map *map) +{ + enum bpf_map_type map_type = map ? map->map_type : BPF_MAP_TYPE_UNSPEC; + + if (func_id == BPF_FUNC_sk_lookup_tcp || + func_id == BPF_FUNC_sk_lookup_udp || + func_id == BPF_FUNC_skc_lookup_tcp || + func_id == BPF_FUNC_ringbuf_reserve) + return true; + + if (func_id == BPF_FUNC_map_lookup_elem && + (map_type == BPF_MAP_TYPE_SOCKMAP || + map_type == BPF_MAP_TYPE_SOCKHASH)) + return true; + + return false; } static bool is_ptr_cast_function(enum bpf_func_id func_id) @@ -448,6 +501,9 @@ static const char * const reg_type_str[] = { [PTR_TO_TP_BUFFER] = "tp_buffer", [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", + [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", + [PTR_TO_MEM] = "mem", + [PTR_TO_MEM_OR_NULL] = "mem_or_null", }; static char slot_type_char[] = { @@ -508,7 +564,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); } else { - if (t == PTR_TO_BTF_ID) + if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL) verbose(env, "%s", kernel_type_name(reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) @@ -846,7 +902,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi } static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, - int *insn_idx) + int *insn_idx, bool pop_log) { struct bpf_verifier_state *cur = env->cur_state; struct bpf_verifier_stack_elem *elem, *head = env->head; @@ -860,6 +916,8 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, if (err) return err; } + if (pop_log) + bpf_vlog_reset(&env->log, head->log_pos); if (insn_idx) *insn_idx = head->insn_idx; if (prev_insn_idx) @@ -887,6 +945,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; + elem->log_pos = env->log.len_used; env->head = elem; env->stack_size++; err = copy_verifier_state(&elem->st, cur); @@ -915,7 +974,7 @@ err: free_verifier_state(env->cur_state, true); env->cur_state = NULL; /* pop all elements and return */ - while (!pop_stack(env, NULL, NULL)); + while (!pop_stack(env, NULL, NULL, false)); return NULL; } @@ -1255,7 +1314,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, reg->type = SCALAR_VALUE; reg->var_off = tnum_unknown; reg->frameno = 0; - reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks; + reg->precise = env->subprog_cnt > 1 || !env->bpf_capable; __mark_reg_unbounded(reg); } @@ -1387,8 +1446,9 @@ static int check_subprogs(struct bpf_verifier_env *env) continue; if (insn[i].src_reg != BPF_PSEUDO_CALL) continue; - if (!env->allow_ptr_leaks) { - verbose(env, "function calls to other bpf functions are allowed for root only\n"); + if (!env->bpf_capable) { + verbose(env, + "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); return -EPERM; } ret = add_subprog(env, i + insn[i].imm + 1); @@ -1922,8 +1982,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, bool new_marks = false; int i, err; - if (!env->allow_ptr_leaks) - /* backtracking is root only for now */ + if (!env->bpf_capable) return 0; func = st->frame[st->curframe]; @@ -2101,6 +2160,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID_OR_NULL: return true; default: return false; @@ -2170,7 +2230,7 @@ static int check_stack_write(struct bpf_verifier_env *env, reg = &cur->regs[value_regno]; if (reg && size == BPF_REG_SIZE && register_is_const(reg) && - !register_is_null(reg) && env->allow_ptr_leaks) { + !register_is_null(reg) && env->bpf_capable) { if (dst_reg != BPF_REG_FP) { /* The backtracking logic can only recognize explicit * stack slot address like [fp - 8]. Other spill of @@ -2196,7 +2256,7 @@ static int check_stack_write(struct bpf_verifier_env *env, return -EINVAL; } - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v4) { bool sanitize = false; if (state->stack[spi].slot_type[0] == STACK_SPILL && @@ -2418,32 +2478,49 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, return 0; } -/* check read/write into map element returned by bpf_map_lookup_elem() */ -static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, - int size, bool zero_size_allowed) +/* check read/write into memory region (e.g., map value, ringbuf sample, etc) */ +static int __check_mem_access(struct bpf_verifier_env *env, int regno, + int off, int size, u32 mem_size, + bool zero_size_allowed) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_map *map = regs[regno].map_ptr; + bool size_ok = size > 0 || (size == 0 && zero_size_allowed); + struct bpf_reg_state *reg; + + if (off >= 0 && size_ok && (u64)off + size <= mem_size) + return 0; - if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || - off + size > map->value_size) { + reg = &cur_regs(env)[regno]; + switch (reg->type) { + case PTR_TO_MAP_VALUE: verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", - map->value_size, off, size); - return -EACCES; + mem_size, off, size); + break; + case PTR_TO_PACKET: + case PTR_TO_PACKET_META: + case PTR_TO_PACKET_END: + verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + off, size, regno, reg->id, off, mem_size); + break; + case PTR_TO_MEM: + default: + verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n", + mem_size, off, size); } - return 0; + + return -EACCES; } -/* check read/write into a map element with possible variable offset */ -static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size, bool zero_size_allowed) +/* check read/write into a memory region with possible variable offset */ +static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, + int off, int size, u32 mem_size, + bool zero_size_allowed) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; int err; - /* We may have adjusted the register to this map value, so we + /* We may have adjusted the register pointing to memory region, so we * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ @@ -2464,10 +2541,10 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - err = __check_map_access(env, regno, reg->smin_value + off, size, - zero_size_allowed); + err = __check_mem_access(env, regno, reg->smin_value + off, size, + mem_size, zero_size_allowed); if (err) { - verbose(env, "R%d min value is outside of the array range\n", + verbose(env, "R%d min value is outside of the allowed memory range\n", regno); return err; } @@ -2477,18 +2554,38 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * If reg->umax_value + off could overflow, treat that as unbounded too. */ if (reg->umax_value >= BPF_MAX_VAR_OFF) { - verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n", + verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n", regno); return -EACCES; } - err = __check_map_access(env, regno, reg->umax_value + off, size, - zero_size_allowed); - if (err) - verbose(env, "R%d max value is outside of the array range\n", + err = __check_mem_access(env, regno, reg->umax_value + off, size, + mem_size, zero_size_allowed); + if (err) { + verbose(env, "R%d max value is outside of the allowed memory range\n", regno); + return err; + } + + return 0; +} + +/* check read/write into a map element with possible variable offset */ +static int check_map_access(struct bpf_verifier_env *env, u32 regno, + int off, int size, bool zero_size_allowed) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *reg = &state->regs[regno]; + struct bpf_map *map = reg->map_ptr; + int err; - if (map_value_has_spin_lock(reg->map_ptr)) { - u32 lock = reg->map_ptr->spin_lock_off; + err = check_mem_region_access(env, regno, off, size, map->value_size, + zero_size_allowed); + if (err) + return err; + + if (map_value_has_spin_lock(map)) { + u32 lock = map->spin_lock_off; /* if any part of struct bpf_spin_lock can be touched by * load/store reject this program. @@ -2546,21 +2643,6 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, } } -static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, - int off, int size, bool zero_size_allowed) -{ - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[regno]; - - if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || - (u64)off + size > reg->range) { - verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", - off, size, regno, reg->id, reg->off, reg->range); - return -EACCES; - } - return 0; -} - static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) { @@ -2581,16 +2663,17 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, regno); return -EACCES; } - err = __check_packet_access(env, regno, off, size, zero_size_allowed); + err = __check_mem_access(env, regno, off, size, reg->range, + zero_size_allowed); if (err) { verbose(env, "R%d offset is outside of the packet\n", regno); return err; } - /* __check_packet_access has made sure "off + size - 1" is within u16. + /* __check_mem_access has made sure "off + size - 1" is within u16. * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff, * otherwise find_good_pkt_pointers would have refused to set range info - * that __check_packet_access would have rejected this pkt access. + * that __check_mem_access would have rejected this pkt access. * Therefore, "off + reg->umax_value + size - 1" won't overflow u32. */ env->prog->aux->max_pkt_offset = @@ -2621,7 +2704,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - if (*reg_type == PTR_TO_BTF_ID) + if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) *btf_id = info.btf_id; else env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; @@ -3170,6 +3253,16 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } } + } else if (reg->type == PTR_TO_MEM) { + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose(env, "R%d leaks addr into mem\n", value_regno); + return -EACCES; + } + err = check_mem_region_access(env, regno, off, size, + reg->mem_size, false); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; u32 btf_id = 0; @@ -3205,7 +3298,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * a sub-register. */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; - if (reg_type == PTR_TO_BTF_ID) + if (reg_type == PTR_TO_BTF_ID || + reg_type == PTR_TO_BTF_ID_OR_NULL) regs[value_regno].btf_id = btf_id; } regs[value_regno].type = reg_type; @@ -3390,7 +3484,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, * Spectre masking for stack ALU. * See also retrieve_ptr_limit(). */ - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v1) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); @@ -3452,6 +3546,11 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, *stype = STACK_MISC; goto mark; } + + if (state->stack[spi].slot_type[0] == STACK_SPILL && + state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID) + goto mark; + if (state->stack[spi].slot_type[0] == STACK_SPILL && state->stack[spi].spilled_ptr.type == SCALAR_VALUE) { __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); @@ -3501,6 +3600,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return -EACCES; return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); + case PTR_TO_MEM: + return check_mem_region_access(env, regno, reg->off, + access_size, reg->mem_size, + zero_size_allowed); default: /* scalar_value|ptr_to_stack or invalid ptr */ return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); @@ -3605,6 +3708,17 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } +static bool arg_type_is_alloc_mem_ptr(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_ALLOC_MEM || + type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; +} + +static bool arg_type_is_alloc_size(enum bpf_arg_type type) +{ + return type == ARG_CONST_ALLOC_SIZE_OR_ZERO; +} + static bool arg_type_is_int_ptr(enum bpf_arg_type type) { return type == ARG_PTR_TO_INT || @@ -3664,7 +3778,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || - arg_type == ARG_CONST_SIZE_OR_ZERO) { + arg_type == ARG_CONST_SIZE_OR_ZERO || + arg_type == ARG_CONST_ALLOC_SIZE_OR_ZERO) { expected_type = SCALAR_VALUE; if (type != expected_type) goto err_type; @@ -3735,13 +3850,29 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, * happens during stack boundary checking. */ if (register_is_null(reg) && - arg_type == ARG_PTR_TO_MEM_OR_NULL) + (arg_type == ARG_PTR_TO_MEM_OR_NULL || + arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL)) /* final test in check_stack_boundary() */; else if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && + type != PTR_TO_MEM && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; + } else if (arg_type_is_alloc_mem_ptr(arg_type)) { + expected_type = PTR_TO_MEM; + if (register_is_null(reg) && + arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL) + /* final test in check_stack_boundary() */; + else if (type != expected_type) + goto err_type; + if (meta->ref_obj_id) { + verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, + meta->ref_obj_id); + return -EFAULT; + } + meta->ref_obj_id = reg->ref_obj_id; } else if (arg_type_is_int_ptr(arg_type)) { expected_type = PTR_TO_STACK; if (!type_is_pkt_pointer(type) && @@ -3837,6 +3968,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, zero_size_allowed, meta); if (!err) err = mark_chain_precision(env, regno); + } else if (arg_type_is_alloc_size(arg_type)) { + if (!tnum_is_const(reg->var_off)) { + verbose(env, "R%d unbounded size, use 'var &= const' or 'if (var < const)'\n", + regno); + return -EACCES; + } + meta->mem_size = reg->var_off.value; } else if (arg_type_is_int_ptr(arg_type)) { int size = int_ptr_type_to_size(arg_type); @@ -3873,6 +4011,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_xdp_output) goto error; break; + case BPF_MAP_TYPE_RINGBUF: + if (func_id != BPF_FUNC_ringbuf_output && + func_id != BPF_FUNC_ringbuf_reserve && + func_id != BPF_FUNC_ringbuf_submit && + func_id != BPF_FUNC_ringbuf_discard && + func_id != BPF_FUNC_ringbuf_query) + goto error; + break; case BPF_MAP_TYPE_STACK_TRACE: if (func_id != BPF_FUNC_get_stackid) goto error; @@ -3915,7 +4061,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_sock_map_update && func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_map && - func_id != BPF_FUNC_sk_select_reuseport) + func_id != BPF_FUNC_sk_select_reuseport && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; case BPF_MAP_TYPE_SOCKHASH: @@ -3923,7 +4070,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_sock_hash_update && func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_hash && - func_id != BPF_FUNC_sk_select_reuseport) + func_id != BPF_FUNC_sk_select_reuseport && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: @@ -4093,7 +4241,7 @@ static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id) /* A reference acquiring function cannot acquire * another refcounted ptr. */ - if (is_acquire_function(func_id) && count) + if (may_be_acquire_function(func_id) && count) return false; /* We only support one arg being unreferenced at the moment, @@ -4388,10 +4536,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, if (!BPF_MAP_PTR(aux->map_ptr_state)) bpf_map_ptr_store(aux, meta->map_ptr, - meta->map_ptr->unpriv_array); + !meta->map_ptr->bypass_spec_v1); else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr) bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, - meta->map_ptr->unpriv_array); + !meta->map_ptr->bypass_spec_v1); return 0; } @@ -4597,6 +4745,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; + } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; + regs[BPF_REG_0].mem_size = meta.mem_size; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -4606,7 +4759,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (is_ptr_cast_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; - } else if (is_acquire_function(func_id)) { + } else if (is_acquire_function(func_id, meta.map_ptr)) { int id = acquire_reference_state(env, insn_idx); if (id < 0) @@ -4760,7 +4913,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, const struct bpf_insn *insn) { - return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K; + return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K; } static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, @@ -5070,7 +5223,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* For unprivileged we require that resulting offset must be in bounds * in order to be able to sanitize access later on. */ - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v1) { if (dst_reg->type == PTR_TO_MAP_VALUE && check_map_access(env, dst, dst_reg->off, 1, false)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " @@ -5611,7 +5764,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, { struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); - bool src_known, dst_known; + bool src_known; s64 smin_val, smax_val; u64 umin_val, umax_val; s32 s32_min_val, s32_max_val; @@ -5633,7 +5786,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, if (alu32) { src_known = tnum_subreg_is_const(src_reg.var_off); - dst_known = tnum_subreg_is_const(dst_reg->var_off); if ((src_known && (s32_min_val != s32_max_val || u32_min_val != u32_max_val)) || s32_min_val > s32_max_val || u32_min_val > u32_max_val) { @@ -5645,7 +5797,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, } } else { src_known = tnum_is_const(src_reg.var_off); - dst_known = tnum_is_const(dst_reg->var_off); if ((src_known && (smin_val != smax_val || umin_val != umax_val)) || smin_val > smax_val || umin_val > umax_val) { @@ -6261,8 +6412,25 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, bool is_jmp32) { - if (__is_pointer_value(false, reg)) - return -1; + if (__is_pointer_value(false, reg)) { + if (!reg_type_not_null(reg->type)) + return -1; + + /* If pointer is valid tests against zero will fail so we can + * use this to direct branch taken. + */ + if (val != 0) + return -1; + + switch (opcode) { + case BPF_JEQ: + return 0; + case BPF_JNE: + return 1; + default: + return -1; + } + } if (is_jmp32) return is_branch32_taken(reg, val, opcode); @@ -6517,12 +6685,16 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, if (is_null) { reg->type = SCALAR_VALUE; } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - if (reg->map_ptr->inner_map_meta) { + const struct bpf_map *map = reg->map_ptr; + + if (map->inner_map_meta) { reg->type = CONST_PTR_TO_MAP; - reg->map_ptr = reg->map_ptr->inner_map_meta; - } else if (reg->map_ptr->map_type == - BPF_MAP_TYPE_XSKMAP) { + reg->map_ptr = map->inner_map_meta; + } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; + } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH) { + reg->type = PTR_TO_SOCKET; } else { reg->type = PTR_TO_MAP_VALUE; } @@ -6532,6 +6704,10 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, reg->type = PTR_TO_SOCK_COMMON; } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { reg->type = PTR_TO_TCP_SOCK; + } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { + reg->type = PTR_TO_BTF_ID; + } else if (reg->type == PTR_TO_MEM_OR_NULL) { + reg->type = PTR_TO_MEM; } if (is_null) { /* We don't need id and ref_obj_id from this point @@ -6755,7 +6931,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } if (pred >= 0) { - err = mark_chain_precision(env, insn->dst_reg); + /* If we get here with a dst_reg pointer type it is because + * above is_branch_taken() special cased the 0 comparison. + */ + if (!__is_pointer_value(false, dst_reg)) + err = mark_chain_precision(env, insn->dst_reg); if (BPF_SRC(insn->code) == BPF_X && !err) err = mark_chain_precision(env, insn->src_reg); if (err) @@ -7041,7 +7221,11 @@ static int check_return_code(struct bpf_verifier_env *env) switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) + env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) range = tnum_range(1, 1); break; case BPF_PROG_TYPE_CGROUP_SKB: @@ -7070,6 +7254,8 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_TRACE_RAW_TP: case BPF_MODIFY_RETURN: return 0; + case BPF_TRACE_ITER: + break; default: return -ENOTSUPP; } @@ -7206,7 +7392,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, insn_stack[env->cfg.cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - if (loop_ok && env->allow_ptr_leaks) + if (loop_ok && env->bpf_capable) return 0; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); @@ -8315,7 +8501,7 @@ next: if (env->max_states_per_insn < states_cnt) env->max_states_per_insn = states_cnt; - if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) + if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) return push_jmp_history(env, cur); if (!add_new_state) @@ -8402,6 +8588,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID_OR_NULL: return false; default: return true; @@ -8428,6 +8615,7 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) static int do_check(struct bpf_verifier_env *env) { + bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state = env->cur_state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; @@ -8704,7 +8892,7 @@ static int do_check(struct bpf_verifier_env *env) process_bpf_exit: update_branch_counts(env, env->cur_state); err = pop_stack(env, &prev_insn_idx, - &env->insn_idx); + &env->insn_idx, pop_log); if (err < 0) { if (err != -ENOENT) return err; @@ -9974,7 +10162,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; - if (env->allow_ptr_leaks && !expect_blinding && + if (env->bpf_capable && !expect_blinding && prog->jit_requested && !bpf_map_key_poisoned(aux) && !bpf_map_ptr_poisoned(aux) && @@ -10227,6 +10415,7 @@ static void sanitize_insn_aux_data(struct bpf_verifier_env *env) static int do_check_common(struct bpf_verifier_env *env, int subprog) { + bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state; struct bpf_reg_state *regs; int ret, i; @@ -10289,7 +10478,9 @@ out: free_verifier_state(env->cur_state, true); env->cur_state = NULL; } - while (!pop_stack(env, NULL, NULL)); + while (!pop_stack(env, NULL, NULL, false)); + if (!ret && pop_log) + bpf_vlog_reset(&env->log, 0); free_states(env); if (ret) /* clean aux data in case subprog was rejected */ @@ -10445,6 +10636,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) struct bpf_prog *tgt_prog = prog->aux->linked_prog; u32 btf_id = prog->aux->attach_btf_id; const char prefix[] = "btf_trace_"; + struct btf_func_model fmodel; int ret = 0, subprog = -1, i; struct bpf_trampoline *tr; const struct btf_type *t; @@ -10586,6 +10778,22 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; prog->aux->attach_btf_trace = true; return 0; + case BPF_TRACE_ITER: + if (!btf_type_is_func(t)) { + verbose(env, "attach_btf_id %u is not a function\n", + btf_id); + return -EINVAL; + } + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_func_proto(t)) + return -EINVAL; + prog->aux->attach_func_name = tname; + prog->aux->attach_func_proto = t; + if (!bpf_iter_prog_supported(prog)) + return -EINVAL; + ret = btf_distill_func_proto(&env->log, btf, t, + tname, &fmodel); + return ret; default: if (!prog_extension) return -EINVAL; @@ -10696,7 +10904,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->insn_aux_data[i].orig_idx = i; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; - is_priv = capable(CAP_SYS_ADMIN); + is_priv = bpf_capable(); if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { mutex_lock(&bpf_verifier_lock); @@ -10737,7 +10945,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; - env->allow_ptr_leaks = is_priv; + env->allow_ptr_leaks = bpf_allow_ptr_leaks(); + env->bypass_spec_v1 = bpf_bypass_spec_v1(); + env->bypass_spec_v4 = bpf_bypass_spec_v4(); + env->bpf_capable = bpf_capable(); if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c deleted file mode 100644 index 2cc5c8f4c800..000000000000 --- a/kernel/bpf/xskmap.c +++ /dev/null @@ -1,265 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* XSKMAP used for AF_XDP sockets - * Copyright(c) 2018 Intel Corporation. - */ - -#include <linux/bpf.h> -#include <linux/capability.h> -#include <net/xdp_sock.h> -#include <linux/slab.h> -#include <linux/sched.h> - -int xsk_map_inc(struct xsk_map *map) -{ - bpf_map_inc(&map->map); - return 0; -} - -void xsk_map_put(struct xsk_map *map) -{ - bpf_map_put(&map->map); -} - -static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, - struct xdp_sock **map_entry) -{ - struct xsk_map_node *node; - int err; - - node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); - if (!node) - return ERR_PTR(-ENOMEM); - - err = xsk_map_inc(map); - if (err) { - kfree(node); - return ERR_PTR(err); - } - - node->map = map; - node->map_entry = map_entry; - return node; -} - -static void xsk_map_node_free(struct xsk_map_node *node) -{ - xsk_map_put(node->map); - kfree(node); -} - -static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) -{ - spin_lock_bh(&xs->map_list_lock); - list_add_tail(&node->node, &xs->map_list); - spin_unlock_bh(&xs->map_list_lock); -} - -static void xsk_map_sock_delete(struct xdp_sock *xs, - struct xdp_sock **map_entry) -{ - struct xsk_map_node *n, *tmp; - - spin_lock_bh(&xs->map_list_lock); - list_for_each_entry_safe(n, tmp, &xs->map_list, node) { - if (map_entry == n->map_entry) { - list_del(&n->node); - xsk_map_node_free(n); - } - } - spin_unlock_bh(&xs->map_list_lock); -} - -static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) -{ - struct bpf_map_memory mem; - int err, numa_node; - struct xsk_map *m; - u64 size; - - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - - if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || - attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)) - return ERR_PTR(-EINVAL); - - numa_node = bpf_map_attr_numa_node(attr); - size = struct_size(m, xsk_map, attr->max_entries); - - err = bpf_map_charge_init(&mem, size); - if (err < 0) - return ERR_PTR(err); - - m = bpf_map_area_alloc(size, numa_node); - if (!m) { - bpf_map_charge_finish(&mem); - return ERR_PTR(-ENOMEM); - } - - bpf_map_init_from_attr(&m->map, attr); - bpf_map_charge_move(&m->map.memory, &mem); - spin_lock_init(&m->lock); - - return &m->map; -} - -static void xsk_map_free(struct bpf_map *map) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - - bpf_clear_redirect_map(map); - synchronize_net(); - bpf_map_area_free(m); -} - -static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - u32 index = key ? *(u32 *)key : U32_MAX; - u32 *next = next_key; - - if (index >= m->map.max_entries) { - *next = 0; - return 0; - } - - if (index == m->map.max_entries - 1) - return -ENOENT; - *next = index + 1; - return 0; -} - -static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) -{ - const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; - struct bpf_insn *insn = insn_buf; - - *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); - *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *))); - *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map)); - *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0); - *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); - *insn++ = BPF_MOV64_IMM(ret, 0); - return insn - insn_buf; -} - -static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return __xsk_map_lookup_elem(map, *(u32 *)key); -} - -static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) -{ - return ERR_PTR(-EOPNOTSUPP); -} - -static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *xs, *old_xs, **map_entry; - u32 i = *(u32 *)key, fd = *(u32 *)value; - struct xsk_map_node *node; - struct socket *sock; - int err; - - if (unlikely(map_flags > BPF_EXIST)) - return -EINVAL; - if (unlikely(i >= m->map.max_entries)) - return -E2BIG; - - sock = sockfd_lookup(fd, &err); - if (!sock) - return err; - - if (sock->sk->sk_family != PF_XDP) { - sockfd_put(sock); - return -EOPNOTSUPP; - } - - xs = (struct xdp_sock *)sock->sk; - - if (!xsk_is_setup_for_bpf_map(xs)) { - sockfd_put(sock); - return -EOPNOTSUPP; - } - - map_entry = &m->xsk_map[i]; - node = xsk_map_node_alloc(m, map_entry); - if (IS_ERR(node)) { - sockfd_put(sock); - return PTR_ERR(node); - } - - spin_lock_bh(&m->lock); - old_xs = READ_ONCE(*map_entry); - if (old_xs == xs) { - err = 0; - goto out; - } else if (old_xs && map_flags == BPF_NOEXIST) { - err = -EEXIST; - goto out; - } else if (!old_xs && map_flags == BPF_EXIST) { - err = -ENOENT; - goto out; - } - xsk_map_sock_add(xs, node); - WRITE_ONCE(*map_entry, xs); - if (old_xs) - xsk_map_sock_delete(old_xs, map_entry); - spin_unlock_bh(&m->lock); - sockfd_put(sock); - return 0; - -out: - spin_unlock_bh(&m->lock); - sockfd_put(sock); - xsk_map_node_free(node); - return err; -} - -static int xsk_map_delete_elem(struct bpf_map *map, void *key) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *old_xs, **map_entry; - int k = *(u32 *)key; - - if (k >= map->max_entries) - return -EINVAL; - - spin_lock_bh(&m->lock); - map_entry = &m->xsk_map[k]; - old_xs = xchg(map_entry, NULL); - if (old_xs) - xsk_map_sock_delete(old_xs, map_entry); - spin_unlock_bh(&m->lock); - - return 0; -} - -void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, - struct xdp_sock **map_entry) -{ - spin_lock_bh(&map->lock); - if (READ_ONCE(*map_entry) == xs) { - WRITE_ONCE(*map_entry, NULL); - xsk_map_sock_delete(xs, map_entry); - } - spin_unlock_bh(&map->lock); -} - -const struct bpf_map_ops xsk_map_ops = { - .map_alloc = xsk_map_alloc, - .map_free = xsk_map_free, - .map_get_next_key = xsk_map_get_next_key, - .map_lookup_elem = xsk_map_lookup_elem, - .map_gen_lookup = xsk_map_gen_lookup, - .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, - .map_update_elem = xsk_map_update_elem, - .map_delete_elem = xsk_map_delete_elem, - .map_check_btf = map_check_no_btf, -}; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 06b5ea9d899d..557a9b9d2244 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6508,33 +6508,6 @@ int cgroup_bpf_attach(struct cgroup *cgrp, return ret; } -int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *old_prog, - struct bpf_prog *new_prog) -{ - struct bpf_cgroup_link *cg_link; - int ret; - - if (link->ops != &bpf_cgroup_link_lops) - return -EINVAL; - - cg_link = container_of(link, struct bpf_cgroup_link, link); - - mutex_lock(&cgroup_mutex); - /* link might have been auto-released by dying cgroup, so fail */ - if (!cg_link->cgroup) { - ret = -EINVAL; - goto out_unlock; - } - if (old_prog && link->prog != old_prog) { - ret = -EPERM; - goto out_unlock; - } - ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog); -out_unlock: - mutex_unlock(&cgroup_mutex); - return ret; -} - int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type) { diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index b1991043b7d8..334d48b16c36 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -236,7 +236,7 @@ exit_put: * sysctl_perf_event_max_contexts_per_stack. */ int perf_event_max_stack_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *value = table->data; int new_value = *value, ret; diff --git a/kernel/events/core.c b/kernel/events/core.c index e296c5c59c6f..2e330f330303 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -442,8 +442,7 @@ static void update_perf_cpu_limits(void) static bool perf_rotate_context(struct perf_cpu_context *cpuctx); int perf_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int perf_cpu = sysctl_perf_cpu_time_max_percent; @@ -467,8 +466,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write, int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3f310df4a693..0fbdee78266b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -892,7 +892,7 @@ static void unoptimize_all_kprobes(void) static DEFINE_MUTEX(kprobe_sysctl_mutex); int sysctl_kprobes_optimization; int proc_kprobes_optimization_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, + void *buffer, size_t *length, loff_t *ppos) { int ret; diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 8d1c15832e55..166d7bf49666 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -269,8 +269,8 @@ static int __init init_lstats_procfs(void) return 0; } -int sysctl_latencytop(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int err; diff --git a/kernel/module.c b/kernel/module.c index a0f201d2e184..be5413903d20 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4,6 +4,9 @@ Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. */ + +#define INCLUDE_VERMAGIC + #include <linux/export.h> #include <linux/extable.h> #include <linux/moduleloader.h> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 11db2bdbb41e..0e5ac162c3a8 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -263,7 +263,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) #ifdef CONFIG_CHECKPOINT_RESTORE static int pid_ns_ctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9fdd6a42ad6a..3132d6f860a8 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -173,7 +173,7 @@ __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char old_str[DEVKMSG_STR_MAX_SIZE]; unsigned int old; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d7669027aede..8298b2c240ce 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1111,8 +1111,7 @@ static void uclamp_update_root_tg(void) { } #endif int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; int old_min, old_max; @@ -2782,7 +2781,7 @@ void set_numabalancing_state(bool enabled) #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; @@ -2856,8 +2855,8 @@ static void __init init_schedstats(void) } #ifdef CONFIG_PROC_SYSCTL -int sysctl_schedstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0ed04d2a8959..35f4cc024dcf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -645,8 +645,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) */ int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); unsigned int factor = get_update_sysctl_factor(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 6d60ba21ed29..f395ddb75f38 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2724,9 +2724,8 @@ static void sched_rt_do_global(void) def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); } -int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int sched_rt_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int old_period, old_runtime; static DEFINE_MUTEX(mutex); @@ -2764,9 +2763,8 @@ undo: return ret; } -int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int sched_rr_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; static DEFINE_MUTEX(mutex); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 1d7b446fac7d..ba81187bb7af 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -199,7 +199,7 @@ bool sched_energy_update; #ifdef CONFIG_PROC_SYSCTL int sched_energy_aware_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret, state; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 55a6184f5990..d653d8426de9 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1776,7 +1776,7 @@ static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged, } static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b9ff323e1d26..7efe0d499c1d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -68,6 +68,9 @@ #include <linux/bpf.h> #include <linux/mount.h> #include <linux/userfaultfd_k.h> +#include <linux/coredump.h> +#include <linux/latencytop.h> +#include <linux/pid.h> #include "../lib/kstrtox.h" @@ -103,22 +106,6 @@ #if defined(CONFIG_SYSCTL) -/* External variables not in a header file. */ -extern int suid_dumpable; -#ifdef CONFIG_COREDUMP -extern int core_uses_pid; -extern char core_pattern[]; -extern unsigned int core_pipe_limit; -#endif -extern int pid_max; -extern int pid_max_min, pid_max_max; -extern int percpu_pagelist_fraction; -extern int latencytop_enabled; -extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; -#ifndef CONFIG_MMU -extern int sysctl_nr_trim_pages; -#endif - /* Constants used for minimum and maximum */ #ifdef CONFIG_LOCKUP_DETECTOR static int sixty = 60; @@ -160,24 +147,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); #ifdef CONFIG_INOTIFY_USER #include <linux/inotify.h> #endif -#ifdef CONFIG_SPARC -#endif - -#ifdef CONFIG_PARISC -extern int pwrsw_enabled; -#endif - -#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW -extern int unaligned_enabled; -#endif - -#ifdef CONFIG_IA64 -extern int unaligned_dump_stack; -#endif - -#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN -extern int no_unaligned_warning; -#endif #ifdef CONFIG_PROC_SYSCTL @@ -207,102 +176,1472 @@ enum sysctl_writes_mode { }; static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; +#endif /* CONFIG_PROC_SYSCTL */ + +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) +int sysctl_legacy_va_layout; +#endif + +#ifdef CONFIG_SCHED_DEBUG +static int min_sched_granularity_ns = 100000; /* 100 usecs */ +static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ +static int min_wakeup_granularity_ns; /* 0 usecs */ +static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ +#ifdef CONFIG_SMP +static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; +#endif /* CONFIG_SMP */ +#endif /* CONFIG_SCHED_DEBUG */ -static int proc_do_cad_pid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_taint(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); #ifdef CONFIG_COMPACTION -static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos); +static int min_extfrag_threshold; +static int max_extfrag_threshold = 1000; +#endif + +#endif /* CONFIG_SYSCTL */ + +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) +static int bpf_stats_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct static_key *key = (struct static_key *)table->data; + static int saved_val; + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&bpf_stats_enabled_mutex); + val = saved_val; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret && val != saved_val) { + if (val) + static_key_slow_inc(key); + else + static_key_slow_dec(key); + saved_val = val; + } + mutex_unlock(&bpf_stats_enabled_mutex); + return ret; +} #endif + +/* + * /proc/sys support + */ + +#ifdef CONFIG_PROC_SYSCTL + +static int _proc_do_string(char *data, int maxlen, int write, + char *buffer, size_t *lenp, loff_t *ppos) +{ + size_t len; + char c, *p; + + if (!data || !maxlen || !*lenp) { + *lenp = 0; + return 0; + } + + if (write) { + if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) { + /* Only continue writes not past the end of buffer. */ + len = strlen(data); + if (len > maxlen - 1) + len = maxlen - 1; + + if (*ppos > len) + return 0; + len = *ppos; + } else { + /* Start writing from beginning of buffer. */ + len = 0; + } + + *ppos += *lenp; + p = buffer; + while ((p - buffer) < *lenp && len < maxlen - 1) { + c = *(p++); + if (c == 0 || c == '\n') + break; + data[len++] = c; + } + data[len] = 0; + } else { + len = strlen(data); + if (len > maxlen) + len = maxlen; + + if (*ppos > len) { + *lenp = 0; + return 0; + } + + data += *ppos; + len -= *ppos; + + if (len > *lenp) + len = *lenp; + if (len) + memcpy(buffer, data, len); + if (len < *lenp) { + buffer[len] = '\n'; + len++; + } + *lenp = len; + *ppos += len; + } + return 0; +} + +static void warn_sysctl_write(struct ctl_table *table) +{ + pr_warn_once("%s wrote to %s when file position was not 0!\n" + "This will not be supported in the future. To silence this\n" + "warning, set kernel.sysctl_writes_strict = -1\n", + current->comm, table->procname); +} + +/** + * proc_first_pos_non_zero_ignore - check if first position is allowed + * @ppos: file position + * @table: the sysctl table + * + * Returns true if the first position is non-zero and the sysctl_writes_strict + * mode indicates this is not allowed for numeric input types. String proc + * handlers can ignore the return value. + */ +static bool proc_first_pos_non_zero_ignore(loff_t *ppos, + struct ctl_table *table) +{ + if (!*ppos) + return false; + + switch (sysctl_writes_strict) { + case SYSCTL_WRITES_STRICT: + return true; + case SYSCTL_WRITES_WARN: + warn_sysctl_write(table); + return false; + default: + return false; + } +} + +/** + * proc_dostring - read a string sysctl + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes a string from/to the user buffer. If the kernel + * buffer provided is not large enough to hold the string, the + * string is truncated. The copied string is %NULL-terminated. + * If the string is being read by the user process, it is copied + * and a newline '\n' is added. It is truncated if the buffer is + * not large enough. + * + * Returns 0 on success. + */ +int proc_dostring(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + if (write) + proc_first_pos_non_zero_ignore(ppos, table); + + return _proc_do_string(table->data, table->maxlen, write, buffer, lenp, + ppos); +} + +static size_t proc_skip_spaces(char **buf) +{ + size_t ret; + char *tmp = skip_spaces(*buf); + ret = tmp - *buf; + *buf = tmp; + return ret; +} + +static void proc_skip_char(char **buf, size_t *size, const char v) +{ + while (*size) { + if (**buf != v) + break; + (*size)--; + (*buf)++; + } +} + +/** + * strtoul_lenient - parse an ASCII formatted integer from a buffer and only + * fail on overflow + * + * @cp: kernel buffer containing the string to parse + * @endp: pointer to store the trailing characters + * @base: the base to use + * @res: where the parsed integer will be stored + * + * In case of success 0 is returned and @res will contain the parsed integer, + * @endp will hold any trailing characters. + * This function will fail the parse on overflow. If there wasn't an overflow + * the function will defer the decision what characters count as invalid to the + * caller. + */ +static int strtoul_lenient(const char *cp, char **endp, unsigned int base, + unsigned long *res) +{ + unsigned long long result; + unsigned int rv; + + cp = _parse_integer_fixup_radix(cp, &base); + rv = _parse_integer(cp, base, &result); + if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result)) + return -ERANGE; + + cp += rv; + + if (endp) + *endp = (char *)cp; + + *res = (unsigned long)result; + return 0; +} + +#define TMPBUFLEN 22 +/** + * proc_get_long - reads an ASCII formatted integer from a user buffer + * + * @buf: a kernel buffer + * @size: size of the kernel buffer + * @val: this is where the number will be stored + * @neg: set to %TRUE if number is negative + * @perm_tr: a vector which contains the allowed trailers + * @perm_tr_len: size of the perm_tr vector + * @tr: pointer to store the trailer character + * + * In case of success %0 is returned and @buf and @size are updated with + * the amount of bytes read. If @tr is non-NULL and a trailing + * character exists (size is non-zero after returning from this + * function), @tr is updated with the trailing character. + */ +static int proc_get_long(char **buf, size_t *size, + unsigned long *val, bool *neg, + const char *perm_tr, unsigned perm_tr_len, char *tr) +{ + int len; + char *p, tmp[TMPBUFLEN]; + + if (!*size) + return -EINVAL; + + len = *size; + if (len > TMPBUFLEN - 1) + len = TMPBUFLEN - 1; + + memcpy(tmp, *buf, len); + + tmp[len] = 0; + p = tmp; + if (*p == '-' && *size > 1) { + *neg = true; + p++; + } else + *neg = false; + if (!isdigit(*p)) + return -EINVAL; + + if (strtoul_lenient(p, &p, 0, val)) + return -EINVAL; + + len = p - tmp; + + /* We don't know if the next char is whitespace thus we may accept + * invalid integers (e.g. 1234...a) or two integers instead of one + * (e.g. 123...1). So lets not allow such large numbers. */ + if (len == TMPBUFLEN - 1) + return -EINVAL; + + if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len)) + return -EINVAL; + + if (tr && (len < *size)) + *tr = *p; + + *buf += len; + *size -= len; + + return 0; +} + +/** + * proc_put_long - converts an integer to a decimal ASCII formatted string + * + * @buf: the user buffer + * @size: the size of the user buffer + * @val: the integer to be converted + * @neg: sign of the number, %TRUE for negative + * + * In case of success @buf and @size are updated with the amount of bytes + * written. + */ +static void proc_put_long(void **buf, size_t *size, unsigned long val, bool neg) +{ + int len; + char tmp[TMPBUFLEN], *p = tmp; + + sprintf(p, "%s%lu", neg ? "-" : "", val); + len = strlen(tmp); + if (len > *size) + len = *size; + memcpy(*buf, tmp, len); + *size -= len; + *buf += len; +} +#undef TMPBUFLEN + +static void proc_put_char(void **buf, size_t *size, char c) +{ + if (*size) { + char **buffer = (char **)buf; + **buffer = c; + + (*size)--; + (*buffer)++; + *buf = *buffer; + } +} + +static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + if (*negp) { + if (*lvalp > (unsigned long) INT_MAX + 1) + return -EINVAL; + *valp = -*lvalp; + } else { + if (*lvalp > (unsigned long) INT_MAX) + return -EINVAL; + *valp = *lvalp; + } + } else { + int val = *valp; + if (val < 0) { + *negp = true; + *lvalp = -(unsigned long)val; + } else { + *negp = false; + *lvalp = (unsigned long)val; + } + } + return 0; +} + +static int do_proc_douintvec_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) +{ + if (write) { + if (*lvalp > UINT_MAX) + return -EINVAL; + *valp = *lvalp; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long)val; + } + return 0; +} + +static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; + +static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, + int write, void *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(bool *negp, unsigned long *lvalp, int *valp, + int write, void *data), + void *data) +{ + int *i, vleft, first = 1, err = 0; + size_t left; + char *p; + + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + i = (int *) tbl_data; + vleft = table->maxlen / sizeof(*i); + left = *lenp; + + if (!conv) + conv = do_proc_dointvec_conv; + + if (write) { + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto out; + + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + p = buffer; + } + + for (; left && vleft--; i++, first=0) { + unsigned long lval; + bool neg; + + if (write) { + left -= proc_skip_spaces(&p); + + if (!left) + break; + err = proc_get_long(&p, &left, &lval, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err) + break; + if (conv(&neg, &lval, i, 1, data)) { + err = -EINVAL; + break; + } + } else { + if (conv(&neg, &lval, i, 0, data)) { + err = -EINVAL; + break; + } + if (!first) + proc_put_char(&buffer, &left, '\t'); + proc_put_long(&buffer, &left, lval, neg); + } + } + + if (!write && !first && left && !err) + proc_put_char(&buffer, &left, '\n'); + if (write && !err && left) + left -= proc_skip_spaces(&p); + if (write && first) + return err ? : -EINVAL; + *lenp -= left; +out: + *ppos += *lenp; + return err; +} + +static int do_proc_dointvec(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(bool *negp, unsigned long *lvalp, int *valp, + int write, void *data), + void *data) +{ + return __do_proc_dointvec(table->data, table, write, + buffer, lenp, ppos, conv, data); +} + +static int do_proc_douintvec_w(unsigned int *tbl_data, + struct ctl_table *table, + void *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned long lval; + int err = 0; + size_t left; + bool neg; + char *p = buffer; + + left = *lenp; + + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto bail_early; + + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + + left -= proc_skip_spaces(&p); + if (!left) { + err = -EINVAL; + goto out_free; + } + + err = proc_get_long(&p, &left, &lval, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err || neg) { + err = -EINVAL; + goto out_free; + } + + if (conv(&lval, tbl_data, 1, data)) { + err = -EINVAL; + goto out_free; + } + + if (!err && left) + left -= proc_skip_spaces(&p); + +out_free: + if (err) + return -EINVAL; + + return 0; + + /* This is in keeping with old __do_proc_dointvec() */ +bail_early: + *ppos += *lenp; + return err; +} + +static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned long lval; + int err = 0; + size_t left; + + left = *lenp; + + if (conv(&lval, tbl_data, 0, data)) { + err = -EINVAL; + goto out; + } + + proc_put_long(&buffer, &left, lval, false); + if (!left) + goto out; + + proc_put_char(&buffer, &left, '\n'); + +out: + *lenp -= left; + *ppos += *lenp; + + return err; +} + +static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, + int write, void *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned int *i, vleft; + + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + i = (unsigned int *) tbl_data; + vleft = table->maxlen / sizeof(*i); + + /* + * Arrays are not supported, keep this simple. *Do not* add + * support for them. + */ + if (vleft != 1) { + *lenp = 0; + return -EINVAL; + } + + if (!conv) + conv = do_proc_douintvec_conv; + + if (write) + return do_proc_douintvec_w(i, table, buffer, lenp, ppos, + conv, data); + return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data); +} + +static int do_proc_douintvec(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + return __do_proc_douintvec(table->data, table, write, + buffer, lenp, ppos, conv, data); +} + +/** + * proc_dointvec - read a vector of integers + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * Returns 0 on success. + */ +int proc_dointvec(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); +} + +#ifdef CONFIG_COMPACTION +static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, + int write, void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, old; + + if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write) + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + old = *(int *)table->data; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret) + return ret; + if (old != *(int *)table->data) + pr_warn_once("sysctl attribute %s changed by %s[%d]\n", + table->procname, current->comm, + task_pid_nr(current)); + return ret; +} #endif +/** + * proc_douintvec - read a vector of unsigned integers + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer + * values from/to the user buffer, treated as an ASCII string. + * + * Returns 0 on success. + */ +int proc_douintvec(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_conv, NULL); +} + +/* + * Taint values can only be increased + * This means we can safely use a temporary. + */ +static int proc_taint(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + unsigned long tmptaint = get_taint(); + int err; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &tmptaint; + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + + if (write) { + /* + * Poor man's atomic or. Not worth adding a primitive + * to everyone's atomic.h for this + */ + int i; + for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { + if ((tmptaint >> i) & 1) + add_taint(i, LOCKDEP_STILL_OK); + } + } + + return err; +} + #ifdef CONFIG_PRINTK static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos) +{ + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +#endif + +/** + * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure + * @min: pointer to minimum allowable value + * @max: pointer to maximum allowable value + * + * The do_proc_dointvec_minmax_conv_param structure provides the + * minimum and maximum values for doing range checking for those sysctl + * parameters that use the proc_dointvec_minmax() handler. + */ +struct do_proc_dointvec_minmax_conv_param { + int *min; + int *max; +}; + +static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + int tmp, ret; + struct do_proc_dointvec_minmax_conv_param *param = data; + /* + * If writing, first do so via a temporary local int so we can + * bounds-check it before touching *valp. + */ + int *ip = write ? &tmp : valp; + + ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data); + if (ret) + return ret; + + if (write) { + if ((param->min && *param->min > tmp) || + (param->max && *param->max < tmp)) + return -EINVAL; + *valp = tmp; + } + + return 0; +} + +/** + * proc_dointvec_minmax - read a vector of integers with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success or -EINVAL on write when the range check fails. + */ +int proc_dointvec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_dointvec_minmax_conv_param param = { + .min = (int *) table->extra1, + .max = (int *) table->extra2, + }; + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dointvec_minmax_conv, ¶m); +} + +/** + * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure + * @min: pointer to minimum allowable value + * @max: pointer to maximum allowable value + * + * The do_proc_douintvec_minmax_conv_param structure provides the + * minimum and maximum values for doing range checking for those sysctl + * parameters that use the proc_douintvec_minmax() handler. + */ +struct do_proc_douintvec_minmax_conv_param { + unsigned int *min; + unsigned int *max; +}; + +static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) +{ + int ret; + unsigned int tmp; + struct do_proc_douintvec_minmax_conv_param *param = data; + /* write via temporary local uint for bounds-checking */ + unsigned int *up = write ? &tmp : valp; + + ret = do_proc_douintvec_conv(lvalp, up, write, data); + if (ret) + return ret; + + if (write) { + if ((param->min && *param->min > tmp) || + (param->max && *param->max < tmp)) + return -ERANGE; + + *valp = tmp; + } + + return 0; +} + +/** + * proc_douintvec_minmax - read a vector of unsigned ints with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer + * values from/to the user buffer, treated as an ASCII string. Negative + * strings are not allowed. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). There is a final sanity + * check for UINT_MAX to avoid having to support wrap around uses from + * userspace. + * + * Returns 0 on success or -ERANGE on write when the range check fails. + */ +int proc_douintvec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_douintvec_minmax_conv_param param = { + .min = (unsigned int *) table->extra1, + .max = (unsigned int *) table->extra2, + }; + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_minmax_conv, ¶m); +} + +static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) +{ + if (write) { + unsigned int val; + + val = round_pipe_size(*lvalp); + if (val == 0) + return -EINVAL; + + *valp = val; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long) val; + } + + return 0; +} + +static int proc_dopipe_max_size(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_dopipe_max_size_conv, NULL); +} + +static void validate_coredump_safety(void) +{ +#ifdef CONFIG_COREDUMP + if (suid_dumpable == SUID_DUMP_ROOT && + core_pattern[0] != '/' && core_pattern[0] != '|') { + printk(KERN_WARNING +"Unsafe core_pattern used with fs.suid_dumpable=2.\n" +"Pipe handler or fully qualified core dump path required.\n" +"Set kernel.core_pattern before fs.suid_dumpable.\n" + ); + } #endif +} static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!error) + validate_coredump_safety(); + return error; +} + #ifdef CONFIG_COREDUMP static int proc_dostring_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dostring(table, write, buffer, lenp, ppos); + if (!error) + validate_coredump_safety(); + return error; +} #endif -static int proc_dopipe_max_size(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); #ifdef CONFIG_MAGIC_SYSRQ static int sysrq_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -#endif + void *buffer, size_t *lenp, loff_t *ppos) +{ + int tmp, ret; -static struct ctl_table kern_table[]; -static struct ctl_table vm_table[]; -static struct ctl_table fs_table[]; -static struct ctl_table debug_table[]; -static struct ctl_table dev_table[]; -extern struct ctl_table random_table[]; -#ifdef CONFIG_EPOLL -extern struct ctl_table epoll_table[]; -#endif + tmp = sysrq_mask(); -#ifdef CONFIG_FW_LOADER_USER_HELPER -extern struct ctl_table firmware_config_table[]; -#endif + ret = __do_proc_dointvec(&tmp, table, write, buffer, + lenp, ppos, NULL, NULL); + if (ret || !write) + return ret; -#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ - defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) -int sysctl_legacy_va_layout; + if (write) + sysrq_toggle_support(tmp); + + return 0; +} #endif -/* The default sysctl tables: */ +static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, + int write, void *buffer, size_t *lenp, loff_t *ppos, + unsigned long convmul, unsigned long convdiv) +{ + unsigned long *i, *min, *max; + int vleft, first = 1, err = 0; + size_t left; + char *p; -static struct ctl_table sysctl_base_table[] = { - { - .procname = "kernel", - .mode = 0555, - .child = kern_table, - }, - { - .procname = "vm", - .mode = 0555, - .child = vm_table, - }, - { - .procname = "fs", - .mode = 0555, - .child = fs_table, - }, - { - .procname = "debug", - .mode = 0555, - .child = debug_table, - }, - { - .procname = "dev", - .mode = 0555, - .child = dev_table, - }, - { } -}; + if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } -#ifdef CONFIG_SCHED_DEBUG -static int min_sched_granularity_ns = 100000; /* 100 usecs */ -static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ -static int min_wakeup_granularity_ns; /* 0 usecs */ -static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ -#ifdef CONFIG_SMP -static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; -static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; -#endif /* CONFIG_SMP */ -#endif /* CONFIG_SCHED_DEBUG */ + i = (unsigned long *) data; + min = (unsigned long *) table->extra1; + max = (unsigned long *) table->extra2; + vleft = table->maxlen / sizeof(unsigned long); + left = *lenp; -#ifdef CONFIG_COMPACTION -static int min_extfrag_threshold; -static int max_extfrag_threshold = 1000; -#endif + if (write) { + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto out; + + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + p = buffer; + } + + for (; left && vleft--; i++, first = 0) { + unsigned long val; + + if (write) { + bool neg; + + left -= proc_skip_spaces(&p); + if (!left) + break; + + err = proc_get_long(&p, &left, &val, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err) + break; + if (neg) + continue; + val = convmul * val / convdiv; + if ((min && val < *min) || (max && val > *max)) { + err = -EINVAL; + break; + } + *i = val; + } else { + val = convdiv * (*i) / convmul; + if (!first) + proc_put_char(&buffer, &left, '\t'); + proc_put_long(&buffer, &left, val, false); + } + } + + if (!write && !first && left && !err) + proc_put_char(&buffer, &left, '\n'); + if (write && !err) + left -= proc_skip_spaces(&p); + if (write && first) + return err ? : -EINVAL; + *lenp -= left; +out: + *ppos += *lenp; + return err; +} + +static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, + unsigned long convdiv) +{ + return __do_proc_doulongvec_minmax(table->data, table, write, + buffer, lenp, ppos, convmul, convdiv); +} + +/** + * proc_doulongvec_minmax - read a vector of long integers with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long + * values from/to the user buffer, treated as an ASCII string. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. + */ +int proc_doulongvec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); +} + +/** + * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long + * values from/to the user buffer, treated as an ASCII string. The values + * are treated as milliseconds, and converted to jiffies when they are stored. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. + */ +int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_doulongvec_minmax(table, write, buffer, + lenp, ppos, HZ, 1000l); +} + + +static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + if (*lvalp > INT_MAX / HZ) + return 1; + *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); + } else { + int val = *valp; + unsigned long lval; + if (val < 0) { + *negp = true; + lval = -(unsigned long)val; + } else { + *negp = false; + lval = (unsigned long)val; + } + *lvalp = lval / HZ; + } + return 0; +} + +static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ) + return 1; + *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); + } else { + int val = *valp; + unsigned long lval; + if (val < 0) { + *negp = true; + lval = -(unsigned long)val; + } else { + *negp = false; + lval = (unsigned long)val; + } + *lvalp = jiffies_to_clock_t(lval); + } + return 0; +} + +static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp); + + if (jif > INT_MAX) + return 1; + *valp = (int)jif; + } else { + int val = *valp; + unsigned long lval; + if (val < 0) { + *negp = true; + lval = -(unsigned long)val; + } else { + *negp = false; + lval = (unsigned long)val; + } + *lvalp = jiffies_to_msecs(lval); + } + return 0; +} + +/** + * proc_dointvec_jiffies - read a vector of integers as seconds + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in seconds, and are converted into + * jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_jiffies(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table,write,buffer,lenp,ppos, + do_proc_dointvec_jiffies_conv,NULL); +} + +/** + * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: pointer to the file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/USER_HZ seconds, and + * are converted into jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table,write,buffer,lenp,ppos, + do_proc_dointvec_userhz_jiffies_conv,NULL); +} + +/** + * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * @ppos: the current position in the file + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/1000 seconds, and + * are converted into jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dointvec_ms_jiffies_conv, NULL); +} + +static int proc_do_cad_pid(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + struct pid *new_pid; + pid_t tmp; + int r; + + tmp = pid_vnr(cad_pid); + + r = __do_proc_dointvec(&tmp, table, write, buffer, + lenp, ppos, NULL, NULL); + if (r || !write) + return r; + + new_pid = find_get_pid(tmp); + if (!new_pid) + return -ESRCH; + + put_pid(xchg(&cad_pid, new_pid)); + return 0; +} + +/** + * proc_do_large_bitmap - read/write from/to a large bitmap + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * The bitmap is stored at table->data and the bitmap length (in bits) + * in table->maxlen. + * + * We use a range comma separated format (e.g. 1,3-4,10-10) so that + * large bitmaps may be represented in a compact manner. Writing into + * the file will clear the bitmap then update it with the given input. + * + * Returns 0 on success. + */ +int proc_do_large_bitmap(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int err = 0; + bool first = 1; + size_t left = *lenp; + unsigned long bitmap_len = table->maxlen; + unsigned long *bitmap = *(unsigned long **) table->data; + unsigned long *tmp_bitmap = NULL; + char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; + + if (!bitmap || !bitmap_len || !left || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + if (write) { + char *p = buffer; + size_t skipped = 0; + + if (left > PAGE_SIZE - 1) { + left = PAGE_SIZE - 1; + /* How much of the buffer we'll skip this pass */ + skipped = *lenp - left; + } + + tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL); + if (!tmp_bitmap) + return -ENOMEM; + proc_skip_char(&p, &left, '\n'); + while (!err && left) { + unsigned long val_a, val_b; + bool neg; + size_t saved_left; + + /* In case we stop parsing mid-number, we can reset */ + saved_left = left; + err = proc_get_long(&p, &left, &val_a, &neg, tr_a, + sizeof(tr_a), &c); + /* + * If we consumed the entirety of a truncated buffer or + * only one char is left (may be a "-"), then stop here, + * reset, & come back for more. + */ + if ((left <= 1) && skipped) { + left = saved_left; + break; + } + + if (err) + break; + if (val_a >= bitmap_len || neg) { + err = -EINVAL; + break; + } + + val_b = val_a; + if (left) { + p++; + left--; + } + + if (c == '-') { + err = proc_get_long(&p, &left, &val_b, + &neg, tr_b, sizeof(tr_b), + &c); + /* + * If we consumed all of a truncated buffer or + * then stop here, reset, & come back for more. + */ + if (!left && skipped) { + left = saved_left; + break; + } + + if (err) + break; + if (val_b >= bitmap_len || neg || + val_a > val_b) { + err = -EINVAL; + break; + } + if (left) { + p++; + left--; + } + } + + bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1); + first = 0; + proc_skip_char(&p, &left, '\n'); + } + left += skipped; + } else { + unsigned long bit_a, bit_b = 0; + + while (left) { + bit_a = find_next_bit(bitmap, bitmap_len, bit_b); + if (bit_a >= bitmap_len) + break; + bit_b = find_next_zero_bit(bitmap, bitmap_len, + bit_a + 1) - 1; + + if (!first) + proc_put_char(&buffer, &left, ','); + proc_put_long(&buffer, &left, bit_a, false); + if (bit_a != bit_b) { + proc_put_char(&buffer, &left, '-'); + proc_put_long(&buffer, &left, bit_b, false); + } + + first = 0; bit_b++; + } + proc_put_char(&buffer, &left, '\n'); + } + + if (!err) { + if (write) { + if (*ppos) + bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); + else + bitmap_copy(bitmap, tmp_bitmap, bitmap_len); + } + *lenp -= left; + *ppos += *lenp; + } + + bitmap_free(tmp_bitmap); + return err; +} + +#else /* CONFIG_PROC_SYSCTL */ + +int proc_dostring(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_douintvec(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_douintvec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_jiffies(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_doulongvec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_do_large_bitmap(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +#endif /* CONFIG_PROC_SYSCTL */ + +#if defined(CONFIG_SYSCTL) +int proc_do_static_key(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct static_key *key = (struct static_key *)table->data; + static DEFINE_MUTEX(static_key_mutex); + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&static_key_mutex); + val = static_key_enabled(key); + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + if (val) + static_key_enable(key); + else + static_key_disable(key); + } + mutex_unlock(&static_key_mutex); + return ret; +} static struct ctl_table kern_table[] = { { @@ -613,7 +1952,7 @@ static struct ctl_table kern_table[] = { .procname = "soft-power", .data = &pwrsw_enabled, .maxlen = sizeof (int), - .mode = 0644, + .mode = 0644, .proc_handler = proc_dointvec, }, #endif @@ -1074,7 +2413,7 @@ static struct ctl_table kern_table[] = { .procname = "ignore-unaligned-usertrap", .data = &no_unaligned_warning, .maxlen = sizeof (int), - .mode = 0644, + .mode = 0644, .proc_handler = proc_dointvec, }, #endif @@ -1246,7 +2585,7 @@ static struct ctl_table kern_table[] = { .data = &bpf_stats_enabled_key.key, .maxlen = sizeof(bpf_stats_enabled_key), .mode = 0644, - .proc_handler = proc_do_static_key, + .proc_handler = bpf_stats_handler, }, #endif #if defined(CONFIG_TREE_RCU) @@ -1322,7 +2661,7 @@ static struct ctl_table vm_table[] = { .proc_handler = overcommit_kbytes_handler, }, { - .procname = "page-cluster", + .procname = "page-cluster", .data = &page_cluster, .maxlen = sizeof(int), .mode = 0644, @@ -1493,7 +2832,7 @@ static struct ctl_table vm_table[] = { .data = &watermark_boost_factor, .maxlen = sizeof(watermark_boost_factor), .mode = 0644, - .proc_handler = watermark_boost_factor_sysctl_handler, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { @@ -1961,6 +3300,35 @@ static struct ctl_table dev_table[] = { { } }; +static struct ctl_table sysctl_base_table[] = { + { + .procname = "kernel", + .mode = 0555, + .child = kern_table, + }, + { + .procname = "vm", + .mode = 0555, + .child = vm_table, + }, + { + .procname = "fs", + .mode = 0555, + .child = fs_table, + }, + { + .procname = "debug", + .mode = 0555, + .child = debug_table, + }, + { + .procname = "dev", + .mode = 0555, + .child = dev_table, + }, + { } +}; + int __init sysctl_init(void) { struct ctl_table_header *hdr; @@ -1969,1474 +3337,7 @@ int __init sysctl_init(void) kmemleak_not_leak(hdr); return 0; } - #endif /* CONFIG_SYSCTL */ - -/* - * /proc/sys support - */ - -#ifdef CONFIG_PROC_SYSCTL - -static int _proc_do_string(char *data, int maxlen, int write, - char __user *buffer, - size_t *lenp, loff_t *ppos) -{ - size_t len; - char __user *p; - char c; - - if (!data || !maxlen || !*lenp) { - *lenp = 0; - return 0; - } - - if (write) { - if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) { - /* Only continue writes not past the end of buffer. */ - len = strlen(data); - if (len > maxlen - 1) - len = maxlen - 1; - - if (*ppos > len) - return 0; - len = *ppos; - } else { - /* Start writing from beginning of buffer. */ - len = 0; - } - - *ppos += *lenp; - p = buffer; - while ((p - buffer) < *lenp && len < maxlen - 1) { - if (get_user(c, p++)) - return -EFAULT; - if (c == 0 || c == '\n') - break; - data[len++] = c; - } - data[len] = 0; - } else { - len = strlen(data); - if (len > maxlen) - len = maxlen; - - if (*ppos > len) { - *lenp = 0; - return 0; - } - - data += *ppos; - len -= *ppos; - - if (len > *lenp) - len = *lenp; - if (len) - if (copy_to_user(buffer, data, len)) - return -EFAULT; - if (len < *lenp) { - if (put_user('\n', buffer + len)) - return -EFAULT; - len++; - } - *lenp = len; - *ppos += len; - } - return 0; -} - -static void warn_sysctl_write(struct ctl_table *table) -{ - pr_warn_once("%s wrote to %s when file position was not 0!\n" - "This will not be supported in the future. To silence this\n" - "warning, set kernel.sysctl_writes_strict = -1\n", - current->comm, table->procname); -} - -/** - * proc_first_pos_non_zero_ignore - check if first position is allowed - * @ppos: file position - * @table: the sysctl table - * - * Returns true if the first position is non-zero and the sysctl_writes_strict - * mode indicates this is not allowed for numeric input types. String proc - * handlers can ignore the return value. - */ -static bool proc_first_pos_non_zero_ignore(loff_t *ppos, - struct ctl_table *table) -{ - if (!*ppos) - return false; - - switch (sysctl_writes_strict) { - case SYSCTL_WRITES_STRICT: - return true; - case SYSCTL_WRITES_WARN: - warn_sysctl_write(table); - return false; - default: - return false; - } -} - -/** - * proc_dostring - read a string sysctl - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes a string from/to the user buffer. If the kernel - * buffer provided is not large enough to hold the string, the - * string is truncated. The copied string is %NULL-terminated. - * If the string is being read by the user process, it is copied - * and a newline '\n' is added. It is truncated if the buffer is - * not large enough. - * - * Returns 0 on success. - */ -int proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - if (write) - proc_first_pos_non_zero_ignore(ppos, table); - - return _proc_do_string((char *)(table->data), table->maxlen, write, - (char __user *)buffer, lenp, ppos); -} - -static size_t proc_skip_spaces(char **buf) -{ - size_t ret; - char *tmp = skip_spaces(*buf); - ret = tmp - *buf; - *buf = tmp; - return ret; -} - -static void proc_skip_char(char **buf, size_t *size, const char v) -{ - while (*size) { - if (**buf != v) - break; - (*size)--; - (*buf)++; - } -} - -/** - * strtoul_lenient - parse an ASCII formatted integer from a buffer and only - * fail on overflow - * - * @cp: kernel buffer containing the string to parse - * @endp: pointer to store the trailing characters - * @base: the base to use - * @res: where the parsed integer will be stored - * - * In case of success 0 is returned and @res will contain the parsed integer, - * @endp will hold any trailing characters. - * This function will fail the parse on overflow. If there wasn't an overflow - * the function will defer the decision what characters count as invalid to the - * caller. - */ -static int strtoul_lenient(const char *cp, char **endp, unsigned int base, - unsigned long *res) -{ - unsigned long long result; - unsigned int rv; - - cp = _parse_integer_fixup_radix(cp, &base); - rv = _parse_integer(cp, base, &result); - if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result)) - return -ERANGE; - - cp += rv; - - if (endp) - *endp = (char *)cp; - - *res = (unsigned long)result; - return 0; -} - -#define TMPBUFLEN 22 -/** - * proc_get_long - reads an ASCII formatted integer from a user buffer - * - * @buf: a kernel buffer - * @size: size of the kernel buffer - * @val: this is where the number will be stored - * @neg: set to %TRUE if number is negative - * @perm_tr: a vector which contains the allowed trailers - * @perm_tr_len: size of the perm_tr vector - * @tr: pointer to store the trailer character - * - * In case of success %0 is returned and @buf and @size are updated with - * the amount of bytes read. If @tr is non-NULL and a trailing - * character exists (size is non-zero after returning from this - * function), @tr is updated with the trailing character. - */ -static int proc_get_long(char **buf, size_t *size, - unsigned long *val, bool *neg, - const char *perm_tr, unsigned perm_tr_len, char *tr) -{ - int len; - char *p, tmp[TMPBUFLEN]; - - if (!*size) - return -EINVAL; - - len = *size; - if (len > TMPBUFLEN - 1) - len = TMPBUFLEN - 1; - - memcpy(tmp, *buf, len); - - tmp[len] = 0; - p = tmp; - if (*p == '-' && *size > 1) { - *neg = true; - p++; - } else - *neg = false; - if (!isdigit(*p)) - return -EINVAL; - - if (strtoul_lenient(p, &p, 0, val)) - return -EINVAL; - - len = p - tmp; - - /* We don't know if the next char is whitespace thus we may accept - * invalid integers (e.g. 1234...a) or two integers instead of one - * (e.g. 123...1). So lets not allow such large numbers. */ - if (len == TMPBUFLEN - 1) - return -EINVAL; - - if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len)) - return -EINVAL; - - if (tr && (len < *size)) - *tr = *p; - - *buf += len; - *size -= len; - - return 0; -} - -/** - * proc_put_long - converts an integer to a decimal ASCII formatted string - * - * @buf: the user buffer - * @size: the size of the user buffer - * @val: the integer to be converted - * @neg: sign of the number, %TRUE for negative - * - * In case of success %0 is returned and @buf and @size are updated with - * the amount of bytes written. - */ -static int proc_put_long(void __user **buf, size_t *size, unsigned long val, - bool neg) -{ - int len; - char tmp[TMPBUFLEN], *p = tmp; - - sprintf(p, "%s%lu", neg ? "-" : "", val); - len = strlen(tmp); - if (len > *size) - len = *size; - if (copy_to_user(*buf, tmp, len)) - return -EFAULT; - *size -= len; - *buf += len; - return 0; -} -#undef TMPBUFLEN - -static int proc_put_char(void __user **buf, size_t *size, char c) -{ - if (*size) { - char __user **buffer = (char __user **)buf; - if (put_user(c, *buffer)) - return -EFAULT; - (*size)--, (*buffer)++; - *buf = *buffer; - } - return 0; -} - -static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - if (*negp) { - if (*lvalp > (unsigned long) INT_MAX + 1) - return -EINVAL; - *valp = -*lvalp; - } else { - if (*lvalp > (unsigned long) INT_MAX) - return -EINVAL; - *valp = *lvalp; - } - } else { - int val = *valp; - if (val < 0) { - *negp = true; - *lvalp = -(unsigned long)val; - } else { - *negp = false; - *lvalp = (unsigned long)val; - } - } - return 0; -} - -static int do_proc_douintvec_conv(unsigned long *lvalp, - unsigned int *valp, - int write, void *data) -{ - if (write) { - if (*lvalp > UINT_MAX) - return -EINVAL; - *valp = *lvalp; - } else { - unsigned int val = *valp; - *lvalp = (unsigned long)val; - } - return 0; -} - -static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; - -static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos, - int (*conv)(bool *negp, unsigned long *lvalp, int *valp, - int write, void *data), - void *data) -{ - int *i, vleft, first = 1, err = 0; - size_t left; - char *kbuf = NULL, *p; - - if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { - *lenp = 0; - return 0; - } - - i = (int *) tbl_data; - vleft = table->maxlen / sizeof(*i); - left = *lenp; - - if (!conv) - conv = do_proc_dointvec_conv; - - if (write) { - if (proc_first_pos_non_zero_ignore(ppos, table)) - goto out; - - if (left > PAGE_SIZE - 1) - left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - } - - for (; left && vleft--; i++, first=0) { - unsigned long lval; - bool neg; - - if (write) { - left -= proc_skip_spaces(&p); - - if (!left) - break; - err = proc_get_long(&p, &left, &lval, &neg, - proc_wspace_sep, - sizeof(proc_wspace_sep), NULL); - if (err) - break; - if (conv(&neg, &lval, i, 1, data)) { - err = -EINVAL; - break; - } - } else { - if (conv(&neg, &lval, i, 0, data)) { - err = -EINVAL; - break; - } - if (!first) - err = proc_put_char(&buffer, &left, '\t'); - if (err) - break; - err = proc_put_long(&buffer, &left, lval, neg); - if (err) - break; - } - } - - if (!write && !first && left && !err) - err = proc_put_char(&buffer, &left, '\n'); - if (write && !err && left) - left -= proc_skip_spaces(&p); - if (write) { - kfree(kbuf); - if (first) - return err ? : -EINVAL; - } - *lenp -= left; -out: - *ppos += *lenp; - return err; -} - -static int do_proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(bool *negp, unsigned long *lvalp, int *valp, - int write, void *data), - void *data) -{ - return __do_proc_dointvec(table->data, table, write, - buffer, lenp, ppos, conv, data); -} - -static int do_proc_douintvec_w(unsigned int *tbl_data, - struct ctl_table *table, - void __user *buffer, - size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data) -{ - unsigned long lval; - int err = 0; - size_t left; - bool neg; - char *kbuf = NULL, *p; - - left = *lenp; - - if (proc_first_pos_non_zero_ignore(ppos, table)) - goto bail_early; - - if (left > PAGE_SIZE - 1) - left = PAGE_SIZE - 1; - - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return -EINVAL; - - left -= proc_skip_spaces(&p); - if (!left) { - err = -EINVAL; - goto out_free; - } - - err = proc_get_long(&p, &left, &lval, &neg, - proc_wspace_sep, - sizeof(proc_wspace_sep), NULL); - if (err || neg) { - err = -EINVAL; - goto out_free; - } - - if (conv(&lval, tbl_data, 1, data)) { - err = -EINVAL; - goto out_free; - } - - if (!err && left) - left -= proc_skip_spaces(&p); - -out_free: - kfree(kbuf); - if (err) - return -EINVAL; - - return 0; - - /* This is in keeping with old __do_proc_dointvec() */ -bail_early: - *ppos += *lenp; - return err; -} - -static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, - size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data) -{ - unsigned long lval; - int err = 0; - size_t left; - - left = *lenp; - - if (conv(&lval, tbl_data, 0, data)) { - err = -EINVAL; - goto out; - } - - err = proc_put_long(&buffer, &left, lval, false); - if (err || !left) - goto out; - - err = proc_put_char(&buffer, &left, '\n'); - -out: - *lenp -= left; - *ppos += *lenp; - - return err; -} - -static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data) -{ - unsigned int *i, vleft; - - if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { - *lenp = 0; - return 0; - } - - i = (unsigned int *) tbl_data; - vleft = table->maxlen / sizeof(*i); - - /* - * Arrays are not supported, keep this simple. *Do not* add - * support for them. - */ - if (vleft != 1) { - *lenp = 0; - return -EINVAL; - } - - if (!conv) - conv = do_proc_douintvec_conv; - - if (write) - return do_proc_douintvec_w(i, table, buffer, lenp, ppos, - conv, data); - return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data); -} - -static int do_proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data) -{ - return __do_proc_douintvec(table->data, table, write, - buffer, lenp, ppos, conv, data); -} - -/** - * proc_dointvec - read a vector of integers - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * - * Returns 0 on success. - */ -int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); -} - -#ifdef CONFIG_COMPACTION -static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - int ret, old; - - if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write) - return proc_dointvec_minmax(table, write, buffer, lenp, ppos); - - old = *(int *)table->data; - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (ret) - return ret; - if (old != *(int *)table->data) - pr_warn_once("sysctl attribute %s changed by %s[%d]\n", - table->procname, current->comm, - task_pid_nr(current)); - return ret; -} -#endif - -/** - * proc_douintvec - read a vector of unsigned integers - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer - * values from/to the user buffer, treated as an ASCII string. - * - * Returns 0 on success. - */ -int proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_douintvec(table, write, buffer, lenp, ppos, - do_proc_douintvec_conv, NULL); -} - -/* - * Taint values can only be increased - * This means we can safely use a temporary. - */ -static int proc_taint(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table t; - unsigned long tmptaint = get_taint(); - int err; - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - t = *table; - t.data = &tmptaint; - err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); - if (err < 0) - return err; - - if (write) { - /* - * Poor man's atomic or. Not worth adding a primitive - * to everyone's atomic.h for this - */ - int i; - for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { - if ((tmptaint >> i) & 1) - add_taint(i, LOCKDEP_STILL_OK); - } - } - - return err; -} - -#ifdef CONFIG_PRINTK -static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - return proc_dointvec_minmax(table, write, buffer, lenp, ppos); -} -#endif - -/** - * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure - * @min: pointer to minimum allowable value - * @max: pointer to maximum allowable value - * - * The do_proc_dointvec_minmax_conv_param structure provides the - * minimum and maximum values for doing range checking for those sysctl - * parameters that use the proc_dointvec_minmax() handler. - */ -struct do_proc_dointvec_minmax_conv_param { - int *min; - int *max; -}; - -static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - int tmp, ret; - struct do_proc_dointvec_minmax_conv_param *param = data; - /* - * If writing, first do so via a temporary local int so we can - * bounds-check it before touching *valp. - */ - int *ip = write ? &tmp : valp; - - ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data); - if (ret) - return ret; - - if (write) { - if ((param->min && *param->min > tmp) || - (param->max && *param->max < tmp)) - return -EINVAL; - *valp = tmp; - } - - return 0; -} - -/** - * proc_dointvec_minmax - read a vector of integers with min/max values - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * - * This routine will ensure the values are within the range specified by - * table->extra1 (min) and table->extra2 (max). - * - * Returns 0 on success or -EINVAL on write when the range check fails. - */ -int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct do_proc_dointvec_minmax_conv_param param = { - .min = (int *) table->extra1, - .max = (int *) table->extra2, - }; - return do_proc_dointvec(table, write, buffer, lenp, ppos, - do_proc_dointvec_minmax_conv, ¶m); -} - -/** - * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure - * @min: pointer to minimum allowable value - * @max: pointer to maximum allowable value - * - * The do_proc_douintvec_minmax_conv_param structure provides the - * minimum and maximum values for doing range checking for those sysctl - * parameters that use the proc_douintvec_minmax() handler. - */ -struct do_proc_douintvec_minmax_conv_param { - unsigned int *min; - unsigned int *max; -}; - -static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, - unsigned int *valp, - int write, void *data) -{ - int ret; - unsigned int tmp; - struct do_proc_douintvec_minmax_conv_param *param = data; - /* write via temporary local uint for bounds-checking */ - unsigned int *up = write ? &tmp : valp; - - ret = do_proc_douintvec_conv(lvalp, up, write, data); - if (ret) - return ret; - - if (write) { - if ((param->min && *param->min > tmp) || - (param->max && *param->max < tmp)) - return -ERANGE; - - *valp = tmp; - } - - return 0; -} - -/** - * proc_douintvec_minmax - read a vector of unsigned ints with min/max values - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer - * values from/to the user buffer, treated as an ASCII string. Negative - * strings are not allowed. - * - * This routine will ensure the values are within the range specified by - * table->extra1 (min) and table->extra2 (max). There is a final sanity - * check for UINT_MAX to avoid having to support wrap around uses from - * userspace. - * - * Returns 0 on success or -ERANGE on write when the range check fails. - */ -int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct do_proc_douintvec_minmax_conv_param param = { - .min = (unsigned int *) table->extra1, - .max = (unsigned int *) table->extra2, - }; - return do_proc_douintvec(table, write, buffer, lenp, ppos, - do_proc_douintvec_minmax_conv, ¶m); -} - -static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, - unsigned int *valp, - int write, void *data) -{ - if (write) { - unsigned int val; - - val = round_pipe_size(*lvalp); - if (val == 0) - return -EINVAL; - - *valp = val; - } else { - unsigned int val = *valp; - *lvalp = (unsigned long) val; - } - - return 0; -} - -static int proc_dopipe_max_size(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_douintvec(table, write, buffer, lenp, ppos, - do_proc_dopipe_max_size_conv, NULL); -} - -static void validate_coredump_safety(void) -{ -#ifdef CONFIG_COREDUMP - if (suid_dumpable == SUID_DUMP_ROOT && - core_pattern[0] != '/' && core_pattern[0] != '|') { - printk(KERN_WARNING -"Unsafe core_pattern used with fs.suid_dumpable=2.\n" -"Pipe handler or fully qualified core dump path required.\n" -"Set kernel.core_pattern before fs.suid_dumpable.\n" - ); - } -#endif -} - -static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!error) - validate_coredump_safety(); - return error; -} - -#ifdef CONFIG_COREDUMP -static int proc_dostring_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int error = proc_dostring(table, write, buffer, lenp, ppos); - if (!error) - validate_coredump_safety(); - return error; -} -#endif - -#ifdef CONFIG_MAGIC_SYSRQ -static int sysrq_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int tmp, ret; - - tmp = sysrq_mask(); - - ret = __do_proc_dointvec(&tmp, table, write, buffer, - lenp, ppos, NULL, NULL); - if (ret || !write) - return ret; - - if (write) - sysrq_toggle_support(tmp); - - return 0; -} -#endif - -static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) -{ - unsigned long *i, *min, *max; - int vleft, first = 1, err = 0; - size_t left; - char *kbuf = NULL, *p; - - if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { - *lenp = 0; - return 0; - } - - i = (unsigned long *) data; - min = (unsigned long *) table->extra1; - max = (unsigned long *) table->extra2; - vleft = table->maxlen / sizeof(unsigned long); - left = *lenp; - - if (write) { - if (proc_first_pos_non_zero_ignore(ppos, table)) - goto out; - - if (left > PAGE_SIZE - 1) - left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - } - - for (; left && vleft--; i++, first = 0) { - unsigned long val; - - if (write) { - bool neg; - - left -= proc_skip_spaces(&p); - if (!left) - break; - - err = proc_get_long(&p, &left, &val, &neg, - proc_wspace_sep, - sizeof(proc_wspace_sep), NULL); - if (err) - break; - if (neg) - continue; - val = convmul * val / convdiv; - if ((min && val < *min) || (max && val > *max)) { - err = -EINVAL; - break; - } - *i = val; - } else { - val = convdiv * (*i) / convmul; - if (!first) { - err = proc_put_char(&buffer, &left, '\t'); - if (err) - break; - } - err = proc_put_long(&buffer, &left, val, false); - if (err) - break; - } - } - - if (!write && !first && left && !err) - err = proc_put_char(&buffer, &left, '\n'); - if (write && !err) - left -= proc_skip_spaces(&p); - if (write) { - kfree(kbuf); - if (first) - return err ? : -EINVAL; - } - *lenp -= left; -out: - *ppos += *lenp; - return err; -} - -static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) -{ - return __do_proc_doulongvec_minmax(table->data, table, write, - buffer, lenp, ppos, convmul, convdiv); -} - -/** - * proc_doulongvec_minmax - read a vector of long integers with min/max values - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long - * values from/to the user buffer, treated as an ASCII string. - * - * This routine will ensure the values are within the range specified by - * table->extra1 (min) and table->extra2 (max). - * - * Returns 0 on success. - */ -int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); -} - -/** - * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long - * values from/to the user buffer, treated as an ASCII string. The values - * are treated as milliseconds, and converted to jiffies when they are stored. - * - * This routine will ensure the values are within the range specified by - * table->extra1 (min) and table->extra2 (max). - * - * Returns 0 on success. - */ -int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return do_proc_doulongvec_minmax(table, write, buffer, - lenp, ppos, HZ, 1000l); -} - - -static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - if (*lvalp > INT_MAX / HZ) - return 1; - *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); - } else { - int val = *valp; - unsigned long lval; - if (val < 0) { - *negp = true; - lval = -(unsigned long)val; - } else { - *negp = false; - lval = (unsigned long)val; - } - *lvalp = lval / HZ; - } - return 0; -} - -static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ) - return 1; - *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); - } else { - int val = *valp; - unsigned long lval; - if (val < 0) { - *negp = true; - lval = -(unsigned long)val; - } else { - *negp = false; - lval = (unsigned long)val; - } - *lvalp = jiffies_to_clock_t(lval); - } - return 0; -} - -static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp); - - if (jif > INT_MAX) - return 1; - *valp = (int)jif; - } else { - int val = *valp; - unsigned long lval; - if (val < 0) { - *negp = true; - lval = -(unsigned long)val; - } else { - *negp = false; - lval = (unsigned long)val; - } - *lvalp = jiffies_to_msecs(lval); - } - return 0; -} - -/** - * proc_dointvec_jiffies - read a vector of integers as seconds - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in seconds, and are converted into - * jiffies. - * - * Returns 0 on success. - */ -int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_dointvec(table,write,buffer,lenp,ppos, - do_proc_dointvec_jiffies_conv,NULL); -} - -/** - * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: pointer to the file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in 1/USER_HZ seconds, and - * are converted into jiffies. - * - * Returns 0 on success. - */ -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_dointvec(table,write,buffer,lenp,ppos, - do_proc_dointvec_userhz_jiffies_conv,NULL); -} - -/** - * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * @ppos: the current position in the file - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in 1/1000 seconds, and - * are converted into jiffies. - * - * Returns 0 on success. - */ -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_dointvec(table, write, buffer, lenp, ppos, - do_proc_dointvec_ms_jiffies_conv, NULL); -} - -static int proc_do_cad_pid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct pid *new_pid; - pid_t tmp; - int r; - - tmp = pid_vnr(cad_pid); - - r = __do_proc_dointvec(&tmp, table, write, buffer, - lenp, ppos, NULL, NULL); - if (r || !write) - return r; - - new_pid = find_get_pid(tmp); - if (!new_pid) - return -ESRCH; - - put_pid(xchg(&cad_pid, new_pid)); - return 0; -} - -/** - * proc_do_large_bitmap - read/write from/to a large bitmap - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * The bitmap is stored at table->data and the bitmap length (in bits) - * in table->maxlen. - * - * We use a range comma separated format (e.g. 1,3-4,10-10) so that - * large bitmaps may be represented in a compact manner. Writing into - * the file will clear the bitmap then update it with the given input. - * - * Returns 0 on success. - */ -int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int err = 0; - bool first = 1; - size_t left = *lenp; - unsigned long bitmap_len = table->maxlen; - unsigned long *bitmap = *(unsigned long **) table->data; - unsigned long *tmp_bitmap = NULL; - char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; - - if (!bitmap || !bitmap_len || !left || (*ppos && !write)) { - *lenp = 0; - return 0; - } - - if (write) { - char *kbuf, *p; - size_t skipped = 0; - - if (left > PAGE_SIZE - 1) { - left = PAGE_SIZE - 1; - /* How much of the buffer we'll skip this pass */ - skipped = *lenp - left; - } - - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL); - if (!tmp_bitmap) { - kfree(kbuf); - return -ENOMEM; - } - proc_skip_char(&p, &left, '\n'); - while (!err && left) { - unsigned long val_a, val_b; - bool neg; - size_t saved_left; - - /* In case we stop parsing mid-number, we can reset */ - saved_left = left; - err = proc_get_long(&p, &left, &val_a, &neg, tr_a, - sizeof(tr_a), &c); - /* - * If we consumed the entirety of a truncated buffer or - * only one char is left (may be a "-"), then stop here, - * reset, & come back for more. - */ - if ((left <= 1) && skipped) { - left = saved_left; - break; - } - - if (err) - break; - if (val_a >= bitmap_len || neg) { - err = -EINVAL; - break; - } - - val_b = val_a; - if (left) { - p++; - left--; - } - - if (c == '-') { - err = proc_get_long(&p, &left, &val_b, - &neg, tr_b, sizeof(tr_b), - &c); - /* - * If we consumed all of a truncated buffer or - * then stop here, reset, & come back for more. - */ - if (!left && skipped) { - left = saved_left; - break; - } - - if (err) - break; - if (val_b >= bitmap_len || neg || - val_a > val_b) { - err = -EINVAL; - break; - } - if (left) { - p++; - left--; - } - } - - bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1); - first = 0; - proc_skip_char(&p, &left, '\n'); - } - kfree(kbuf); - left += skipped; - } else { - unsigned long bit_a, bit_b = 0; - - while (left) { - bit_a = find_next_bit(bitmap, bitmap_len, bit_b); - if (bit_a >= bitmap_len) - break; - bit_b = find_next_zero_bit(bitmap, bitmap_len, - bit_a + 1) - 1; - - if (!first) { - err = proc_put_char(&buffer, &left, ','); - if (err) - break; - } - err = proc_put_long(&buffer, &left, bit_a, false); - if (err) - break; - if (bit_a != bit_b) { - err = proc_put_char(&buffer, &left, '-'); - if (err) - break; - err = proc_put_long(&buffer, &left, bit_b, false); - if (err) - break; - } - - first = 0; bit_b++; - } - if (!err) - err = proc_put_char(&buffer, &left, '\n'); - } - - if (!err) { - if (write) { - if (*ppos) - bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); - else - bitmap_copy(bitmap, tmp_bitmap, bitmap_len); - } - *lenp -= left; - *ppos += *lenp; - } - - bitmap_free(tmp_bitmap); - return err; -} - -#else /* CONFIG_PROC_SYSCTL */ - -int proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -#endif /* CONFIG_PROC_SYSCTL */ - -#if defined(CONFIG_SYSCTL) -int proc_do_static_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - struct static_key *key = (struct static_key *)table->data; - static DEFINE_MUTEX(static_key_mutex); - int val, ret; - struct ctl_table tmp = { - .data = &val, - .maxlen = sizeof(val), - .mode = table->mode, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }; - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - mutex_lock(&static_key_mutex); - val = static_key_enabled(key); - ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); - if (write && !ret) { - if (val) - static_key_enable(key); - else - static_key_disable(key); - } - mutex_unlock(&static_key_mutex); - return ret; -} -#endif /* * No sense putting this after each symbol definition, twice, * exception granted :-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a5221abb4594..398e6eadb861 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -249,8 +249,7 @@ void timers_update_nohz(void) } int timer_migration_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 92ba69b716dc..3744372a24e2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -147,7 +147,7 @@ BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, return ret; } -static const struct bpf_func_proto bpf_probe_read_user_proto = { +const struct bpf_func_proto bpf_probe_read_user_proto = { .func = bpf_probe_read_user, .gpl_only = true, .ret_type = RET_INTEGER, @@ -167,7 +167,7 @@ BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size, return ret; } -static const struct bpf_func_proto bpf_probe_read_user_str_proto = { +const struct bpf_func_proto bpf_probe_read_user_str_proto = { .func = bpf_probe_read_user_str, .gpl_only = true, .ret_type = RET_INTEGER, @@ -198,7 +198,7 @@ BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size, return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, false); } -static const struct bpf_func_proto bpf_probe_read_kernel_proto = { +const struct bpf_func_proto bpf_probe_read_kernel_proto = { .func = bpf_probe_read_kernel, .gpl_only = true, .ret_type = RET_INTEGER, @@ -253,7 +253,7 @@ BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size, return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, false); } -static const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { +const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { .func = bpf_probe_read_kernel_str, .gpl_only = true, .ret_type = RET_INTEGER, @@ -315,6 +315,9 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { static const struct bpf_func_proto *bpf_get_probe_write_proto(void) { + if (!capable(CAP_SYS_ADMIN)) + return NULL; + pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!", current->comm, task_pid_nr(current)); @@ -487,6 +490,212 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } +#define MAX_SEQ_PRINTF_VARARGS 12 +#define MAX_SEQ_PRINTF_MAX_MEMCPY 6 +#define MAX_SEQ_PRINTF_STR_LEN 128 + +struct bpf_seq_printf_buf { + char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN]; +}; +static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf); +static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used); + +BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, + const void *, data, u32, data_len) +{ + int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0; + int i, buf_used, copy_size, num_args; + u64 params[MAX_SEQ_PRINTF_VARARGS]; + struct bpf_seq_printf_buf *bufs; + const u64 *args = data; + + buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used); + if (WARN_ON_ONCE(buf_used > 1)) { + err = -EBUSY; + goto out; + } + + bufs = this_cpu_ptr(&bpf_seq_printf_buf); + + /* + * bpf_check()->check_func_arg()->check_stack_boundary() + * guarantees that fmt points to bpf program stack, + * fmt_size bytes of it were initialized and fmt_size > 0 + */ + if (fmt[--fmt_size] != 0) + goto out; + + if (data_len & 7) + goto out; + + for (i = 0; i < fmt_size; i++) { + if (fmt[i] == '%') { + if (fmt[i + 1] == '%') + i++; + else if (!data || !data_len) + goto out; + } + } + + num_args = data_len / 8; + + /* check format string for allowed specifiers */ + for (i = 0; i < fmt_size; i++) { + /* only printable ascii for now. */ + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { + err = -EINVAL; + goto out; + } + + if (fmt[i] != '%') + continue; + + if (fmt[i + 1] == '%') { + i++; + continue; + } + + if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) { + err = -E2BIG; + goto out; + } + + if (fmt_cnt >= num_args) { + err = -EINVAL; + goto out; + } + + /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ + i++; + + /* skip optional "[0 +-][num]" width formating field */ + while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || + fmt[i] == ' ') + i++; + if (fmt[i] >= '1' && fmt[i] <= '9') { + i++; + while (fmt[i] >= '0' && fmt[i] <= '9') + i++; + } + + if (fmt[i] == 's') { + /* try our best to copy */ + if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { + err = -E2BIG; + goto out; + } + + err = strncpy_from_unsafe_strict(bufs->buf[memcpy_cnt], + (void *) (long) args[fmt_cnt], + MAX_SEQ_PRINTF_STR_LEN); + if (err < 0) + bufs->buf[memcpy_cnt][0] = '\0'; + params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; + + fmt_cnt++; + memcpy_cnt++; + continue; + } + + if (fmt[i] == 'p') { + if (fmt[i + 1] == 0 || + fmt[i + 1] == 'K' || + fmt[i + 1] == 'x') { + /* just kernel pointers */ + params[fmt_cnt] = args[fmt_cnt]; + fmt_cnt++; + continue; + } + + /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ + if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') { + err = -EINVAL; + goto out; + } + if (fmt[i + 2] != '4' && fmt[i + 2] != '6') { + err = -EINVAL; + goto out; + } + + if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { + err = -E2BIG; + goto out; + } + + + copy_size = (fmt[i + 2] == '4') ? 4 : 16; + + err = probe_kernel_read(bufs->buf[memcpy_cnt], + (void *) (long) args[fmt_cnt], + copy_size); + if (err < 0) + memset(bufs->buf[memcpy_cnt], 0, copy_size); + params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; + + i += 2; + fmt_cnt++; + memcpy_cnt++; + continue; + } + + if (fmt[i] == 'l') { + i++; + if (fmt[i] == 'l') + i++; + } + + if (fmt[i] != 'i' && fmt[i] != 'd' && + fmt[i] != 'u' && fmt[i] != 'x') { + err = -EINVAL; + goto out; + } + + params[fmt_cnt] = args[fmt_cnt]; + fmt_cnt++; + } + + /* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give + * all of them to seq_printf(). + */ + seq_printf(m, fmt, params[0], params[1], params[2], params[3], + params[4], params[5], params[6], params[7], params[8], + params[9], params[10], params[11]); + + err = seq_has_overflowed(m) ? -EOVERFLOW : 0; +out: + this_cpu_dec(bpf_seq_printf_buf_used); + return err; +} + +static int bpf_seq_printf_btf_ids[5]; +static const struct bpf_func_proto bpf_seq_printf_proto = { + .func = bpf_seq_printf, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_seq_printf_btf_ids, +}; + +BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) +{ + return seq_write(m, data, len) ? -EOVERFLOW : 0; +} + +static int bpf_seq_write_btf_ids[5]; +static const struct bpf_func_proto bpf_seq_write_proto = { + .func = bpf_seq_write, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_seq_write_btf_ids, +}; + static __always_inline int get_map_perf_counter(struct bpf_map *map, u64 flags, u64 *value, u64 *enabled, u64 *running) @@ -698,7 +907,7 @@ BPF_CALL_0(bpf_get_current_task) return (long) current; } -static const struct bpf_func_proto bpf_get_current_task_proto = { +const struct bpf_func_proto bpf_get_current_task_proto = { .func = bpf_get_current_task, .gpl_only = true, .ret_type = RET_INTEGER, @@ -827,6 +1036,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_peek_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: @@ -877,6 +1088,16 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_read_value_proto; case BPF_FUNC_get_ns_current_pid_tgid: return &bpf_get_ns_current_pid_tgid_proto; + case BPF_FUNC_ringbuf_output: + return &bpf_ringbuf_output_proto; + case BPF_FUNC_ringbuf_reserve: + return &bpf_ringbuf_reserve_proto; + case BPF_FUNC_ringbuf_submit: + return &bpf_ringbuf_submit_proto; + case BPF_FUNC_ringbuf_discard: + return &bpf_ringbuf_discard_proto; + case BPF_FUNC_ringbuf_query: + return &bpf_ringbuf_query_proto; default: return NULL; } @@ -1246,7 +1467,7 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } -static const struct bpf_func_proto * +const struct bpf_func_proto * tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { @@ -1256,6 +1477,14 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_xdp_output: return &bpf_xdp_output_proto; #endif + case BPF_FUNC_seq_printf: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_printf_proto : + NULL; + case BPF_FUNC_seq_write: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_write_proto : + NULL; default: return raw_tp_prog_func_proto(func_id, prog); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0b144926bd73..3ab27022c20f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2662,7 +2662,7 @@ static void output_printk(struct trace_event_buffer *fbuffer) } int tracepoint_printk_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int save_tracepoint_printk; diff --git a/kernel/umh.c b/kernel/umh.c index 3474d6aa55d8..79f139a7ca03 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -641,7 +641,7 @@ int call_usermodehelper(const char *path, char **argv, char **envp, int wait) EXPORT_SYMBOL(call_usermodehelper); static int proc_cap_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 3732c888a949..4ca61d49885b 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -30,7 +30,7 @@ static void *get_uts(struct ctl_table *table) * to observe. Should this be in kernel/sys.c ???? */ static int proc_do_uts_string(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; int r; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b6b1f54a7837..53ff2c81b084 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -661,7 +661,7 @@ static void proc_watchdog_update(void) * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED */ static int proc_watchdog_common(int which, struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err, old, *param = table->data; @@ -688,7 +688,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, * /proc/sys/kernel/watchdog */ int proc_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); @@ -698,7 +698,7 @@ int proc_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/nmi_watchdog */ int proc_nmi_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!nmi_watchdog_available && write) return -ENOTSUPP; @@ -710,7 +710,7 @@ int proc_nmi_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/soft_watchdog */ int proc_soft_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(SOFT_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); @@ -720,7 +720,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/watchdog_thresh */ int proc_watchdog_thresh(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err, old; @@ -743,7 +743,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, * been brought online, if desired. */ int proc_watchdog_cpumask(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err; |