diff options
Diffstat (limited to 'kernel/bpf')
30 files changed, 2539 insertions, 531 deletions
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index d56ee177d5f8..2dfe1079f772 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -27,6 +27,7 @@ config BPF_SYSCALL bool "Enable bpf() system call" select BPF select IRQ_WORK + select TASKS_RCU if PREEMPTION select TASKS_TRACE_RCU select BINARY_PRINTF select NET_SOCK_MSG if NET diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index c1a9be6a4b9f..057ba8e01e70 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 7f145aefbff8..fe40d3b9458f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -11,6 +11,7 @@ #include <linux/perf_event.h> #include <uapi/linux/btf.h> #include <linux/rcupdate_trace.h> +#include <linux/btf_ids.h> #include "map_in_map.h" @@ -242,6 +243,20 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) return this_cpu_ptr(array->pptrs[index & array->index_mask]); } +static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (cpu >= nr_cpu_ids) + return NULL; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu); +} + int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) { struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -287,10 +302,12 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key return 0; } -static void check_and_free_timer_in_array(struct bpf_array *arr, void *val) +static void check_and_free_fields(struct bpf_array *arr, void *val) { - if (unlikely(map_value_has_timer(&arr->map))) + if (map_value_has_timer(&arr->map)) bpf_timer_cancel_and_free(val + arr->map.timer_off); + if (map_value_has_kptrs(&arr->map)) + bpf_map_free_kptrs(&arr->map, val); } /* Called from syscall or from eBPF program */ @@ -327,7 +344,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, copy_map_value_locked(map, val, value, false); else copy_map_value(map, val, value); - check_and_free_timer_in_array(array, val); + check_and_free_fields(array, val); } return 0; } @@ -386,7 +403,8 @@ static void array_map_free_timers(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - if (likely(!map_value_has_timer(map))) + /* We don't reset or free kptr on uref dropping to zero. */ + if (!map_value_has_timer(map)) return; for (i = 0; i < array->map.max_entries; i++) @@ -398,6 +416,13 @@ static void array_map_free_timers(struct bpf_map *map) static void array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); + int i; + + if (map_value_has_kptrs(map)) { + for (i = 0; i < array->map.max_entries; i++) + bpf_map_free_kptrs(map, array->value + array->elem_size * i); + bpf_map_free_kptr_off_tab(map); + } if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) bpf_array_free_percpu(array); @@ -680,7 +705,7 @@ static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_ return num_elems; } -static int array_map_btf_id; +BTF_ID_LIST_SINGLE(array_map_btf_ids, struct, bpf_array) const struct bpf_map_ops array_map_ops = { .map_meta_equal = array_map_meta_equal, .map_alloc_check = array_map_alloc_check, @@ -701,12 +726,10 @@ const struct bpf_map_ops array_map_ops = { .map_update_batch = generic_map_update_batch, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_array_elem, - .map_btf_name = "bpf_array", - .map_btf_id = &array_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; -static int percpu_array_map_btf_id; const struct bpf_map_ops percpu_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = array_map_alloc_check, @@ -716,14 +739,14 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_lookup_elem = percpu_array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, + .map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem, .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_array_elem, - .map_btf_name = "bpf_array", - .map_btf_id = &percpu_array_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; @@ -1102,7 +1125,6 @@ static void prog_array_map_free(struct bpf_map *map) * Thus, prog_array_map cannot be used as an inner_map * and map_meta_equal is not implemented. */ -static int prog_array_map_btf_id; const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = prog_array_map_alloc, @@ -1118,8 +1140,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = prog_array_map_clear, .map_seq_show_elem = prog_array_map_seq_show_elem, - .map_btf_name = "bpf_array", - .map_btf_id = &prog_array_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, @@ -1208,7 +1229,6 @@ static void perf_event_fd_array_map_free(struct bpf_map *map) fd_array_map_free(map); } -static int perf_event_array_map_btf_id; const struct bpf_map_ops perf_event_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, @@ -1221,8 +1241,7 @@ const struct bpf_map_ops perf_event_array_map_ops = { .map_fd_put_ptr = perf_event_fd_array_put_ptr, .map_release = perf_event_fd_array_release, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_array", - .map_btf_id = &perf_event_array_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], }; #ifdef CONFIG_CGROUPS @@ -1245,7 +1264,6 @@ static void cgroup_fd_array_free(struct bpf_map *map) fd_array_map_free(map); } -static int cgroup_array_map_btf_id; const struct bpf_map_ops cgroup_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, @@ -1257,8 +1275,7 @@ const struct bpf_map_ops cgroup_array_map_ops = { .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_array", - .map_btf_id = &cgroup_array_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], }; #endif @@ -1332,7 +1349,6 @@ static int array_of_map_gen_lookup(struct bpf_map *map, return insn - insn_buf; } -static int array_of_maps_map_btf_id; const struct bpf_map_ops array_of_maps_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, @@ -1344,7 +1360,8 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = array_of_map_gen_lookup, + .map_lookup_batch = generic_map_lookup_batch, + .map_update_batch = generic_map_update_batch, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_array", - .map_btf_id = &array_of_maps_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], }; diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index b141a1346f72..b9ea539a5561 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -7,6 +7,7 @@ #include <linux/err.h> #include <linux/jhash.h> #include <linux/random.h> +#include <linux/btf_ids.h> #define BLOOM_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_ZERO_SEED | BPF_F_ACCESS_MASK) @@ -192,7 +193,7 @@ static int bloom_map_check_btf(const struct bpf_map *map, return btf_type_is_void(key_type) ? 0 : -EINVAL; } -static int bpf_bloom_map_btf_id; +BTF_ID_LIST_SINGLE(bpf_bloom_map_btf_ids, struct, bpf_bloom_filter) const struct bpf_map_ops bloom_filter_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = bloom_map_alloc, @@ -205,6 +206,5 @@ const struct bpf_map_ops bloom_filter_map_ops = { .map_update_elem = bloom_map_update_elem, .map_delete_elem = bloom_map_delete_elem, .map_check_btf = bloom_map_check_btf, - .map_btf_name = "bpf_bloom_filter", - .map_btf_id = &bpf_bloom_map_btf_id, + .map_btf_id = &bpf_bloom_map_btf_ids[0], }; diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 96be8d518885..5f7683b19199 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -90,7 +90,7 @@ void bpf_inode_storage_free(struct inode *inode) */ bpf_selem_unlink_map(selem); free_inode_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false); + local_storage, selem, false, false); } raw_spin_unlock_bh(&local_storage->lock); rcu_read_unlock(); @@ -149,7 +149,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata)); + bpf_selem_unlink(SELEM(sdata), true); return 0; } @@ -245,7 +245,8 @@ static void inode_storage_map_free(struct bpf_map *map) bpf_local_storage_map_free(smap, NULL); } -static int inode_storage_map_btf_id; +BTF_ID_LIST_SINGLE(inode_storage_map_btf_ids, struct, + bpf_local_storage_map) const struct bpf_map_ops inode_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -256,8 +257,7 @@ const struct bpf_map_ops inode_storage_map_ops = { .map_update_elem = bpf_fd_inode_storage_update_elem, .map_delete_elem = bpf_fd_inode_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_name = "bpf_local_storage_map", - .map_btf_id = &inode_storage_map_btf_id, + .map_btf_id = &inode_storage_map_btf_ids[0], .map_owner_storage_ptr = inode_storage_ptr, }; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 110029ede71e..d5d96ceca105 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -330,35 +330,34 @@ static void cache_btf_id(struct bpf_iter_target_info *tinfo, bool bpf_iter_prog_supported(struct bpf_prog *prog) { const char *attach_fname = prog->aux->attach_func_name; + struct bpf_iter_target_info *tinfo = NULL, *iter; u32 prog_btf_id = prog->aux->attach_btf_id; const char *prefix = BPF_ITER_FUNC_PREFIX; - struct bpf_iter_target_info *tinfo; int prefix_len = strlen(prefix); - bool supported = false; if (strncmp(attach_fname, prefix, prefix_len)) return false; mutex_lock(&targets_mutex); - list_for_each_entry(tinfo, &targets, list) { - if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) { - supported = true; + list_for_each_entry(iter, &targets, list) { + if (iter->btf_id && iter->btf_id == prog_btf_id) { + tinfo = iter; break; } - if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) { - cache_btf_id(tinfo, prog); - supported = true; + if (!strcmp(attach_fname + prefix_len, iter->reg_info->target)) { + cache_btf_id(iter, prog); + tinfo = iter; break; } } mutex_unlock(&targets_mutex); - if (supported) { + if (tinfo) { prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; } - return supported; + return tinfo != NULL; } const struct bpf_func_proto * @@ -499,12 +498,11 @@ bool bpf_link_is_iter(struct bpf_link *link) int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog) { + struct bpf_iter_target_info *tinfo = NULL, *iter; struct bpf_link_primer link_primer; - struct bpf_iter_target_info *tinfo; union bpf_iter_link_info linfo; struct bpf_iter_link *link; u32 prog_btf_id, linfo_len; - bool existed = false; bpfptr_t ulinfo; int err; @@ -530,14 +528,14 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, prog_btf_id = prog->aux->attach_btf_id; mutex_lock(&targets_mutex); - list_for_each_entry(tinfo, &targets, list) { - if (tinfo->btf_id == prog_btf_id) { - existed = true; + list_for_each_entry(iter, &targets, list) { + if (iter->btf_id == prog_btf_id) { + tinfo = iter; break; } } mutex_unlock(&targets_mutex); - if (!existed) + if (!tinfo) return -ENOENT; link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); @@ -547,7 +545,7 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog); link->tinfo = tinfo; - err = bpf_link_prime(&link->link, &link_primer); + err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); return err; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 01aa2b51ec4d..8ce40fd869f6 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -106,7 +106,7 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) */ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem, - bool uncharge_mem) + bool uncharge_mem, bool use_trace_rcu) { struct bpf_local_storage_map *smap; bool free_local_storage; @@ -150,11 +150,16 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); - call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu); + if (use_trace_rcu) + call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu); + else + kfree_rcu(selem, rcu); + return free_local_storage; } -static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) +static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, + bool use_trace_rcu) { struct bpf_local_storage *local_storage; bool free_local_storage = false; @@ -169,12 +174,16 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, true); + local_storage, selem, true, use_trace_rcu); raw_spin_unlock_irqrestore(&local_storage->lock, flags); - if (free_local_storage) - call_rcu_tasks_trace(&local_storage->rcu, + if (free_local_storage) { + if (use_trace_rcu) + call_rcu_tasks_trace(&local_storage->rcu, bpf_local_storage_free_rcu); + else + kfree_rcu(local_storage, rcu); + } } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, @@ -214,14 +223,14 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, raw_spin_unlock_irqrestore(&b->lock, flags); } -void bpf_selem_unlink(struct bpf_local_storage_elem *selem) +void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu) { /* Always unlink from map before unlinking from local_storage * because selem will be freed after successfully unlinked from * the local_storage. */ bpf_selem_unlink_map(selem); - __bpf_selem_unlink_storage(selem); + __bpf_selem_unlink_storage(selem, use_trace_rcu); } struct bpf_local_storage_data * @@ -466,7 +475,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (old_sdata) { bpf_selem_unlink_map(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), - false); + false, true); } unlock: @@ -548,7 +557,7 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, migrate_disable(); __this_cpu_inc(*busy_counter); } - bpf_selem_unlink(selem); + bpf_selem_unlink(selem, false); if (busy_counter) { __this_cpu_dec(*busy_counter); migrate_enable(); diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index 6b12f06ee18c..4ea227c9c1ad 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -4,6 +4,7 @@ #ifndef __BPF_LRU_LIST_H_ #define __BPF_LRU_LIST_H_ +#include <linux/cache.h> #include <linux/list.h> #include <linux/spinlock_types.h> diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 064eccba641d..c1351df9f7ee 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -117,6 +117,21 @@ static const struct bpf_func_proto bpf_ima_file_hash_proto = { .allowed = bpf_ima_inode_hash_allowed, }; +BPF_CALL_1(bpf_get_attach_cookie, void *, ctx) +{ + struct bpf_trace_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); + return run_ctx->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto = { + .func = bpf_get_attach_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + static const struct bpf_func_proto * bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -141,6 +156,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL; case BPF_FUNC_ima_file_hash: return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL; + case BPF_FUNC_get_attach_cookie: + return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 21069dbe9138..d9a3c9207240 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -10,6 +10,7 @@ #include <linux/seq_file.h> #include <linux/refcount.h> #include <linux/mutex.h> +#include <linux/btf_ids.h> enum bpf_struct_ops_state { BPF_STRUCT_OPS_STATE_INIT, @@ -32,15 +33,15 @@ struct bpf_struct_ops_map { const struct bpf_struct_ops *st_ops; /* protect map_update */ struct mutex lock; - /* progs has all the bpf_prog that is populated + /* link has all the bpf_links that is populated * to the func ptr of the kernel's struct * (in kvalue.data). */ - struct bpf_prog **progs; + struct bpf_link **links; /* image is a page that has all the trampolines * that stores the func args before calling the bpf_prog. * A PAGE_SIZE "image" is enough to store all trampoline for - * "progs[]". + * "links[]". */ void *image; /* uvalue->data stores the kernel struct @@ -263,7 +264,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, /* No lock is needed. state and refcnt do not need * to be updated together under atomic context. */ - uvalue = (struct bpf_struct_ops_value *)value; + uvalue = value; memcpy(uvalue, st_map->uvalue, map->value_size); uvalue->state = state; refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt)); @@ -282,9 +283,9 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) u32 i; for (i = 0; i < btf_type_vlen(t); i++) { - if (st_map->progs[i]) { - bpf_prog_put(st_map->progs[i]); - st_map->progs[i] = NULL; + if (st_map->links[i]) { + bpf_link_put(st_map->links[i]); + st_map->links[i] = NULL; } } } @@ -315,18 +316,34 @@ static int check_zero_holes(const struct btf_type *t, void *data) return 0; } -int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_progs *tprogs, - struct bpf_prog *prog, +static void bpf_struct_ops_link_release(struct bpf_link *link) +{ +} + +static void bpf_struct_ops_link_dealloc(struct bpf_link *link) +{ + struct bpf_tramp_link *tlink = container_of(link, struct bpf_tramp_link, link); + + kfree(tlink); +} + +const struct bpf_link_ops bpf_struct_ops_link_lops = { + .release = bpf_struct_ops_link_release, + .dealloc = bpf_struct_ops_link_dealloc, +}; + +int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, + struct bpf_tramp_link *link, const struct btf_func_model *model, void *image, void *image_end) { u32 flags; - tprogs[BPF_TRAMP_FENTRY].progs[0] = prog; - tprogs[BPF_TRAMP_FENTRY].nr_progs = 1; + tlinks[BPF_TRAMP_FENTRY].links[0] = link; + tlinks[BPF_TRAMP_FENTRY].nr_links = 1; flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0; return arch_prepare_bpf_trampoline(NULL, image, image_end, - model, flags, tprogs, NULL); + model, flags, tlinks, NULL); } static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, @@ -337,7 +354,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, struct bpf_struct_ops_value *uvalue, *kvalue; const struct btf_member *member; const struct btf_type *t = st_ops->type; - struct bpf_tramp_progs *tprogs = NULL; + struct bpf_tramp_links *tlinks = NULL; void *udata, *kdata; int prog_fd, err = 0; void *image, *image_end; @@ -353,7 +370,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (err) return err; - uvalue = (struct bpf_struct_ops_value *)value; + uvalue = value; err = check_zero_holes(t, uvalue->data); if (err) return err; @@ -361,8 +378,8 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (uvalue->state || refcount_read(&uvalue->refcnt)) return -EINVAL; - tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); - if (!tprogs) + tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); + if (!tlinks) return -ENOMEM; uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; @@ -385,6 +402,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, for_each_member(i, t, member) { const struct btf_type *mtype, *ptype; struct bpf_prog *prog; + struct bpf_tramp_link *link; u32 moff; moff = __btf_member_bit_offset(t, member) / 8; @@ -438,16 +456,26 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, err = PTR_ERR(prog); goto reset_unlock; } - st_map->progs[i] = prog; if (prog->type != BPF_PROG_TYPE_STRUCT_OPS || prog->aux->attach_btf_id != st_ops->type_id || prog->expected_attach_type != i) { + bpf_prog_put(prog); err = -EINVAL; goto reset_unlock; } - err = bpf_struct_ops_prepare_trampoline(tprogs, prog, + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + bpf_prog_put(prog); + err = -ENOMEM; + goto reset_unlock; + } + bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, + &bpf_struct_ops_link_lops, prog); + st_map->links[i] = &link->link; + + err = bpf_struct_ops_prepare_trampoline(tlinks, link, &st_ops->func_models[i], image, image_end); if (err < 0) @@ -490,7 +518,7 @@ reset_unlock: memset(uvalue, 0, map->value_size); memset(kvalue, 0, map->value_size); unlock: - kfree(tprogs); + kfree(tlinks); mutex_unlock(&st_map->lock); return err; } @@ -545,9 +573,9 @@ static void bpf_struct_ops_map_free(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; - if (st_map->progs) + if (st_map->links) bpf_struct_ops_map_put_progs(st_map); - bpf_map_area_free(st_map->progs); + bpf_map_area_free(st_map->links); bpf_jit_free_exec(st_map->image); bpf_map_area_free(st_map->uvalue); bpf_map_area_free(st_map); @@ -596,11 +624,11 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) map = &st_map->map; st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); - st_map->progs = - bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *), + st_map->links = + bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *), NUMA_NO_NODE); st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); - if (!st_map->uvalue || !st_map->progs || !st_map->image) { + if (!st_map->uvalue || !st_map->links || !st_map->image) { bpf_struct_ops_map_free(map); return ERR_PTR(-ENOMEM); } @@ -612,7 +640,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) return map; } -static int bpf_struct_ops_map_btf_id; +BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map) const struct bpf_map_ops bpf_struct_ops_map_ops = { .map_alloc_check = bpf_struct_ops_map_alloc_check, .map_alloc = bpf_struct_ops_map_alloc, @@ -622,8 +650,7 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = { .map_delete_elem = bpf_struct_ops_map_delete_elem, .map_update_elem = bpf_struct_ops_map_update_elem, .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem, - .map_btf_name = "bpf_struct_ops_map", - .map_btf_id = &bpf_struct_ops_map_btf_id, + .map_btf_id = &bpf_struct_ops_map_btf_ids[0], }; /* "const void *" because some subsystem is diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 6638a0ecc3d2..e9014dc62682 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -102,7 +102,7 @@ void bpf_task_storage_free(struct task_struct *task) */ bpf_selem_unlink_map(selem); free_task_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false); + local_storage, selem, false, false); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_task_storage_unlock(); @@ -192,7 +192,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata)); + bpf_selem_unlink(SELEM(sdata), true); return 0; } @@ -307,7 +307,7 @@ static void task_storage_map_free(struct bpf_map *map) bpf_local_storage_map_free(smap, &bpf_task_storage_busy); } -static int task_storage_map_btf_id; +BTF_ID_LIST_SINGLE(task_storage_map_btf_ids, struct, bpf_local_storage_map) const struct bpf_map_ops task_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -318,8 +318,7 @@ const struct bpf_map_ops task_storage_map_ops = { .map_update_elem = bpf_pid_task_storage_update_elem, .map_delete_elem = bpf_pid_task_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_name = "bpf_local_storage_map", - .map_btf_id = &task_storage_map_btf_id, + .map_btf_id = &task_storage_map_btf_ids[0], .map_owner_storage_ptr = task_storage_ptr, }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0918a39279f6..7bccaa4646e5 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -202,17 +202,25 @@ enum btf_kfunc_hook { BTF_KFUNC_HOOK_XDP, BTF_KFUNC_HOOK_TC, BTF_KFUNC_HOOK_STRUCT_OPS, + BTF_KFUNC_HOOK_TRACING, + BTF_KFUNC_HOOK_SYSCALL, BTF_KFUNC_HOOK_MAX, }; enum { BTF_KFUNC_SET_MAX_CNT = 32, + BTF_DTOR_KFUNC_MAX_CNT = 256, }; struct btf_kfunc_set_tab { struct btf_id_set *sets[BTF_KFUNC_HOOK_MAX][BTF_KFUNC_TYPE_MAX]; }; +struct btf_id_dtor_kfunc_tab { + u32 cnt; + struct btf_id_dtor_kfunc dtors[]; +}; + struct btf { void *data; struct btf_type **types; @@ -228,6 +236,7 @@ struct btf { u32 id; struct rcu_head rcu; struct btf_kfunc_set_tab *kfunc_set_tab; + struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab; /* split BTF support */ struct btf *base_btf; @@ -1616,8 +1625,19 @@ free_tab: btf->kfunc_set_tab = NULL; } +static void btf_free_dtor_kfunc_tab(struct btf *btf) +{ + struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab; + + if (!tab) + return; + kfree(tab); + btf->dtor_kfunc_tab = NULL; +} + static void btf_free(struct btf *btf) { + btf_free_dtor_kfunc_tab(btf); btf_free_kfunc_set_tab(btf); kvfree(btf->types); kvfree(btf->resolved_sizes); @@ -3163,24 +3183,86 @@ static void btf_struct_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } +enum btf_field_type { + BTF_FIELD_SPIN_LOCK, + BTF_FIELD_TIMER, + BTF_FIELD_KPTR, +}; + +enum { + BTF_FIELD_IGNORE = 0, + BTF_FIELD_FOUND = 1, +}; + +struct btf_field_info { + u32 type_id; + u32 off; + enum bpf_kptr_type type; +}; + +static int btf_find_struct(const struct btf *btf, const struct btf_type *t, + u32 off, int sz, struct btf_field_info *info) +{ + if (!__btf_type_is_struct(t)) + return BTF_FIELD_IGNORE; + if (t->size != sz) + return BTF_FIELD_IGNORE; + info->off = off; + return BTF_FIELD_FOUND; +} + +static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, + u32 off, int sz, struct btf_field_info *info) +{ + enum bpf_kptr_type type; + u32 res_id; + + /* For PTR, sz is always == 8 */ + if (!btf_type_is_ptr(t)) + return BTF_FIELD_IGNORE; + t = btf_type_by_id(btf, t->type); + + if (!btf_type_is_type_tag(t)) + return BTF_FIELD_IGNORE; + /* Reject extra tags */ + if (btf_type_is_type_tag(btf_type_by_id(btf, t->type))) + return -EINVAL; + if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off))) + type = BPF_KPTR_UNREF; + else if (!strcmp("kptr_ref", __btf_name_by_offset(btf, t->name_off))) + type = BPF_KPTR_REF; + else + return -EINVAL; + + /* Get the base type */ + t = btf_type_skip_modifiers(btf, t->type, &res_id); + /* Only pointer to struct is allowed */ + if (!__btf_type_is_struct(t)) + return -EINVAL; + + info->type_id = res_id; + info->off = off; + info->type = type; + return BTF_FIELD_FOUND; +} + static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align) + const char *name, int sz, int align, + enum btf_field_type field_type, + struct btf_field_info *info, int info_cnt) { const struct btf_member *member; - u32 i, off = -ENOENT; + struct btf_field_info tmp; + int ret, idx = 0; + u32 i, off; for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); - if (!__btf_type_is_struct(member_type)) - continue; - if (member_type->size != sz) - continue; - if (strcmp(__btf_name_by_offset(btf, member_type->name_off), name)) + + if (name && strcmp(__btf_name_by_offset(btf, member_type->name_off), name)) continue; - if (off != -ENOENT) - /* only one such field is allowed */ - return -E2BIG; + off = __btf_member_bit_offset(t, member); if (off % 8) /* valid C code cannot generate such BTF */ @@ -3188,46 +3270,115 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t off /= 8; if (off % align) return -EINVAL; + + switch (field_type) { + case BTF_FIELD_SPIN_LOCK: + case BTF_FIELD_TIMER: + ret = btf_find_struct(btf, member_type, off, sz, + idx < info_cnt ? &info[idx] : &tmp); + if (ret < 0) + return ret; + break; + case BTF_FIELD_KPTR: + ret = btf_find_kptr(btf, member_type, off, sz, + idx < info_cnt ? &info[idx] : &tmp); + if (ret < 0) + return ret; + break; + default: + return -EFAULT; + } + + if (ret == BTF_FIELD_IGNORE) + continue; + if (idx >= info_cnt) + return -E2BIG; + ++idx; } - return off; + return idx; } static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align) + const char *name, int sz, int align, + enum btf_field_type field_type, + struct btf_field_info *info, int info_cnt) { const struct btf_var_secinfo *vsi; - u32 i, off = -ENOENT; + struct btf_field_info tmp; + int ret, idx = 0; + u32 i, off; for_each_vsi(i, t, vsi) { const struct btf_type *var = btf_type_by_id(btf, vsi->type); const struct btf_type *var_type = btf_type_by_id(btf, var->type); - if (!__btf_type_is_struct(var_type)) - continue; - if (var_type->size != sz) + off = vsi->offset; + + if (name && strcmp(__btf_name_by_offset(btf, var_type->name_off), name)) continue; if (vsi->size != sz) continue; - if (strcmp(__btf_name_by_offset(btf, var_type->name_off), name)) - continue; - if (off != -ENOENT) - /* only one such field is allowed */ - return -E2BIG; - off = vsi->offset; if (off % align) return -EINVAL; + + switch (field_type) { + case BTF_FIELD_SPIN_LOCK: + case BTF_FIELD_TIMER: + ret = btf_find_struct(btf, var_type, off, sz, + idx < info_cnt ? &info[idx] : &tmp); + if (ret < 0) + return ret; + break; + case BTF_FIELD_KPTR: + ret = btf_find_kptr(btf, var_type, off, sz, + idx < info_cnt ? &info[idx] : &tmp); + if (ret < 0) + return ret; + break; + default: + return -EFAULT; + } + + if (ret == BTF_FIELD_IGNORE) + continue; + if (idx >= info_cnt) + return -E2BIG; + ++idx; } - return off; + return idx; } static int btf_find_field(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align) + enum btf_field_type field_type, + struct btf_field_info *info, int info_cnt) { + const char *name; + int sz, align; + + switch (field_type) { + case BTF_FIELD_SPIN_LOCK: + name = "bpf_spin_lock"; + sz = sizeof(struct bpf_spin_lock); + align = __alignof__(struct bpf_spin_lock); + break; + case BTF_FIELD_TIMER: + name = "bpf_timer"; + sz = sizeof(struct bpf_timer); + align = __alignof__(struct bpf_timer); + break; + case BTF_FIELD_KPTR: + name = NULL; + sz = sizeof(u64); + align = 8; + break; + default: + return -EFAULT; + } if (__btf_type_is_struct(t)) - return btf_find_struct_field(btf, t, name, sz, align); + return btf_find_struct_field(btf, t, name, sz, align, field_type, info, info_cnt); else if (btf_type_is_datasec(t)) - return btf_find_datasec_var(btf, t, name, sz, align); + return btf_find_datasec_var(btf, t, name, sz, align, field_type, info, info_cnt); return -EINVAL; } @@ -3237,16 +3388,130 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t, */ int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) { - return btf_find_field(btf, t, "bpf_spin_lock", - sizeof(struct bpf_spin_lock), - __alignof__(struct bpf_spin_lock)); + struct btf_field_info info; + int ret; + + ret = btf_find_field(btf, t, BTF_FIELD_SPIN_LOCK, &info, 1); + if (ret < 0) + return ret; + if (!ret) + return -ENOENT; + return info.off; } int btf_find_timer(const struct btf *btf, const struct btf_type *t) { - return btf_find_field(btf, t, "bpf_timer", - sizeof(struct bpf_timer), - __alignof__(struct bpf_timer)); + struct btf_field_info info; + int ret; + + ret = btf_find_field(btf, t, BTF_FIELD_TIMER, &info, 1); + if (ret < 0) + return ret; + if (!ret) + return -ENOENT; + return info.off; +} + +struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, + const struct btf_type *t) +{ + struct btf_field_info info_arr[BPF_MAP_VALUE_OFF_MAX]; + struct bpf_map_value_off *tab; + struct btf *kernel_btf = NULL; + struct module *mod = NULL; + int ret, i, nr_off; + + ret = btf_find_field(btf, t, BTF_FIELD_KPTR, info_arr, ARRAY_SIZE(info_arr)); + if (ret < 0) + return ERR_PTR(ret); + if (!ret) + return NULL; + + nr_off = ret; + tab = kzalloc(offsetof(struct bpf_map_value_off, off[nr_off]), GFP_KERNEL | __GFP_NOWARN); + if (!tab) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < nr_off; i++) { + const struct btf_type *t; + s32 id; + + /* Find type in map BTF, and use it to look up the matching type + * in vmlinux or module BTFs, by name and kind. + */ + t = btf_type_by_id(btf, info_arr[i].type_id); + id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info), + &kernel_btf); + if (id < 0) { + ret = id; + goto end; + } + + /* Find and stash the function pointer for the destruction function that + * needs to be eventually invoked from the map free path. + */ + if (info_arr[i].type == BPF_KPTR_REF) { + const struct btf_type *dtor_func; + const char *dtor_func_name; + unsigned long addr; + s32 dtor_btf_id; + + /* This call also serves as a whitelist of allowed objects that + * can be used as a referenced pointer and be stored in a map at + * the same time. + */ + dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id); + if (dtor_btf_id < 0) { + ret = dtor_btf_id; + goto end_btf; + } + + dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id); + if (!dtor_func) { + ret = -ENOENT; + goto end_btf; + } + + if (btf_is_module(kernel_btf)) { + mod = btf_try_get_module(kernel_btf); + if (!mod) { + ret = -ENXIO; + goto end_btf; + } + } + + /* We already verified dtor_func to be btf_type_is_func + * in register_btf_id_dtor_kfuncs. + */ + dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off); + addr = kallsyms_lookup_name(dtor_func_name); + if (!addr) { + ret = -EINVAL; + goto end_mod; + } + tab->off[i].kptr.dtor = (void *)addr; + } + + tab->off[i].offset = info_arr[i].off; + tab->off[i].type = info_arr[i].type; + tab->off[i].kptr.btf_id = id; + tab->off[i].kptr.btf = kernel_btf; + tab->off[i].kptr.module = mod; + } + tab->nr_off = nr_off; + return tab; +end_mod: + module_put(mod); +end_btf: + btf_put(kernel_btf); +end: + while (i--) { + btf_put(tab->off[i].kptr.btf); + if (tab->off[i].kptr.module) + module_put(tab->off[i].kptr.module); + } + kfree(tab); + return ERR_PTR(ret); } static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, @@ -4541,6 +4806,48 @@ static int btf_parse_hdr(struct btf_verifier_env *env) return 0; } +static int btf_check_type_tags(struct btf_verifier_env *env, + struct btf *btf, int start_id) +{ + int i, n, good_id = start_id - 1; + bool in_tags; + + n = btf_nr_types(btf); + for (i = start_id; i < n; i++) { + const struct btf_type *t; + u32 cur_id = i; + + t = btf_type_by_id(btf, i); + if (!t) + return -EINVAL; + if (!btf_type_is_modifier(t)) + continue; + + cond_resched(); + + in_tags = btf_type_is_type_tag(t); + while (btf_type_is_modifier(t)) { + if (btf_type_is_type_tag(t)) { + if (!in_tags) { + btf_verifier_log(env, "Type tags don't precede modifiers"); + return -EINVAL; + } + } else if (in_tags) { + in_tags = false; + } + if (cur_id <= good_id) + break; + /* Move to next type */ + cur_id = t->type; + t = btf_type_by_id(btf, cur_id); + if (!t) + return -EINVAL; + } + good_id = i; + } + return 0; +} + static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, u32 log_level, char __user *log_ubuf, u32 log_size) { @@ -4608,6 +4915,10 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, if (err) goto errout; + err = btf_check_type_tags(env, btf, 1); + if (err) + goto errout; + if (log->level && bpf_verifier_log_full(log)) { err = -ENOSPC; goto errout; @@ -4716,41 +5027,6 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, return ctx_type; } -static const struct bpf_map_ops * const btf_vmlinux_map_ops[] = { -#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) -#define BPF_LINK_TYPE(_id, _name) -#define BPF_MAP_TYPE(_id, _ops) \ - [_id] = &_ops, -#include <linux/bpf_types.h> -#undef BPF_PROG_TYPE -#undef BPF_LINK_TYPE -#undef BPF_MAP_TYPE -}; - -static int btf_vmlinux_map_ids_init(const struct btf *btf, - struct bpf_verifier_log *log) -{ - const struct bpf_map_ops *ops; - int i, btf_id; - - for (i = 0; i < ARRAY_SIZE(btf_vmlinux_map_ops); ++i) { - ops = btf_vmlinux_map_ops[i]; - if (!ops || (!ops->map_btf_name && !ops->map_btf_id)) - continue; - if (!ops->map_btf_name || !ops->map_btf_id) { - bpf_log(log, "map type %d is misconfigured\n", i); - return -EINVAL; - } - btf_id = btf_find_by_name_kind(btf, ops->map_btf_name, - BTF_KIND_STRUCT); - if (btf_id < 0) - return btf_id; - *ops->map_btf_id = btf_id; - } - - return 0; -} - static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *t, @@ -4809,14 +5085,13 @@ struct btf *btf_parse_vmlinux(void) if (err) goto errout; + err = btf_check_type_tags(env, btf, 1); + if (err) + goto errout; + /* btf_parse_vmlinux() runs under bpf_verifier_lock */ bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]); - /* find bpf map structs for map_ptr access checking */ - err = btf_vmlinux_map_ids_init(btf, log); - if (err < 0) - goto errout; - bpf_struct_ops_init(btf, log); refcount_set(&btf->refcnt, 1); @@ -4894,6 +5169,10 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u if (err) goto errout; + err = btf_check_type_tags(env, btf, btf_nr_types(base_btf)); + if (err) + goto errout; + btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); return btf; @@ -5429,7 +5708,8 @@ static bool btf_types_are_same(const struct btf *btf1, u32 id1, bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, - const struct btf *need_btf, u32 need_type_id) + const struct btf *need_btf, u32 need_type_id, + bool strict) { const struct btf_type *type; enum bpf_type_flag flag; @@ -5438,7 +5718,12 @@ bool btf_struct_ids_match(struct bpf_verifier_log *log, /* Are we already done? */ if (off == 0 && btf_types_are_same(btf, id, need_btf, need_type_id)) return true; - + /* In case of strict type match, we do not walk struct, the top level + * type match must succeed. When strict is true, off should have already + * been 0. + */ + if (strict) + return false; again: type = btf_type_by_id(btf, id); if (!type) @@ -5772,11 +6057,11 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, struct bpf_verifier_log *log = &env->log; u32 i, nargs, ref_id, ref_obj_id = 0; bool is_kfunc = btf_is_kernel(btf); + bool rel = false, kptr_get = false; const char *func_name, *ref_tname; const struct btf_type *t, *ref_t; const struct btf_param *args; int ref_regno = 0, ret; - bool rel = false; t = btf_type_by_id(btf, func_id); if (!t || !btf_type_is_func(t)) { @@ -5802,14 +6087,19 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } - /* Only kfunc can be release func */ - if (is_kfunc) + if (is_kfunc) { + /* Only kfunc can be release func */ rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog), BTF_KFUNC_TYPE_RELEASE, func_id); + kptr_get = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_KPTR_ACQUIRE, func_id); + } + /* check that BTF function arguments match actual types that the * verifier sees. */ for (i = 0; i < nargs; i++) { + enum bpf_arg_type arg_type = ARG_DONTCARE; u32 regno = i + 1; struct bpf_reg_state *reg = ®s[regno]; @@ -5830,12 +6120,58 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE, rel); + if (rel && reg->ref_obj_id) + arg_type |= OBJ_RELEASE; + ret = check_func_arg_reg_off(env, reg, regno, arg_type); if (ret < 0) return ret; - if (btf_get_prog_ctx_type(log, btf, t, - env->prog->type, i)) { + /* kptr_get is only true for kfunc */ + if (i == 0 && kptr_get) { + struct bpf_map_value_off_desc *off_desc; + + if (reg->type != PTR_TO_MAP_VALUE) { + bpf_log(log, "arg#0 expected pointer to map value\n"); + return -EINVAL; + } + + /* check_func_arg_reg_off allows var_off for + * PTR_TO_MAP_VALUE, but we need fixed offset to find + * off_desc. + */ + if (!tnum_is_const(reg->var_off)) { + bpf_log(log, "arg#0 must have constant offset\n"); + return -EINVAL; + } + + off_desc = bpf_map_kptr_off_contains(reg->map_ptr, reg->off + reg->var_off.value); + if (!off_desc || off_desc->type != BPF_KPTR_REF) { + bpf_log(log, "arg#0 no referenced kptr at map value offset=%llu\n", + reg->off + reg->var_off.value); + return -EINVAL; + } + + if (!btf_type_is_ptr(ref_t)) { + bpf_log(log, "arg#0 BTF type must be a double pointer\n"); + return -EINVAL; + } + + ref_t = btf_type_skip_modifiers(btf, ref_t->type, &ref_id); + ref_tname = btf_name_by_offset(btf, ref_t->name_off); + + if (!btf_type_is_struct(ref_t)) { + bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n", + func_name, i, btf_type_str(ref_t), ref_tname); + return -EINVAL; + } + if (!btf_struct_ids_match(log, btf, ref_id, 0, off_desc->kptr.btf, + off_desc->kptr.btf_id, true)) { + bpf_log(log, "kernel function %s args#%d expected pointer to %s %s\n", + func_name, i, btf_type_str(ref_t), ref_tname); + return -EINVAL; + } + /* rest of the arguments can be anything, like normal kfunc */ + } else if (btf_get_prog_ctx_type(log, btf, t, env->prog->type, i)) { /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. */ @@ -5862,11 +6198,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (reg->type == PTR_TO_BTF_ID) { reg_btf = reg->btf; reg_ref_id = reg->btf_id; - /* Ensure only one argument is referenced - * PTR_TO_BTF_ID, check_func_arg_reg_off relies - * on only one referenced register being allowed - * for kfuncs. - */ + /* Ensure only one argument is referenced PTR_TO_BTF_ID */ if (reg->ref_obj_id) { if (ref_obj_id) { bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", @@ -5886,7 +6218,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off); if (!btf_struct_ids_match(log, reg_btf, reg_ref_id, - reg->off, btf, ref_id)) { + reg->off, btf, ref_id, rel && reg->ref_obj_id)) { bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", func_name, i, btf_type_str(ref_t), ref_tname, @@ -6780,6 +7112,10 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) return BTF_KFUNC_HOOK_TC; case BPF_PROG_TYPE_STRUCT_OPS: return BTF_KFUNC_HOOK_STRUCT_OPS; + case BPF_PROG_TYPE_TRACING: + return BTF_KFUNC_HOOK_TRACING; + case BPF_PROG_TYPE_SYSCALL: + return BTF_KFUNC_HOOK_SYSCALL; default: return BTF_KFUNC_HOOK_MAX; } @@ -6832,6 +7168,138 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, } EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set); +s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id) +{ + struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab; + struct btf_id_dtor_kfunc *dtor; + + if (!tab) + return -ENOENT; + /* Even though the size of tab->dtors[0] is > sizeof(u32), we only need + * to compare the first u32 with btf_id, so we can reuse btf_id_cmp_func. + */ + BUILD_BUG_ON(offsetof(struct btf_id_dtor_kfunc, btf_id) != 0); + dtor = bsearch(&btf_id, tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func); + if (!dtor) + return -ENOENT; + return dtor->kfunc_btf_id; +} + +static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc *dtors, u32 cnt) +{ + const struct btf_type *dtor_func, *dtor_func_proto, *t; + const struct btf_param *args; + s32 dtor_btf_id; + u32 nr_args, i; + + for (i = 0; i < cnt; i++) { + dtor_btf_id = dtors[i].kfunc_btf_id; + + dtor_func = btf_type_by_id(btf, dtor_btf_id); + if (!dtor_func || !btf_type_is_func(dtor_func)) + return -EINVAL; + + dtor_func_proto = btf_type_by_id(btf, dtor_func->type); + if (!dtor_func_proto || !btf_type_is_func_proto(dtor_func_proto)) + return -EINVAL; + + /* Make sure the prototype of the destructor kfunc is 'void func(type *)' */ + t = btf_type_by_id(btf, dtor_func_proto->type); + if (!t || !btf_type_is_void(t)) + return -EINVAL; + + nr_args = btf_type_vlen(dtor_func_proto); + if (nr_args != 1) + return -EINVAL; + args = btf_params(dtor_func_proto); + t = btf_type_by_id(btf, args[0].type); + /* Allow any pointer type, as width on targets Linux supports + * will be same for all pointer types (i.e. sizeof(void *)) + */ + if (!t || !btf_type_is_ptr(t)) + return -EINVAL; + } + return 0; +} + +/* This function must be invoked only from initcalls/module init functions */ +int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, + struct module *owner) +{ + struct btf_id_dtor_kfunc_tab *tab; + struct btf *btf; + u32 tab_cnt; + int ret; + + btf = btf_get_module_btf(owner); + if (!btf) { + if (!owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { + pr_err("missing vmlinux BTF, cannot register dtor kfuncs\n"); + return -ENOENT; + } + if (owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) { + pr_err("missing module BTF, cannot register dtor kfuncs\n"); + return -ENOENT; + } + return 0; + } + if (IS_ERR(btf)) + return PTR_ERR(btf); + + if (add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) { + pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT); + ret = -E2BIG; + goto end; + } + + /* Ensure that the prototype of dtor kfuncs being registered is sane */ + ret = btf_check_dtor_kfuncs(btf, dtors, add_cnt); + if (ret < 0) + goto end; + + tab = btf->dtor_kfunc_tab; + /* Only one call allowed for modules */ + if (WARN_ON_ONCE(tab && btf_is_module(btf))) { + ret = -EINVAL; + goto end; + } + + tab_cnt = tab ? tab->cnt : 0; + if (tab_cnt > U32_MAX - add_cnt) { + ret = -EOVERFLOW; + goto end; + } + if (tab_cnt + add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) { + pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT); + ret = -E2BIG; + goto end; + } + + tab = krealloc(btf->dtor_kfunc_tab, + offsetof(struct btf_id_dtor_kfunc_tab, dtors[tab_cnt + add_cnt]), + GFP_KERNEL | __GFP_NOWARN); + if (!tab) { + ret = -ENOMEM; + goto end; + } + + if (!btf->dtor_kfunc_tab) + tab->cnt = 0; + btf->dtor_kfunc_tab = tab; + + memcpy(tab->dtors + tab->cnt, dtors, add_cnt * sizeof(tab->dtors[0])); + tab->cnt += add_cnt; + + sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL); + + return 0; +end: + btf_free_dtor_kfunc_tab(btf); + btf_put(btf); + return ret; +} +EXPORT_SYMBOL_GPL(register_btf_id_dtor_kfuncs); + #define MAX_TYPES_ARE_COMPAT_DEPTH 2 static diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 128028efda64..afb414b26d01 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -22,6 +22,45 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); +/* __always_inline is necessary to prevent indirect call through run_prog + * function pointer. + */ +static __always_inline int +bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, + enum cgroup_bpf_attach_type atype, + const void *ctx, bpf_prog_run_fn run_prog, + int retval, u32 *ret_flags) +{ + const struct bpf_prog_array_item *item; + const struct bpf_prog *prog; + const struct bpf_prog_array *array; + struct bpf_run_ctx *old_run_ctx; + struct bpf_cg_run_ctx run_ctx; + u32 func_ret; + + run_ctx.retval = retval; + migrate_disable(); + rcu_read_lock(); + array = rcu_dereference(cgrp->effective[atype]); + item = &array->items[0]; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + while ((prog = READ_ONCE(item->prog))) { + run_ctx.prog_item = item; + func_ret = run_prog(prog, ctx); + if (ret_flags) { + *(ret_flags) |= (func_ret >> 1); + func_ret &= 1; + } + if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval)) + run_ctx.retval = -EPERM; + item++; + } + bpf_reset_run_ctx(old_run_ctx); + rcu_read_unlock(); + migrate_enable(); + return run_ctx.retval; +} + void cgroup_bpf_offline(struct cgroup *cgrp) { cgroup_get(cgrp); @@ -1075,11 +1114,38 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, bpf_compute_and_save_data_end(skb, &saved_data_end); if (atype == CGROUP_INET_EGRESS) { - ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( - cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); + u32 flags = 0; + bool cn; + + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb, + __bpf_prog_run_save_cb, 0, &flags); + + /* Return values of CGROUP EGRESS BPF programs are: + * 0: drop packet + * 1: keep packet + * 2: drop packet and cn + * 3: keep packet and cn + * + * The returned value is then converted to one of the NET_XMIT + * or an error code that is then interpreted as drop packet + * (and no cn): + * 0: NET_XMIT_SUCCESS skb should be transmitted + * 1: NET_XMIT_DROP skb should be dropped and cn + * 2: NET_XMIT_CN skb should be transmitted and cn + * 3: -err skb should be dropped + */ + + cn = flags & BPF_RET_SET_CN; + if (ret && !IS_ERR_VALUE((long)ret)) + ret = -EFAULT; + if (!ret) + ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); + else + ret = (cn ? NET_XMIT_DROP : ret); } else { - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb, - __bpf_prog_run_save_cb, 0); + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, + skb, __bpf_prog_run_save_cb, 0, + NULL); if (ret && !IS_ERR_VALUE((long)ret)) ret = -EFAULT; } @@ -1109,8 +1175,8 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, - bpf_prog_run, 0); + return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0, + NULL); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1155,8 +1221,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run, 0, flags); + return bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, + 0, flags); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); @@ -1182,8 +1248,8 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops, - bpf_prog_run, 0); + return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run, + 0, NULL); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); @@ -1200,8 +1266,8 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run, 0); + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0, + NULL); rcu_read_unlock(); return ret; @@ -1366,8 +1432,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run, 0); + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0, + NULL); rcu_read_unlock(); kfree(ctx.cur_val); @@ -1459,8 +1525,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT], - &ctx, bpf_prog_run, 0); + ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT, + &ctx, bpf_prog_run, 0, NULL); release_sock(sk); if (ret) @@ -1559,8 +1625,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], - &ctx, bpf_prog_run, retval); + ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT, + &ctx, bpf_prog_run, retval, NULL); release_sock(sk); if (ret < 0) @@ -1608,8 +1674,8 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, * be called if that data shouldn't be "exported". */ - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], - &ctx, bpf_prog_run, retval); + ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT, + &ctx, bpf_prog_run, retval, NULL); if (ret < 0) return ret; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 13e9dbeeedf3..cacd8684c3c4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -873,7 +873,7 @@ static size_t select_bpf_prog_pack_size(void) return size; } -static struct bpf_prog_pack *alloc_new_pack(void) +static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_prog_pack *pack; @@ -886,6 +886,7 @@ static struct bpf_prog_pack *alloc_new_pack(void) kfree(pack); return NULL; } + bpf_fill_ill_insns(pack->ptr, bpf_prog_pack_size); bitmap_zero(pack->bitmap, bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE); list_add_tail(&pack->list, &pack_list); @@ -895,7 +896,7 @@ static struct bpf_prog_pack *alloc_new_pack(void) return pack; } -static void *bpf_prog_pack_alloc(u32 size) +static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) { unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size); struct bpf_prog_pack *pack; @@ -910,6 +911,7 @@ static void *bpf_prog_pack_alloc(u32 size) size = round_up(size, PAGE_SIZE); ptr = module_alloc(size); if (ptr) { + bpf_fill_ill_insns(ptr, size); set_vm_flush_reset_perms(ptr); set_memory_ro((unsigned long)ptr, size / PAGE_SIZE); set_memory_x((unsigned long)ptr, size / PAGE_SIZE); @@ -923,7 +925,7 @@ static void *bpf_prog_pack_alloc(u32 size) goto found_free_area; } - pack = alloc_new_pack(); + pack = alloc_new_pack(bpf_fill_ill_insns); if (!pack) goto out; @@ -966,6 +968,9 @@ static void bpf_prog_pack_free(struct bpf_binary_header *hdr) nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size); pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT; + WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size), + "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n"); + bitmap_clear(pack->bitmap, pos, nbits); if (bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0, bpf_prog_chunk_count(), 0) == 0) { @@ -1102,7 +1107,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, if (bpf_jit_charge_modmem(size)) return NULL; - ro_header = bpf_prog_pack_alloc(size); + ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns); if (!ro_header) { bpf_jit_uncharge_modmem(size); return NULL; @@ -1434,6 +1439,16 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) insn = clone->insnsi; for (i = 0; i < insn_cnt; i++, insn++) { + if (bpf_pseudo_func(insn)) { + /* ld_imm64 with an address of bpf subprog is not + * a user controlled constant. Don't randomize it, + * since it will conflict with jit_subprogs() logic. + */ + insn++; + i++; + continue; + } + /* We temporarily need to hold the original ld64 insn * so that we can still access the first part in the * second blinding run. @@ -2619,6 +2634,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_map_push_elem_proto __weak; const struct bpf_func_proto bpf_map_pop_elem_proto __weak; const struct bpf_func_proto bpf_map_peek_elem_proto __weak; +const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak; const struct bpf_func_proto bpf_spin_lock_proto __weak; const struct bpf_func_proto bpf_spin_unlock_proto __weak; const struct bpf_func_proto bpf_jiffies64_proto __weak; @@ -2727,6 +2743,11 @@ void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len) return ERR_PTR(-ENOTSUPP); } +int __weak bpf_arch_text_invalidate(void *dst, size_t len) +{ + return -ENOTSUPP; +} + DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 650e5d21f90d..f4860ac756cd 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -27,6 +27,7 @@ #include <linux/kthread.h> #include <linux/capability.h> #include <trace/events/xdp.h> +#include <linux/btf_ids.h> #include <linux/netdevice.h> /* netif_receive_skb_list */ #include <linux/etherdevice.h> /* eth_type_trans */ @@ -673,7 +674,7 @@ static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) __cpu_map_lookup_elem); } -static int cpu_map_btf_id; +BTF_ID_LIST_SINGLE(cpu_map_btf_ids, struct, bpf_cpu_map) const struct bpf_map_ops cpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = cpu_map_alloc, @@ -683,8 +684,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_lookup_elem = cpu_map_lookup_elem, .map_get_next_key = cpu_map_get_next_key, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_cpu_map", - .map_btf_id = &cpu_map_btf_id, + .map_btf_id = &cpu_map_btf_ids[0], .map_redirect = cpu_map_redirect, }; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 038f6d7a83e4..c2867068e5bd 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -48,6 +48,7 @@ #include <net/xdp.h> #include <linux/filter.h> #include <trace/events/xdp.h> +#include <linux/btf_ids.h> #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) @@ -1005,7 +1006,7 @@ static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) __dev_map_hash_lookup_elem); } -static int dev_map_btf_id; +BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab) const struct bpf_map_ops dev_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = dev_map_alloc, @@ -1015,12 +1016,10 @@ const struct bpf_map_ops dev_map_ops = { .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_dtab", - .map_btf_id = &dev_map_btf_id, + .map_btf_id = &dev_map_btf_ids[0], .map_redirect = dev_map_redirect, }; -static int dev_map_hash_map_btf_id; const struct bpf_map_ops dev_map_hash_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = dev_map_alloc, @@ -1030,8 +1029,7 @@ const struct bpf_map_ops dev_map_hash_ops = { .map_update_elem = dev_map_hash_update_elem, .map_delete_elem = dev_map_hash_delete_elem, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_dtab", - .map_btf_id = &dev_map_hash_map_btf_id, + .map_btf_id = &dev_map_btf_ids[0], .map_redirect = dev_hash_map_redirect, }; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 65877967f414..17fb69c0e0dc 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -10,6 +10,7 @@ #include <linux/random.h> #include <uapi/linux/btf.h> #include <linux/rcupdate_trace.h> +#include <linux/btf_ids.h> #include "percpu_freelist.h" #include "bpf_lru_list.h" #include "map_in_map.h" @@ -139,7 +140,7 @@ static inline bool htab_use_raw_lock(const struct bpf_htab *htab) static void htab_init_buckets(struct bpf_htab *htab) { - unsigned i; + unsigned int i; for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); @@ -238,7 +239,7 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) u32 num_entries = htab->map.max_entries; int i; - if (likely(!map_value_has_timer(&htab->map))) + if (!map_value_has_timer(&htab->map)) return; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); @@ -254,6 +255,25 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) } } +static void htab_free_prealloced_kptrs(struct bpf_htab *htab) +{ + u32 num_entries = htab->map.max_entries; + int i; + + if (!map_value_has_kptrs(&htab->map)) + return; + if (htab_has_extra_elems(htab)) + num_entries += num_possible_cpus(); + + for (i = 0; i < num_entries; i++) { + struct htab_elem *elem; + + elem = get_htab_elem(htab, i); + bpf_map_free_kptrs(&htab->map, elem->key + round_up(htab->map.key_size, 8)); + cond_resched(); + } +} + static void htab_free_elems(struct bpf_htab *htab) { int i; @@ -725,12 +745,15 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map, return insn - insn_buf; } -static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem) +static void check_and_free_fields(struct bpf_htab *htab, + struct htab_elem *elem) { - if (unlikely(map_value_has_timer(&htab->map))) - bpf_timer_cancel_and_free(elem->key + - round_up(htab->map.key_size, 8) + - htab->map.timer_off); + void *map_value = elem->key + round_up(htab->map.key_size, 8); + + if (map_value_has_timer(&htab->map)) + bpf_timer_cancel_and_free(map_value + htab->map.timer_off); + if (map_value_has_kptrs(&htab->map)) + bpf_map_free_kptrs(&htab->map, map_value); } /* It is called from the bpf_lru_list when the LRU needs to delete @@ -738,7 +761,7 @@ static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem) */ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) { - struct bpf_htab *htab = (struct bpf_htab *)arg; + struct bpf_htab *htab = arg; struct htab_elem *l = NULL, *tgt_l; struct hlist_nulls_head *head; struct hlist_nulls_node *n; @@ -757,7 +780,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l == tgt_l) { hlist_nulls_del_rcu(&l->hash_node); - check_and_free_timer(htab, l); + check_and_free_fields(htab, l); break; } @@ -829,7 +852,7 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); - check_and_free_timer(htab, l); + check_and_free_fields(htab, l); kfree(l); } @@ -857,7 +880,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) htab_put_fd_value(htab, l); if (htab_is_prealloc(htab)) { - check_and_free_timer(htab, l); + check_and_free_fields(htab, l); __pcpu_freelist_push(&htab->freelist, &l->fnode); } else { atomic_dec(&htab->count); @@ -1104,7 +1127,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (!htab_is_prealloc(htab)) free_htab_elem(htab, l_old); else - check_and_free_timer(htab, l_old); + check_and_free_fields(htab, l_old); } ret = 0; err: @@ -1114,7 +1137,7 @@ err: static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem) { - check_and_free_timer(htab, elem); + check_and_free_fields(htab, elem); bpf_lru_push_free(&htab->lru, &elem->lru_node); } @@ -1419,8 +1442,14 @@ static void htab_free_malloced_timers(struct bpf_htab *htab) struct hlist_nulls_node *n; struct htab_elem *l; - hlist_nulls_for_each_entry(l, n, head, hash_node) - check_and_free_timer(htab, l); + hlist_nulls_for_each_entry(l, n, head, hash_node) { + /* We don't reset or free kptr on uref dropping to zero, + * hence just free timer. + */ + bpf_timer_cancel_and_free(l->key + + round_up(htab->map.key_size, 8) + + htab->map.timer_off); + } cond_resched_rcu(); } rcu_read_unlock(); @@ -1430,7 +1459,8 @@ static void htab_map_free_timers(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - if (likely(!map_value_has_timer(&htab->map))) + /* We don't reset or free kptr on uref dropping to zero. */ + if (!map_value_has_timer(&htab->map)) return; if (!htab_is_prealloc(htab)) htab_free_malloced_timers(htab); @@ -1453,11 +1483,14 @@ static void htab_map_free(struct bpf_map *map) * not have executed. Wait for them. */ rcu_barrier(); - if (!htab_is_prealloc(htab)) + if (!htab_is_prealloc(htab)) { delete_all_elements(htab); - else + } else { + htab_free_prealloced_kptrs(htab); prealloc_destroy(htab); + } + bpf_map_free_kptr_off_tab(map); free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) @@ -1594,7 +1627,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, void __user *uvalues = u64_to_user_ptr(attr->batch.values); void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); - u32 batch, max_count, size, bucket_size; + u32 batch, max_count, size, bucket_size, map_id; struct htab_elem *node_to_free = NULL; u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; @@ -1719,6 +1752,14 @@ again_nocopy: } } else { value = l->key + roundup_key_size; + if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + struct bpf_map **inner_map = value; + + /* Actual value is the id of the inner map */ + map_id = map->ops->map_fd_sys_lookup_elem(*inner_map); + value = &map_id; + } + if (elem_map_flags & BPF_F_LOCK) copy_map_value_locked(map, dst_val, value, true); @@ -2105,7 +2146,7 @@ out: return num_elems; } -static int htab_map_btf_id; +BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab) const struct bpf_map_ops htab_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, @@ -2122,12 +2163,10 @@ const struct bpf_map_ops htab_map_ops = { .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab), - .map_btf_name = "bpf_htab", - .map_btf_id = &htab_map_btf_id, + .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; -static int htab_lru_map_btf_id; const struct bpf_map_ops htab_lru_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, @@ -2145,8 +2184,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_lru), - .map_btf_name = "bpf_htab", - .map_btf_id = &htab_lru_map_btf_id, + .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; @@ -2161,6 +2199,20 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) +{ + struct htab_elem *l; + + if (cpu >= nr_cpu_ids) + return NULL; + + l = __htab_map_lookup_elem(map, key); + if (l) + return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); + else + return NULL; +} + static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) { struct htab_elem *l = __htab_map_lookup_elem(map, key); @@ -2173,6 +2225,22 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) +{ + struct htab_elem *l; + + if (cpu >= nr_cpu_ids) + return NULL; + + l = __htab_map_lookup_elem(map, key); + if (l) { + bpf_lru_node_set_ref(&l->lru_node); + return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); + } + + return NULL; +} + int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) { struct htab_elem *l; @@ -2252,7 +2320,6 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int htab_percpu_map_btf_id; const struct bpf_map_ops htab_percpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, @@ -2263,16 +2330,15 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, + .map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_percpu), - .map_btf_name = "bpf_htab", - .map_btf_id = &htab_percpu_map_btf_id, + .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; -static int htab_lru_percpu_map_btf_id; const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, @@ -2283,12 +2349,12 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, + .map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_lru_percpu), - .map_btf_name = "bpf_htab", - .map_btf_id = &htab_lru_percpu_map_btf_id, + .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; @@ -2412,7 +2478,6 @@ static void htab_of_map_free(struct bpf_map *map) fd_htab_map_free(map); } -static int htab_of_maps_map_btf_id; const struct bpf_map_ops htab_of_maps_map_ops = { .map_alloc_check = fd_htab_map_alloc_check, .map_alloc = htab_of_map_alloc, @@ -2425,6 +2490,6 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = htab_of_map_gen_lookup, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_htab", - .map_btf_id = &htab_of_maps_map_btf_id, + BATCH_OPS(htab), + .map_btf_id = &htab_map_btf_ids[0], }; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 315053ef6a75..225806a02efb 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -103,7 +103,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, + .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT, }; BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) @@ -116,7 +116,23 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, + .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT, +}; + +BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) +{ + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu); +} + +const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = { + .func = bpf_map_lookup_percpu_elem, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, + .arg3_type = ARG_ANYTHING, }; const struct bpf_func_proto bpf_get_prandom_u32_proto = { @@ -1374,6 +1390,191 @@ out: kfree(t); } +BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) +{ + unsigned long *kptr = map_value; + + return xchg(kptr, (unsigned long)ptr); +} + +/* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg() + * helper is determined dynamically by the verifier. + */ +#define BPF_PTR_POISON ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) + +const struct bpf_func_proto bpf_kptr_xchg_proto = { + .func = bpf_kptr_xchg, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = BPF_PTR_POISON, + .arg1_type = ARG_PTR_TO_KPTR, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE, + .arg2_btf_id = BPF_PTR_POISON, +}; + +/* Since the upper 8 bits of dynptr->size is reserved, the + * maximum supported size is 2^24 - 1. + */ +#define DYNPTR_MAX_SIZE ((1UL << 24) - 1) +#define DYNPTR_TYPE_SHIFT 28 +#define DYNPTR_SIZE_MASK 0xFFFFFF +#define DYNPTR_RDONLY_BIT BIT(31) + +static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr) +{ + return ptr->size & DYNPTR_RDONLY_BIT; +} + +static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type) +{ + ptr->size |= type << DYNPTR_TYPE_SHIFT; +} + +static u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr) +{ + return ptr->size & DYNPTR_SIZE_MASK; +} + +int bpf_dynptr_check_size(u32 size) +{ + return size > DYNPTR_MAX_SIZE ? -E2BIG : 0; +} + +void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, + enum bpf_dynptr_type type, u32 offset, u32 size) +{ + ptr->data = data; + ptr->offset = offset; + ptr->size = size; + bpf_dynptr_set_type(ptr, type); +} + +void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) +{ + memset(ptr, 0, sizeof(*ptr)); +} + +static int bpf_dynptr_check_off_len(struct bpf_dynptr_kern *ptr, u32 offset, u32 len) +{ + u32 size = bpf_dynptr_get_size(ptr); + + if (len > size || offset > size - len) + return -E2BIG; + + return 0; +} + +BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr) +{ + int err; + + err = bpf_dynptr_check_size(size); + if (err) + goto error; + + /* flags is currently unsupported */ + if (flags) { + err = -EINVAL; + goto error; + } + + bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_LOCAL, 0, size); + + return 0; + +error: + bpf_dynptr_set_null(ptr); + return err; +} + +const struct bpf_func_proto bpf_dynptr_from_mem_proto = { + .func = bpf_dynptr_from_mem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, +}; + +BPF_CALL_4(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, u32, offset) +{ + int err; + + if (!src->data) + return -EINVAL; + + err = bpf_dynptr_check_off_len(src, offset, len); + if (err) + return err; + + memcpy(dst, src->data + src->offset + offset, len); + + return 0; +} + +const struct bpf_func_proto bpf_dynptr_read_proto = { + .func = bpf_dynptr_read, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_DYNPTR, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len) +{ + int err; + + if (!dst->data || bpf_dynptr_is_rdonly(dst)) + return -EINVAL; + + err = bpf_dynptr_check_off_len(dst, offset, len); + if (err) + return err; + + memcpy(dst->data + dst->offset + offset, src, len); + + return 0; +} + +const struct bpf_func_proto bpf_dynptr_write_proto = { + .func = bpf_dynptr_write, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_DYNPTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE_OR_ZERO, +}; + +BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) +{ + int err; + + if (!ptr->data) + return 0; + + err = bpf_dynptr_check_off_len(ptr, offset, len); + if (err) + return 0; + + if (bpf_dynptr_is_rdonly(ptr)) + return 0; + + return (unsigned long)(ptr->data + ptr->offset + offset); +} + +const struct bpf_func_proto bpf_dynptr_data_proto = { + .func = bpf_dynptr_data, + .gpl_only = false, + .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, + .arg1_type = ARG_PTR_TO_DYNPTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_get_current_task_btf_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; @@ -1398,6 +1599,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_map_pop_elem_proto; case BPF_FUNC_map_peek_elem: return &bpf_map_peek_elem_proto; + case BPF_FUNC_map_lookup_percpu_elem: + return &bpf_map_lookup_percpu_elem_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: @@ -1420,12 +1623,26 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; + case BPF_FUNC_ringbuf_reserve_dynptr: + return &bpf_ringbuf_reserve_dynptr_proto; + case BPF_FUNC_ringbuf_submit_dynptr: + return &bpf_ringbuf_submit_dynptr_proto; + case BPF_FUNC_ringbuf_discard_dynptr: + return &bpf_ringbuf_discard_dynptr_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; case BPF_FUNC_loop: return &bpf_loop_proto; case BPF_FUNC_strncmp: return &bpf_strncmp_proto; + case BPF_FUNC_dynptr_from_mem: + return &bpf_dynptr_from_mem_proto; + case BPF_FUNC_dynptr_read: + return &bpf_dynptr_read_proto; + case BPF_FUNC_dynptr_write: + return &bpf_dynptr_write_proto; + case BPF_FUNC_dynptr_data: + return &bpf_dynptr_data_proto; default: break; } @@ -1452,6 +1669,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_timer_start_proto; case BPF_FUNC_timer_cancel: return &bpf_timer_cancel_proto; + case BPF_FUNC_kptr_xchg: + return &bpf_kptr_xchg_proto; default: break; } diff --git a/kernel/bpf/link_iter.c b/kernel/bpf/link_iter.c new file mode 100644 index 000000000000..fec8005a121c --- /dev/null +++ b/kernel/bpf/link_iter.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2022 Red Hat, Inc. */ +#include <linux/bpf.h> +#include <linux/fs.h> +#include <linux/filter.h> +#include <linux/kernel.h> +#include <linux/btf_ids.h> + +struct bpf_iter_seq_link_info { + u32 link_id; +}; + +static void *bpf_link_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_link_info *info = seq->private; + struct bpf_link *link; + + link = bpf_link_get_curr_or_next(&info->link_id); + if (!link) + return NULL; + + if (*pos == 0) + ++*pos; + return link; +} + +static void *bpf_link_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_link_info *info = seq->private; + + ++*pos; + ++info->link_id; + bpf_link_put((struct bpf_link *)v); + return bpf_link_get_curr_or_next(&info->link_id); +} + +struct bpf_iter__bpf_link { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_link *, link); +}; + +DEFINE_BPF_ITER_FUNC(bpf_link, struct bpf_iter_meta *meta, struct bpf_link *link) + +static int __bpf_link_seq_show(struct seq_file *seq, void *v, bool in_stop) +{ + struct bpf_iter__bpf_link ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + ctx.meta = &meta; + ctx.link = v; + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + return ret; +} + +static int bpf_link_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_link_seq_show(seq, v, false); +} + +static void bpf_link_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__bpf_link_seq_show(seq, v, true); + else + bpf_link_put((struct bpf_link *)v); +} + +static const struct seq_operations bpf_link_seq_ops = { + .start = bpf_link_seq_start, + .next = bpf_link_seq_next, + .stop = bpf_link_seq_stop, + .show = bpf_link_seq_show, +}; + +BTF_ID_LIST(btf_bpf_link_id) +BTF_ID(struct, bpf_link) + +static const struct bpf_iter_seq_info bpf_link_seq_info = { + .seq_ops = &bpf_link_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_link_info), +}; + +static struct bpf_iter_reg bpf_link_reg_info = { + .target = "bpf_link", + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_link, link), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &bpf_link_seq_info, +}; + +static int __init bpf_link_iter_init(void) +{ + bpf_link_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_link_id; + return bpf_iter_reg_target(&bpf_link_reg_info); +} + +late_initcall(bpf_link_iter_init); diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 497916060ac7..8654fc97f5fe 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -9,6 +9,7 @@ #include <linux/rbtree.h> #include <linux/slab.h> #include <uapi/linux/btf.h> +#include <linux/btf_ids.h> #ifdef CONFIG_CGROUP_BPF @@ -446,7 +447,8 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int cgroup_storage_map_btf_id; +BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, + bpf_cgroup_storage_map) const struct bpf_map_ops cgroup_storage_map_ops = { .map_alloc = cgroup_storage_map_alloc, .map_free = cgroup_storage_map_free, @@ -456,8 +458,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_delete_elem = cgroup_storage_delete_elem, .map_check_btf = cgroup_storage_check_btf, .map_seq_show_elem = cgroup_storage_seq_show_elem, - .map_btf_name = "bpf_cgroup_storage_map", - .map_btf_id = &cgroup_storage_map_btf_id, + .map_btf_id = &cgroup_storage_map_btf_ids[0], }; int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 5763cc7ac4f1..f0d05a3cc4b9 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -14,6 +14,7 @@ #include <linux/vmalloc.h> #include <net/ipv6.h> #include <uapi/linux/btf.h> +#include <linux/btf_ids.h> /* Intermediate node */ #define LPM_TREE_NODE_FLAG_IM BIT(0) @@ -719,7 +720,7 @@ static int trie_check_btf(const struct bpf_map *map, -EINVAL : 0; } -static int trie_map_btf_id; +BTF_ID_LIST_SINGLE(trie_map_btf_ids, struct, lpm_trie) const struct bpf_map_ops trie_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = trie_alloc, @@ -732,6 +733,5 @@ const struct bpf_map_ops trie_map_ops = { .map_update_batch = generic_map_update_batch, .map_delete_batch = generic_map_delete_batch, .map_check_btf = trie_check_btf, - .map_btf_name = "lpm_trie", - .map_btf_id = &trie_map_btf_id, + .map_btf_id = &trie_map_btf_ids[0], }; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 5cd8f5277279..135205d0d560 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -52,6 +52,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->max_entries = inner_map->max_entries; inner_map_meta->spin_lock_off = inner_map->spin_lock_off; inner_map_meta->timer_off = inner_map->timer_off; + inner_map_meta->kptr_off_tab = bpf_map_copy_kptr_off_tab(inner_map); if (inner_map->btf) { btf_get(inner_map->btf); inner_map_meta->btf = inner_map->btf; @@ -71,6 +72,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) void bpf_map_meta_free(struct bpf_map *map_meta) { + bpf_map_free_kptr_off_tab(map_meta); btf_put(map_meta->btf); kfree(map_meta); } @@ -83,7 +85,8 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, meta0->key_size == meta1->key_size && meta0->value_size == meta1->value_size && meta0->timer_off == meta1->timer_off && - meta0->map_flags == meta1->map_flags; + meta0->map_flags == meta1->map_flags && + bpf_map_equal_kptr_off_tab(meta0, meta1); } void *bpf_map_fd_get_ptr(struct bpf_map *map, diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index f9c734aaa990..a1c0794ae49d 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -8,6 +8,7 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/capability.h> +#include <linux/btf_ids.h> #include "percpu_freelist.h" #define QUEUE_STACK_CREATE_FLAG_MASK \ @@ -247,7 +248,7 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, return -EINVAL; } -static int queue_map_btf_id; +BTF_ID_LIST_SINGLE(queue_map_btf_ids, struct, bpf_queue_stack) const struct bpf_map_ops queue_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = queue_stack_map_alloc_check, @@ -260,11 +261,9 @@ const struct bpf_map_ops queue_map_ops = { .map_pop_elem = queue_map_pop_elem, .map_peek_elem = queue_map_peek_elem, .map_get_next_key = queue_stack_map_get_next_key, - .map_btf_name = "bpf_queue_stack", - .map_btf_id = &queue_map_btf_id, + .map_btf_id = &queue_map_btf_ids[0], }; -static int stack_map_btf_id; const struct bpf_map_ops stack_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = queue_stack_map_alloc_check, @@ -277,6 +276,5 @@ const struct bpf_map_ops stack_map_ops = { .map_pop_elem = stack_map_pop_elem, .map_peek_elem = stack_map_peek_elem, .map_get_next_key = queue_stack_map_get_next_key, - .map_btf_name = "bpf_queue_stack", - .map_btf_id = &stack_map_btf_id, + .map_btf_id = &queue_map_btf_ids[0], }; diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 8251243022a2..e2618fb5870e 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -6,6 +6,7 @@ #include <linux/err.h> #include <linux/sock_diag.h> #include <net/sock_reuseport.h> +#include <linux/btf_ids.h> struct reuseport_array { struct bpf_map map; @@ -337,7 +338,7 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key, return 0; } -static int reuseport_array_map_btf_id; +BTF_ID_LIST_SINGLE(reuseport_array_map_btf_ids, struct, reuseport_array) const struct bpf_map_ops reuseport_array_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = reuseport_array_alloc_check, @@ -346,6 +347,5 @@ const struct bpf_map_ops reuseport_array_ops = { .map_lookup_elem = reuseport_array_lookup_elem, .map_get_next_key = reuseport_array_get_next_key, .map_delete_elem = reuseport_array_delete_elem, - .map_btf_name = "reuseport_array", - .map_btf_id = &reuseport_array_map_btf_id, + .map_btf_id = &reuseport_array_map_btf_ids[0], }; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 710ba9de12ce..ded4faeca192 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -10,6 +10,7 @@ #include <linux/poll.h> #include <linux/kmemleak.h> #include <uapi/linux/btf.h> +#include <linux/btf_ids.h> #define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) @@ -263,7 +264,7 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, return 0; } -static int ringbuf_map_btf_id; +BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map) const struct bpf_map_ops ringbuf_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, @@ -274,8 +275,7 @@ const struct bpf_map_ops ringbuf_map_ops = { .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, .map_get_next_key = ringbuf_map_get_next_key, - .map_btf_name = "bpf_ringbuf_map", - .map_btf_id = &ringbuf_map_btf_id, + .map_btf_id = &ringbuf_map_btf_ids[0], }; /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, @@ -404,7 +404,7 @@ BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags) const struct bpf_func_proto bpf_ringbuf_submit_proto = { .func = bpf_ringbuf_submit, .ret_type = RET_VOID, - .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; @@ -417,7 +417,7 @@ BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags) const struct bpf_func_proto bpf_ringbuf_discard_proto = { .func = bpf_ringbuf_discard, .ret_type = RET_VOID, - .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; @@ -475,3 +475,81 @@ const struct bpf_func_proto bpf_ringbuf_query_proto = { .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, }; + +BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags, + struct bpf_dynptr_kern *, ptr) +{ + struct bpf_ringbuf_map *rb_map; + void *sample; + int err; + + if (unlikely(flags)) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + err = bpf_dynptr_check_size(size); + if (err) { + bpf_dynptr_set_null(ptr); + return err; + } + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + + sample = __bpf_ringbuf_reserve(rb_map->rb, size); + if (!sample) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size); + + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = { + .func = bpf_ringbuf_reserve_dynptr, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT, +}; + +BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) +{ + if (!ptr->data) + return 0; + + bpf_ringbuf_commit(ptr->data, flags, false /* discard */); + + bpf_dynptr_set_null(ptr); + + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = { + .func = bpf_ringbuf_submit_dynptr, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) +{ + if (!ptr->data) + return 0; + + bpf_ringbuf_commit(ptr->data, flags, true /* discard */); + + bpf_dynptr_set_null(ptr); + + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = { + .func = bpf_ringbuf_discard_dynptr, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE, + .arg2_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 34725bfa1e97..1adbe67cdb95 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -100,13 +100,11 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) return ERR_PTR(-E2BIG); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); - cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); if (!smap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); - smap->map.value_size = value_size; smap->n_buckets = n_buckets; err = get_callchain_buffers(sysctl_perf_event_max_stack); @@ -656,7 +654,7 @@ static void stack_map_free(struct bpf_map *map) put_callchain_buffers(); } -static int stack_trace_map_btf_id; +BTF_ID_LIST_SINGLE(stack_trace_map_btf_ids, struct, bpf_stack_map) const struct bpf_map_ops stack_trace_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = stack_map_alloc, @@ -666,6 +664,5 @@ const struct bpf_map_ops stack_trace_map_ops = { .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_stack_map", - .map_btf_id = &stack_trace_map_btf_id, + .map_btf_id = &stack_trace_map_btf_ids[0], }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cdaa1152436a..2b69306d3c6e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -6,6 +6,7 @@ #include <linux/bpf_trace.h> #include <linux/bpf_lirc.h> #include <linux/bpf_verifier.h> +#include <linux/bsearch.h> #include <linux/btf.h> #include <linux/syscalls.h> #include <linux/slab.h> @@ -29,6 +30,7 @@ #include <linux/pgtable.h> #include <linux/bpf_lsm.h> #include <linux/poll.h> +#include <linux/sort.h> #include <linux/bpf-netns.h> #include <linux/rcupdate_trace.h> #include <linux/memcontrol.h> @@ -473,14 +475,128 @@ static void bpf_map_release_memcg(struct bpf_map *map) } #endif +static int bpf_map_kptr_off_cmp(const void *a, const void *b) +{ + const struct bpf_map_value_off_desc *off_desc1 = a, *off_desc2 = b; + + if (off_desc1->offset < off_desc2->offset) + return -1; + else if (off_desc1->offset > off_desc2->offset) + return 1; + return 0; +} + +struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset) +{ + /* Since members are iterated in btf_find_field in increasing order, + * offsets appended to kptr_off_tab are in increasing order, so we can + * do bsearch to find exact match. + */ + struct bpf_map_value_off *tab; + + if (!map_value_has_kptrs(map)) + return NULL; + tab = map->kptr_off_tab; + return bsearch(&offset, tab->off, tab->nr_off, sizeof(tab->off[0]), bpf_map_kptr_off_cmp); +} + +void bpf_map_free_kptr_off_tab(struct bpf_map *map) +{ + struct bpf_map_value_off *tab = map->kptr_off_tab; + int i; + + if (!map_value_has_kptrs(map)) + return; + for (i = 0; i < tab->nr_off; i++) { + if (tab->off[i].kptr.module) + module_put(tab->off[i].kptr.module); + btf_put(tab->off[i].kptr.btf); + } + kfree(tab); + map->kptr_off_tab = NULL; +} + +struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map) +{ + struct bpf_map_value_off *tab = map->kptr_off_tab, *new_tab; + int size, i; + + if (!map_value_has_kptrs(map)) + return ERR_PTR(-ENOENT); + size = offsetof(struct bpf_map_value_off, off[tab->nr_off]); + new_tab = kmemdup(tab, size, GFP_KERNEL | __GFP_NOWARN); + if (!new_tab) + return ERR_PTR(-ENOMEM); + /* Do a deep copy of the kptr_off_tab */ + for (i = 0; i < tab->nr_off; i++) { + btf_get(tab->off[i].kptr.btf); + if (tab->off[i].kptr.module && !try_module_get(tab->off[i].kptr.module)) { + while (i--) { + if (tab->off[i].kptr.module) + module_put(tab->off[i].kptr.module); + btf_put(tab->off[i].kptr.btf); + } + kfree(new_tab); + return ERR_PTR(-ENXIO); + } + } + return new_tab; +} + +bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b) +{ + struct bpf_map_value_off *tab_a = map_a->kptr_off_tab, *tab_b = map_b->kptr_off_tab; + bool a_has_kptr = map_value_has_kptrs(map_a), b_has_kptr = map_value_has_kptrs(map_b); + int size; + + if (!a_has_kptr && !b_has_kptr) + return true; + if (a_has_kptr != b_has_kptr) + return false; + if (tab_a->nr_off != tab_b->nr_off) + return false; + size = offsetof(struct bpf_map_value_off, off[tab_a->nr_off]); + return !memcmp(tab_a, tab_b, size); +} + +/* Caller must ensure map_value_has_kptrs is true. Note that this function can + * be called on a map value while the map_value is visible to BPF programs, as + * it ensures the correct synchronization, and we already enforce the same using + * the bpf_kptr_xchg helper on the BPF program side for referenced kptrs. + */ +void bpf_map_free_kptrs(struct bpf_map *map, void *map_value) +{ + struct bpf_map_value_off *tab = map->kptr_off_tab; + unsigned long *btf_id_ptr; + int i; + + for (i = 0; i < tab->nr_off; i++) { + struct bpf_map_value_off_desc *off_desc = &tab->off[i]; + unsigned long old_ptr; + + btf_id_ptr = map_value + off_desc->offset; + if (off_desc->type == BPF_KPTR_UNREF) { + u64 *p = (u64 *)btf_id_ptr; + + WRITE_ONCE(p, 0); + continue; + } + old_ptr = xchg(btf_id_ptr, 0); + off_desc->kptr.dtor((void *)old_ptr); + } +} + /* called from workqueue */ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); security_bpf_map_free(map); + kfree(map->off_arr); bpf_map_release_memcg(map); - /* implementation dependent freeing */ + /* implementation dependent freeing, map_free callback also does + * bpf_map_free_kptr_off_tab, if needed. + */ map->ops->map_free(map); } @@ -640,7 +756,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) int err; if (!map->ops->map_mmap || map_value_has_spin_lock(map) || - map_value_has_timer(map)) + map_value_has_timer(map) || map_value_has_kptrs(map)) return -ENOTSUPP; if (!(vma->vm_flags & VM_SHARED)) @@ -767,6 +883,84 @@ int map_check_no_btf(const struct bpf_map *map, return -ENOTSUPP; } +static int map_off_arr_cmp(const void *_a, const void *_b, const void *priv) +{ + const u32 a = *(const u32 *)_a; + const u32 b = *(const u32 *)_b; + + if (a < b) + return -1; + else if (a > b) + return 1; + return 0; +} + +static void map_off_arr_swap(void *_a, void *_b, int size, const void *priv) +{ + struct bpf_map *map = (struct bpf_map *)priv; + u32 *off_base = map->off_arr->field_off; + u32 *a = _a, *b = _b; + u8 *sz_a, *sz_b; + + sz_a = map->off_arr->field_sz + (a - off_base); + sz_b = map->off_arr->field_sz + (b - off_base); + + swap(*a, *b); + swap(*sz_a, *sz_b); +} + +static int bpf_map_alloc_off_arr(struct bpf_map *map) +{ + bool has_spin_lock = map_value_has_spin_lock(map); + bool has_timer = map_value_has_timer(map); + bool has_kptrs = map_value_has_kptrs(map); + struct bpf_map_off_arr *off_arr; + u32 i; + + if (!has_spin_lock && !has_timer && !has_kptrs) { + map->off_arr = NULL; + return 0; + } + + off_arr = kmalloc(sizeof(*map->off_arr), GFP_KERNEL | __GFP_NOWARN); + if (!off_arr) + return -ENOMEM; + map->off_arr = off_arr; + + off_arr->cnt = 0; + if (has_spin_lock) { + i = off_arr->cnt; + + off_arr->field_off[i] = map->spin_lock_off; + off_arr->field_sz[i] = sizeof(struct bpf_spin_lock); + off_arr->cnt++; + } + if (has_timer) { + i = off_arr->cnt; + + off_arr->field_off[i] = map->timer_off; + off_arr->field_sz[i] = sizeof(struct bpf_timer); + off_arr->cnt++; + } + if (has_kptrs) { + struct bpf_map_value_off *tab = map->kptr_off_tab; + u32 *off = &off_arr->field_off[off_arr->cnt]; + u8 *sz = &off_arr->field_sz[off_arr->cnt]; + + for (i = 0; i < tab->nr_off; i++) { + *off++ = tab->off[i].offset; + *sz++ = sizeof(u64); + } + off_arr->cnt += tab->nr_off; + } + + if (off_arr->cnt == 1) + return 0; + sort_r(off_arr->field_off, off_arr->cnt, sizeof(off_arr->field_off[0]), + map_off_arr_cmp, map_off_arr_swap, map); + return 0; +} + static int map_check_btf(struct bpf_map *map, const struct btf *btf, u32 btf_key_id, u32 btf_value_id) { @@ -820,10 +1014,34 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return -EOPNOTSUPP; } - if (map->ops->map_check_btf) + map->kptr_off_tab = btf_parse_kptrs(btf, value_type); + if (map_value_has_kptrs(map)) { + if (!bpf_capable()) { + ret = -EPERM; + goto free_map_tab; + } + if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) { + ret = -EACCES; + goto free_map_tab; + } + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + } + + if (map->ops->map_check_btf) { ret = map->ops->map_check_btf(map, btf, key_type, value_type); + if (ret < 0) + goto free_map_tab; + } return ret; +free_map_tab: + bpf_map_free_kptr_off_tab(map); + return ret; } #define BPF_MAP_CREATE_LAST_FIELD map_extra @@ -912,10 +1130,14 @@ static int map_create(union bpf_attr *attr) attr->btf_vmlinux_value_type_id; } - err = security_bpf_map_alloc(map); + err = bpf_map_alloc_off_arr(map); if (err) goto free_map; + err = security_bpf_map_alloc(map); + if (err) + goto free_map_off_arr; + err = bpf_map_alloc_id(map); if (err) goto free_map_sec; @@ -938,6 +1160,8 @@ static int map_create(union bpf_attr *attr) free_map_sec: security_bpf_map_free(map); +free_map_off_arr: + kfree(map->off_arr); free_map: btf_put(map->btf); map->ops->map_free(map); @@ -1639,7 +1863,7 @@ static int map_freeze(const union bpf_attr *attr) return PTR_ERR(map); if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || - map_value_has_timer(map)) { + map_value_has_timer(map) || map_value_has_kptrs(map)) { fdput(f); return -ENOTSUPP; } @@ -2640,19 +2864,12 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) } EXPORT_SYMBOL(bpf_link_get_from_fd); -struct bpf_tracing_link { - struct bpf_link link; - enum bpf_attach_type attach_type; - struct bpf_trampoline *trampoline; - struct bpf_prog *tgt_prog; -}; - static void bpf_tracing_link_release(struct bpf_link *link) { struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link); + container_of(link, struct bpf_tracing_link, link.link); - WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog, + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link, tr_link->trampoline)); bpf_trampoline_put(tr_link->trampoline); @@ -2665,7 +2882,7 @@ static void bpf_tracing_link_release(struct bpf_link *link) static void bpf_tracing_link_dealloc(struct bpf_link *link) { struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link); + container_of(link, struct bpf_tracing_link, link.link); kfree(tr_link); } @@ -2674,7 +2891,7 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link); + container_of(link, struct bpf_tracing_link, link.link); seq_printf(seq, "attach_type:\t%d\n", @@ -2685,7 +2902,7 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link); + container_of(link, struct bpf_tracing_link, link.link); info->tracing.attach_type = tr_link->attach_type; bpf_trampoline_unpack_key(tr_link->trampoline->key, @@ -2704,7 +2921,8 @@ static const struct bpf_link_ops bpf_tracing_link_lops = { static int bpf_tracing_prog_attach(struct bpf_prog *prog, int tgt_prog_fd, - u32 btf_id) + u32 btf_id, + u64 bpf_cookie) { struct bpf_link_primer link_primer; struct bpf_prog *tgt_prog = NULL; @@ -2766,9 +2984,10 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, err = -ENOMEM; goto out_put_prog; } - bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING, + bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, &bpf_tracing_link_lops, prog); link->attach_type = prog->expected_attach_type; + link->link.cookie = bpf_cookie; mutex_lock(&prog->aux->dst_mutex); @@ -2836,11 +3055,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, tgt_prog = prog->aux->dst_prog; } - err = bpf_link_prime(&link->link, &link_primer); + err = bpf_link_prime(&link->link.link, &link_primer); if (err) goto out_unlock; - err = bpf_trampoline_link_prog(prog, tr); + err = bpf_trampoline_link_prog(&link->link, tr); if (err) { bpf_link_cleanup(&link_primer); link = NULL; @@ -3030,66 +3249,45 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro } #endif /* CONFIG_PERF_EVENTS */ -#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd - -static int bpf_raw_tracepoint_open(const union bpf_attr *attr) +static int bpf_raw_tp_link_attach(struct bpf_prog *prog, + const char __user *user_tp_name) { struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; struct bpf_raw_event_map *btp; - struct bpf_prog *prog; const char *tp_name; char buf[128]; int err; - if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) - return -EINVAL; - - prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); - if (IS_ERR(prog)) - return PTR_ERR(prog); - switch (prog->type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_EXT: case BPF_PROG_TYPE_LSM: - if (attr->raw_tracepoint.name) { + if (user_tp_name) /* The attach point for this category of programs * should be specified via btf_id during program load. */ - err = -EINVAL; - goto out_put_prog; - } + return -EINVAL; if (prog->type == BPF_PROG_TYPE_TRACING && prog->expected_attach_type == BPF_TRACE_RAW_TP) { tp_name = prog->aux->attach_func_name; break; } - err = bpf_tracing_prog_attach(prog, 0, 0); - if (err >= 0) - return err; - goto out_put_prog; + return bpf_tracing_prog_attach(prog, 0, 0, 0); case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: - if (strncpy_from_user(buf, - u64_to_user_ptr(attr->raw_tracepoint.name), - sizeof(buf) - 1) < 0) { - err = -EFAULT; - goto out_put_prog; - } + if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0) + return -EFAULT; buf[sizeof(buf) - 1] = 0; tp_name = buf; break; default: - err = -EINVAL; - goto out_put_prog; + return -EINVAL; } btp = bpf_get_raw_tracepoint(tp_name); - if (!btp) { - err = -ENOENT; - goto out_put_prog; - } + if (!btp) + return -ENOENT; link = kzalloc(sizeof(*link), GFP_USER); if (!link) { @@ -3116,11 +3314,29 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) out_put_btp: bpf_put_raw_tracepoint(btp); -out_put_prog: - bpf_prog_put(prog); return err; } +#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd + +static int bpf_raw_tracepoint_open(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + int fd; + + if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) + return -EINVAL; + + prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name)); + if (fd < 0) + bpf_prog_put(prog); + return fd; +} + static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, enum bpf_attach_type attach_type) { @@ -3189,7 +3405,13 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_SETSOCKOPT: return BPF_PROG_TYPE_CGROUP_SOCKOPT; case BPF_TRACE_ITER: + case BPF_TRACE_RAW_TP: + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + case BPF_MODIFY_RETURN: return BPF_PROG_TYPE_TRACING; + case BPF_LSM_MAC: + return BPF_PROG_TYPE_LSM; case BPF_SK_LOOKUP: return BPF_PROG_TYPE_SK_LOOKUP; case BPF_XDP: @@ -4246,21 +4468,6 @@ err_put: return err; } -static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr, - struct bpf_prog *prog) -{ - if (attr->link_create.attach_type != prog->expected_attach_type) - return -EINVAL; - - if (prog->expected_attach_type == BPF_TRACE_ITER) - return bpf_iter_link_attach(attr, uattr, prog); - else if (prog->type == BPF_PROG_TYPE_EXT) - return bpf_tracing_prog_attach(prog, - attr->link_create.target_fd, - attr->link_create.target_btf_id); - return -EINVAL; -} - #define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies static int link_create(union bpf_attr *attr, bpfptr_t uattr) { @@ -4282,15 +4489,13 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) switch (prog->type) { case BPF_PROG_TYPE_EXT: - ret = tracing_bpf_link_attach(attr, uattr, prog); - goto out; + break; case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_TRACEPOINT: if (attr->link_create.attach_type != BPF_PERF_EVENT) { ret = -EINVAL; goto out; } - ptype = prog->type; break; case BPF_PROG_TYPE_KPROBE: if (attr->link_create.attach_type != BPF_PERF_EVENT && @@ -4298,7 +4503,6 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = -EINVAL; goto out; } - ptype = prog->type; break; default: ptype = attach_type_to_prog_type(attr->link_create.attach_type); @@ -4309,7 +4513,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) break; } - switch (ptype) { + switch (prog->type) { case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: @@ -4319,8 +4523,27 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: ret = cgroup_bpf_link_attach(attr, prog); break; + case BPF_PROG_TYPE_EXT: + ret = bpf_tracing_prog_attach(prog, + attr->link_create.target_fd, + attr->link_create.target_btf_id, + attr->link_create.tracing.cookie); + break; + case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_TRACING: - ret = tracing_bpf_link_attach(attr, uattr, prog); + if (attr->link_create.attach_type != prog->expected_attach_type) { + ret = -EINVAL; + goto out; + } + if (prog->expected_attach_type == BPF_TRACE_RAW_TP) + ret = bpf_raw_tp_link_attach(prog, NULL); + else if (prog->expected_attach_type == BPF_TRACE_ITER) + ret = bpf_iter_link_attach(attr, uattr, prog); + else + ret = bpf_tracing_prog_attach(prog, + attr->link_create.target_fd, + attr->link_create.target_btf_id, + attr->link_create.tracing.cookie); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_SK_LOOKUP: @@ -4454,6 +4677,25 @@ struct bpf_link *bpf_link_by_id(u32 id) return link; } +struct bpf_link *bpf_link_get_curr_or_next(u32 *id) +{ + struct bpf_link *link; + + spin_lock_bh(&link_idr_lock); +again: + link = idr_get_next(&link_idr, id); + if (link) { + link = bpf_link_inc_not_zero(link); + if (IS_ERR(link)) { + (*id)++; + goto again; + } + } + spin_unlock_bh(&link_idr_lock); + + return link; +} + #define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id static int bpf_link_get_fd_by_id(const union bpf_attr *attr) @@ -4621,9 +4863,21 @@ out_prog_put: static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; + bool capable; int err; - if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) + capable = bpf_capable() || !sysctl_unprivileged_bpf_disabled; + + /* Intent here is for unprivileged_bpf_disabled to block key object + * creation commands for unprivileged users; other actions depend + * of fd availability and access to bpffs, so are dependent on + * object creation success. Capabilities are later verified for + * operations such as load and map create, so even with unprivileged + * BPF disabled, capability checks are still carried out for these + * and other operations. + */ + if (!capable && + (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD)) return -EPERM; err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); @@ -4782,6 +5036,7 @@ static bool syscall_prog_is_valid_access(int off, int size, BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) { struct bpf_prog * __maybe_unused prog; + struct bpf_tramp_run_ctx __maybe_unused run_ctx; switch (cmd) { case BPF_MAP_CREATE: @@ -4809,13 +5064,15 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) return -EINVAL; } - if (!__bpf_prog_enter_sleepable(prog)) { + run_ctx.bpf_cookie = 0; + run_ctx.saved_run_ctx = NULL; + if (!__bpf_prog_enter_sleepable(prog, &run_ctx)) { /* recursion detected */ bpf_prog_put(prog); return -EBUSY; } attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in); - __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */); + __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx); bpf_prog_put(prog); return 0; #endif @@ -4908,3 +5165,90 @@ const struct bpf_verifier_ops bpf_syscall_verifier_ops = { const struct bpf_prog_ops bpf_syscall_prog_ops = { .test_run = bpf_prog_test_run_syscall, }; + +#ifdef CONFIG_SYSCTL +static int bpf_stats_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct static_key *key = (struct static_key *)table->data; + static int saved_val; + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&bpf_stats_enabled_mutex); + val = saved_val; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret && val != saved_val) { + if (val) + static_key_slow_inc(key); + else + static_key_slow_dec(key); + saved_val = val; + } + mutex_unlock(&bpf_stats_enabled_mutex); + return ret; +} + +void __weak unpriv_ebpf_notify(int new_state) +{ +} + +static int bpf_unpriv_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, unpriv_enable = *(int *)table->data; + bool locked_state = unpriv_enable == 1; + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + tmp.data = &unpriv_enable; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + if (locked_state && unpriv_enable != 1) + return -EPERM; + *(int *)table->data = unpriv_enable; + } + + unpriv_ebpf_notify(unpriv_enable); + + return ret; +} + +static struct ctl_table bpf_syscall_table[] = { + { + .procname = "unprivileged_bpf_disabled", + .data = &sysctl_unprivileged_bpf_disabled, + .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), + .mode = 0644, + .proc_handler = bpf_unpriv_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "bpf_stats_enabled", + .data = &bpf_stats_enabled_key.key, + .maxlen = sizeof(bpf_stats_enabled_key), + .mode = 0644, + .proc_handler = bpf_stats_handler, + }, + { } +}; + +static int __init bpf_syscall_sysctl_init(void) +{ + register_sysctl_init("kernel", bpf_syscall_table); + return 0; +} +late_initcall(bpf_syscall_sysctl_init); +#endif /* CONFIG_SYSCTL */ diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index d94696198ef8..8c921799def4 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -99,7 +99,6 @@ static int __task_seq_show(struct seq_file *seq, struct task_struct *task, if (!prog) return 0; - meta.seq = seq; ctx.meta = &meta; ctx.task = task; return bpf_iter_run_prog(prog, &ctx); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index ada97751ae1b..93c7675f0c9e 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -30,9 +30,12 @@ static DEFINE_MUTEX(trampoline_mutex); bool bpf_prog_has_trampoline(const struct bpf_prog *prog) { enum bpf_attach_type eatype = prog->expected_attach_type; + enum bpf_prog_type ptype = prog->type; - return eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || - eatype == BPF_MODIFY_RETURN; + return (ptype == BPF_PROG_TYPE_TRACING && + (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || + eatype == BPF_MODIFY_RETURN)) || + (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); } void *bpf_jit_alloc_exec_page(void) @@ -168,30 +171,30 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) return ret; } -static struct bpf_tramp_progs * +static struct bpf_tramp_links * bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg) { - const struct bpf_prog_aux *aux; - struct bpf_tramp_progs *tprogs; - struct bpf_prog **progs; + struct bpf_tramp_link *link; + struct bpf_tramp_links *tlinks; + struct bpf_tramp_link **links; int kind; *total = 0; - tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); - if (!tprogs) + tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); + if (!tlinks) return ERR_PTR(-ENOMEM); for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { - tprogs[kind].nr_progs = tr->progs_cnt[kind]; + tlinks[kind].nr_links = tr->progs_cnt[kind]; *total += tr->progs_cnt[kind]; - progs = tprogs[kind].progs; + links = tlinks[kind].links; - hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist) { - *ip_arg |= aux->prog->call_get_func_ip; - *progs++ = aux->prog; + hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { + *ip_arg |= link->link.prog->call_get_func_ip; + *links++ = link; } } - return tprogs; + return tlinks; } static void __bpf_tramp_image_put_deferred(struct work_struct *work) @@ -330,14 +333,14 @@ out: static int bpf_trampoline_update(struct bpf_trampoline *tr) { struct bpf_tramp_image *im; - struct bpf_tramp_progs *tprogs; + struct bpf_tramp_links *tlinks; u32 flags = BPF_TRAMP_F_RESTORE_REGS; bool ip_arg = false; int err, total; - tprogs = bpf_trampoline_get_progs(tr, &total, &ip_arg); - if (IS_ERR(tprogs)) - return PTR_ERR(tprogs); + tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg); + if (IS_ERR(tlinks)) + return PTR_ERR(tlinks); if (total == 0) { err = unregister_fentry(tr, tr->cur_image->image); @@ -353,15 +356,15 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) goto out; } - if (tprogs[BPF_TRAMP_FEXIT].nr_progs || - tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs) + if (tlinks[BPF_TRAMP_FEXIT].nr_links || + tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; if (ip_arg) flags |= BPF_TRAMP_F_IP_ARG; err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE, - &tr->func.model, flags, tprogs, + &tr->func.model, flags, tlinks, tr->func.addr); if (err < 0) goto out; @@ -381,7 +384,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) tr->cur_image = im; tr->selector++; out: - kfree(tprogs); + kfree(tlinks); return err; } @@ -407,13 +410,14 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) } } -int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) +int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; + struct bpf_tramp_link *link_exiting; int err = 0; - int cnt; + int cnt = 0, i; - kind = bpf_attach_type_to_tramp(prog); + kind = bpf_attach_type_to_tramp(link->link.prog); mutex_lock(&tr->mutex); if (tr->extension_prog) { /* cannot attach fentry/fexit if extension prog is attached. @@ -422,32 +426,43 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) err = -EBUSY; goto out; } - cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT]; + + for (i = 0; i < BPF_TRAMP_MAX; i++) + cnt += tr->progs_cnt[i]; + if (kind == BPF_TRAMP_REPLACE) { /* Cannot attach extension if fentry/fexit are in use. */ if (cnt) { err = -EBUSY; goto out; } - tr->extension_prog = prog; + tr->extension_prog = link->link.prog; err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, - prog->bpf_func); + link->link.prog->bpf_func); goto out; } - if (cnt >= BPF_MAX_TRAMP_PROGS) { + if (cnt >= BPF_MAX_TRAMP_LINKS) { err = -E2BIG; goto out; } - if (!hlist_unhashed(&prog->aux->tramp_hlist)) { + if (!hlist_unhashed(&link->tramp_hlist)) { + /* prog already linked */ + err = -EBUSY; + goto out; + } + hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) { + if (link_exiting->link.prog != link->link.prog) + continue; /* prog already linked */ err = -EBUSY; goto out; } - hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]); + + hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); tr->progs_cnt[kind]++; err = bpf_trampoline_update(tr); if (err) { - hlist_del_init(&prog->aux->tramp_hlist); + hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; } out: @@ -456,12 +471,12 @@ out: } /* bpf_trampoline_unlink_prog() should never fail. */ -int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) +int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; int err; - kind = bpf_attach_type_to_tramp(prog); + kind = bpf_attach_type_to_tramp(link->link.prog); mutex_lock(&tr->mutex); if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); @@ -470,7 +485,7 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) tr->extension_prog = NULL; goto out; } - hlist_del_init(&prog->aux->tramp_hlist); + hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; err = bpf_trampoline_update(tr); out: @@ -500,16 +515,19 @@ out: void bpf_trampoline_put(struct bpf_trampoline *tr) { + int i; + if (!tr) return; mutex_lock(&trampoline_mutex); if (!refcount_dec_and_test(&tr->refcnt)) goto out; WARN_ON_ONCE(mutex_is_locked(&tr->mutex)); - if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY]))) - goto out; - if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) - goto out; + + for (i = 0; i < BPF_TRAMP_MAX; i++) + if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[i]))) + goto out; + /* This code will be executed even when the last bpf_tramp_image * is alive. All progs are detached from the trampoline and the * trampoline image is patched with jmp into epilogue to skip @@ -559,11 +577,14 @@ static void notrace inc_misses_counter(struct bpf_prog *prog) * [2..MAX_U64] - execute bpf prog and record execution time. * This is start time. */ -u64 notrace __bpf_prog_enter(struct bpf_prog *prog) +u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { rcu_read_lock(); migrate_disable(); + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { inc_misses_counter(prog); return 0; @@ -593,29 +614,38 @@ static void notrace update_prog_stats(struct bpf_prog *prog, } } -void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx) __releases(RCU) { + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + update_prog_stats(prog, start); __this_cpu_dec(*(prog->active)); migrate_enable(); rcu_read_unlock(); } -u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog) +u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) { rcu_read_lock_trace(); migrate_disable(); might_fault(); + if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { inc_misses_counter(prog); return 0; } + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + return bpf_prog_start_time(); } -void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start) +void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) { + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + update_prog_stats(prog, start); __this_cpu_dec(*(prog->active)); migrate_enable(); @@ -635,7 +665,7 @@ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr) int __weak arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_progs *tprogs, + struct bpf_tramp_links *tlinks, void *orig_call) { return -ENOTSUPP; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d175b70067b3..aedac2ac02b9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -187,6 +187,9 @@ struct bpf_verifier_stack_elem { POISON_POINTER_DELTA)) #define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV)) +static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx); +static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); + static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) { return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON; @@ -245,6 +248,7 @@ struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; bool pkt_access; + u8 release_regno; int regno; int access_size; int mem_size; @@ -257,6 +261,8 @@ struct bpf_call_arg_meta { struct btf *ret_btf; u32 ret_btf_id; u32 subprogno; + struct bpf_map_value_off_desc *kptr_off_desc; + u8 uninit_dynptr_regno; }; struct btf *btf_vmlinux; @@ -471,17 +477,6 @@ static bool type_may_be_null(u32 type) return type & PTR_MAYBE_NULL; } -/* Determine whether the function releases some resources allocated by another - * function call. The first reference type argument will be assumed to be - * released by release_reference(). - */ -static bool is_release_function(enum bpf_func_id func_id) -{ - return func_id == BPF_FUNC_sk_release || - func_id == BPF_FUNC_ringbuf_submit || - func_id == BPF_FUNC_ringbuf_discard; -} - static bool may_be_acquire_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_lookup_tcp || @@ -499,7 +494,8 @@ static bool is_acquire_function(enum bpf_func_id func_id, if (func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || func_id == BPF_FUNC_skc_lookup_tcp || - func_id == BPF_FUNC_ringbuf_reserve) + func_id == BPF_FUNC_ringbuf_reserve || + func_id == BPF_FUNC_kptr_xchg) return true; if (func_id == BPF_FUNC_map_lookup_elem && @@ -517,6 +513,7 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) func_id == BPF_FUNC_skc_to_tcp_sock || func_id == BPF_FUNC_skc_to_tcp6_sock || func_id == BPF_FUNC_skc_to_udp6_sock || + func_id == BPF_FUNC_skc_to_mptcp_sock || func_id == BPF_FUNC_skc_to_tcp_timewait_sock || func_id == BPF_FUNC_skc_to_tcp_request_sock; } @@ -575,6 +572,8 @@ static const char *reg_type_str(struct bpf_verifier_env *env, strncpy(prefix, "user_", 32); if (type & MEM_PERCPU) strncpy(prefix, "percpu_", 32); + if (type & PTR_UNTRUSTED) + strncpy(prefix, "untrusted_", 32); snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", prefix, str[base_type(type)], postfix); @@ -586,6 +585,7 @@ static char slot_type_char[] = { [STACK_SPILL] = 'r', [STACK_MISC] = 'm', [STACK_ZERO] = '0', + [STACK_DYNPTR] = 'd', }; static void print_liveness(struct bpf_verifier_env *env, @@ -601,6 +601,25 @@ static void print_liveness(struct bpf_verifier_env *env, verbose(env, "D"); } +static int get_spi(s32 off) +{ + return (-off - 1) / BPF_REG_SIZE; +} + +static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots) +{ + int allocated_slots = state->allocated_stack / BPF_REG_SIZE; + + /* We need to check that slots between [spi - nr_slots + 1, spi] are + * within [0, allocated_stack). + * + * Please note that the spi grows downwards. For example, a dynptr + * takes the size of two stack slots; the first slot will be at + * spi and the second slot will be at spi - 1. + */ + return spi - nr_slots + 1 >= 0 && spi < allocated_slots; +} + static struct bpf_func_state *func(struct bpf_verifier_env *env, const struct bpf_reg_state *reg) { @@ -652,6 +671,132 @@ static void mark_verifier_state_scratched(struct bpf_verifier_env *env) env->scratched_stack_slots = ~0ULL; } +static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) +{ + switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { + case DYNPTR_TYPE_LOCAL: + return BPF_DYNPTR_TYPE_LOCAL; + case DYNPTR_TYPE_RINGBUF: + return BPF_DYNPTR_TYPE_RINGBUF; + default: + return BPF_DYNPTR_TYPE_INVALID; + } +} + +static bool dynptr_type_refcounted(enum bpf_dynptr_type type) +{ + return type == BPF_DYNPTR_TYPE_RINGBUF; +} + +static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + enum bpf_arg_type arg_type, int insn_idx) +{ + struct bpf_func_state *state = func(env, reg); + enum bpf_dynptr_type type; + int spi, i, id; + + spi = get_spi(reg->off); + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) + return -EINVAL; + + for (i = 0; i < BPF_REG_SIZE; i++) { + state->stack[spi].slot_type[i] = STACK_DYNPTR; + state->stack[spi - 1].slot_type[i] = STACK_DYNPTR; + } + + type = arg_to_dynptr_type(arg_type); + if (type == BPF_DYNPTR_TYPE_INVALID) + return -EINVAL; + + state->stack[spi].spilled_ptr.dynptr.first_slot = true; + state->stack[spi].spilled_ptr.dynptr.type = type; + state->stack[spi - 1].spilled_ptr.dynptr.type = type; + + if (dynptr_type_refcounted(type)) { + /* The id is used to track proper releasing */ + id = acquire_reference_state(env, insn_idx); + if (id < 0) + return id; + + state->stack[spi].spilled_ptr.id = id; + state->stack[spi - 1].spilled_ptr.id = id; + } + + return 0; +} + +static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi, i; + + spi = get_spi(reg->off); + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) + return -EINVAL; + + for (i = 0; i < BPF_REG_SIZE; i++) { + state->stack[spi].slot_type[i] = STACK_INVALID; + state->stack[spi - 1].slot_type[i] = STACK_INVALID; + } + + /* Invalidate any slices associated with this dynptr */ + if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { + release_reference(env, state->stack[spi].spilled_ptr.id); + state->stack[spi].spilled_ptr.id = 0; + state->stack[spi - 1].spilled_ptr.id = 0; + } + + state->stack[spi].spilled_ptr.dynptr.first_slot = false; + state->stack[spi].spilled_ptr.dynptr.type = 0; + state->stack[spi - 1].spilled_ptr.dynptr.type = 0; + + return 0; +} + +static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi = get_spi(reg->off); + int i; + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) + return true; + + for (i = 0; i < BPF_REG_SIZE; i++) { + if (state->stack[spi].slot_type[i] == STACK_DYNPTR || + state->stack[spi - 1].slot_type[i] == STACK_DYNPTR) + return false; + } + + return true; +} + +static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + enum bpf_arg_type arg_type) +{ + struct bpf_func_state *state = func(env, reg); + int spi = get_spi(reg->off); + int i; + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || + !state->stack[spi].spilled_ptr.dynptr.first_slot) + return false; + + for (i = 0; i < BPF_REG_SIZE; i++) { + if (state->stack[spi].slot_type[i] != STACK_DYNPTR || + state->stack[spi - 1].slot_type[i] != STACK_DYNPTR) + return false; + } + + /* ARG_PTR_TO_DYNPTR takes any type of dynptr */ + if (arg_type == ARG_PTR_TO_DYNPTR) + return true; + + return state->stack[spi].spilled_ptr.dynptr.type == arg_to_dynptr_type(arg_type); +} + /* The reg state of a pointer or a bounded scalar was saved when * it was spilled to the stack. */ @@ -1821,8 +1966,7 @@ void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab) kfree(tab); } -static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, - u32 func_id, s16 offset) +static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset) { if (offset) { if (offset < 0) { @@ -1897,7 +2041,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) prog_aux->kfunc_btf_tab = btf_tab; } - desc_btf = find_kfunc_desc_btf(env, func_id, offset); + desc_btf = find_kfunc_desc_btf(env, offset); if (IS_ERR(desc_btf)) { verbose(env, "failed to find BTF for kernel function\n"); return PTR_ERR(desc_btf); @@ -2366,7 +2510,7 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL) return NULL; - desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off); + desc_btf = find_kfunc_desc_btf(data, insn->off); if (IS_ERR(desc_btf)) return "<error>"; @@ -3211,7 +3355,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, return 0; } -enum stack_access_src { +enum bpf_access_src { ACCESS_DIRECT = 1, /* the access is performed by an instruction */ ACCESS_HELPER = 2, /* the access is performed by a helper */ }; @@ -3219,7 +3363,7 @@ enum stack_access_src { static int check_stack_range_initialized(struct bpf_verifier_env *env, int regno, int off, int access_size, bool zero_size_allowed, - enum stack_access_src type, + enum bpf_access_src type, struct bpf_call_arg_meta *meta); static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) @@ -3469,9 +3613,175 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, return 0; } +static int __check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno, + bool fixed_off_ok) +{ + /* Access to this pointer-typed register or passing it to a helper + * is only allowed in its original, unmodified form. + */ + + if (reg->off < 0) { + verbose(env, "negative offset %s ptr R%d off=%d disallowed\n", + reg_type_str(env, reg->type), regno, reg->off); + return -EACCES; + } + + if (!fixed_off_ok && reg->off) { + verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", + reg_type_str(env, reg->type), regno, reg->off); + return -EACCES; + } + + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "variable %s access var_off=%s disallowed\n", + reg_type_str(env, reg->type), tn_buf); + return -EACCES; + } + + return 0; +} + +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) +{ + return __check_ptr_off_reg(env, reg, regno, false); +} + +static int map_kptr_match_type(struct bpf_verifier_env *env, + struct bpf_map_value_off_desc *off_desc, + struct bpf_reg_state *reg, u32 regno) +{ + const char *targ_name = kernel_type_name(off_desc->kptr.btf, off_desc->kptr.btf_id); + int perm_flags = PTR_MAYBE_NULL; + const char *reg_name = ""; + + /* Only unreferenced case accepts untrusted pointers */ + if (off_desc->type == BPF_KPTR_UNREF) + perm_flags |= PTR_UNTRUSTED; + + if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags)) + goto bad_type; + + if (!btf_is_kernel(reg->btf)) { + verbose(env, "R%d must point to kernel BTF\n", regno); + return -EINVAL; + } + /* We need to verify reg->type and reg->btf, before accessing reg->btf */ + reg_name = kernel_type_name(reg->btf, reg->btf_id); + + /* For ref_ptr case, release function check should ensure we get one + * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the + * normal store of unreferenced kptr, we must ensure var_off is zero. + * Since ref_ptr cannot be accessed directly by BPF insns, checks for + * reg->off and reg->ref_obj_id are not needed here. + */ + if (__check_ptr_off_reg(env, reg, regno, true)) + return -EACCES; + + /* A full type match is needed, as BTF can be vmlinux or module BTF, and + * we also need to take into account the reg->off. + * + * We want to support cases like: + * + * struct foo { + * struct bar br; + * struct baz bz; + * }; + * + * struct foo *v; + * v = func(); // PTR_TO_BTF_ID + * val->foo = v; // reg->off is zero, btf and btf_id match type + * val->bar = &v->br; // reg->off is still zero, but we need to retry with + * // first member type of struct after comparison fails + * val->baz = &v->bz; // reg->off is non-zero, so struct needs to be walked + * // to match type + * + * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off + * is zero. We must also ensure that btf_struct_ids_match does not walk + * the struct to match type against first member of struct, i.e. reject + * second case from above. Hence, when type is BPF_KPTR_REF, we set + * strict mode to true for type match. + */ + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, + off_desc->kptr.btf, off_desc->kptr.btf_id, + off_desc->type == BPF_KPTR_REF)) + goto bad_type; + return 0; +bad_type: + verbose(env, "invalid kptr access, R%d type=%s%s ", regno, + reg_type_str(env, reg->type), reg_name); + verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name); + if (off_desc->type == BPF_KPTR_UNREF) + verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED), + targ_name); + else + verbose(env, "\n"); + return -EINVAL; +} + +static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, + int value_regno, int insn_idx, + struct bpf_map_value_off_desc *off_desc) +{ + struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; + int class = BPF_CLASS(insn->code); + struct bpf_reg_state *val_reg; + + /* Things we already checked for in check_map_access and caller: + * - Reject cases where variable offset may touch kptr + * - size of access (must be BPF_DW) + * - tnum_is_const(reg->var_off) + * - off_desc->offset == off + reg->var_off.value + */ + /* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */ + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose(env, "kptr in map can only be accessed using BPF_MEM instruction mode\n"); + return -EACCES; + } + + /* We only allow loading referenced kptr, since it will be marked as + * untrusted, similar to unreferenced kptr. + */ + if (class != BPF_LDX && off_desc->type == BPF_KPTR_REF) { + verbose(env, "store to referenced kptr disallowed\n"); + return -EACCES; + } + + if (class == BPF_LDX) { + val_reg = reg_state(env, value_regno); + /* We can simply mark the value_regno receiving the pointer + * value from map as PTR_TO_BTF_ID, with the correct type. + */ + mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, off_desc->kptr.btf, + off_desc->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED); + /* For mark_ptr_or_null_reg */ + val_reg->id = ++env->id_gen; + } else if (class == BPF_STX) { + val_reg = reg_state(env, value_regno); + if (!register_is_null(val_reg) && + map_kptr_match_type(env, off_desc, val_reg, value_regno)) + return -EACCES; + } else if (class == BPF_ST) { + if (insn->imm) { + verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n", + off_desc->offset); + return -EACCES; + } + } else { + verbose(env, "kptr in map can only be accessed using BPF_LDX/BPF_STX/BPF_ST\n"); + return -EACCES; + } + return 0; +} + /* check read/write into a map element with possible variable offset */ static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size, bool zero_size_allowed) + int off, int size, bool zero_size_allowed, + enum bpf_access_src src) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; @@ -3507,6 +3817,36 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, return -EACCES; } } + if (map_value_has_kptrs(map)) { + struct bpf_map_value_off *tab = map->kptr_off_tab; + int i; + + for (i = 0; i < tab->nr_off; i++) { + u32 p = tab->off[i].offset; + + if (reg->smin_value + off < p + sizeof(u64) && + p < reg->umax_value + off + size) { + if (src != ACCESS_DIRECT) { + verbose(env, "kptr cannot be accessed indirectly by helper\n"); + return -EACCES; + } + if (!tnum_is_const(reg->var_off)) { + verbose(env, "kptr access cannot have variable offset\n"); + return -EACCES; + } + if (p != off + reg->var_off.value) { + verbose(env, "kptr access misaligned expected=%u off=%llu\n", + p, off + reg->var_off.value); + return -EACCES; + } + if (size != bpf_size_to_bytes(BPF_DW)) { + verbose(env, "kptr access size must be BPF_DW\n"); + return -EACCES; + } + break; + } + } + } return err; } @@ -3980,44 +4320,6 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif -static int __check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno, - bool fixed_off_ok) -{ - /* Access to this pointer-typed register or passing it to a helper - * is only allowed in its original, unmodified form. - */ - - if (reg->off < 0) { - verbose(env, "negative offset %s ptr R%d off=%d disallowed\n", - reg_type_str(env, reg->type), regno, reg->off); - return -EACCES; - } - - if (!fixed_off_ok && reg->off) { - verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", - reg_type_str(env, reg->type), regno, reg->off); - return -EACCES; - } - - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable %s access var_off=%s disallowed\n", - reg_type_str(env, reg->type), tn_buf); - return -EACCES; - } - - return 0; -} - -int check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) -{ - return __check_ptr_off_reg(env, reg, regno, false); -} - static int __check_buffer_access(struct bpf_verifier_env *env, const char *buf_info, const struct bpf_reg_state *reg, @@ -4224,6 +4526,12 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (ret < 0) return ret; + /* If this is an untrusted pointer, all pointers formed by walking it + * also inherit the untrusted flag. + */ + if (type_flag(reg->type) & PTR_UNTRUSTED) + flag |= PTR_UNTRUSTED; + if (atype == BPF_READ && value_regno >= 0) mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); @@ -4316,7 +4624,7 @@ static int check_stack_slot_within_bounds(int off, static int check_stack_access_within_bounds( struct bpf_verifier_env *env, int regno, int off, int access_size, - enum stack_access_src src, enum bpf_access_type type) + enum bpf_access_src src, enum bpf_access_type type) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; @@ -4412,6 +4720,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_MAP_VALUE) { + struct bpf_map_value_off_desc *kptr_off_desc = NULL; + if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into map\n", value_regno); @@ -4420,8 +4730,16 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_map_access_type(env, regno, off, size, t); if (err) return err; - err = check_map_access(env, regno, off, size, false); - if (!err && t == BPF_READ && value_regno >= 0) { + err = check_map_access(env, regno, off, size, false, ACCESS_DIRECT); + if (err) + return err; + if (tnum_is_const(reg->var_off)) + kptr_off_desc = bpf_map_kptr_off_contains(reg->map_ptr, + off + reg->var_off.value); + if (kptr_off_desc) { + err = check_map_kptr_access(env, regno, value_regno, insn_idx, + kptr_off_desc); + } else if (t == BPF_READ && value_regno >= 0) { struct bpf_map *map = reg->map_ptr; /* if map is read-only, track its contents as scalars */ @@ -4724,7 +5042,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i static int check_stack_range_initialized( struct bpf_verifier_env *env, int regno, int off, int access_size, bool zero_size_allowed, - enum stack_access_src type, struct bpf_call_arg_meta *meta) + enum bpf_access_src type, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); @@ -4861,6 +5179,11 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); case PTR_TO_MAP_KEY: + if (meta && meta->raw_mode) { + verbose(env, "R%d cannot write into %s\n", regno, + reg_type_str(env, reg->type)); + return -EACCES; + } return check_mem_region_access(env, regno, reg->off, access_size, reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: @@ -4869,15 +5192,25 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, BPF_READ)) return -EACCES; return check_map_access(env, regno, reg->off, access_size, - zero_size_allowed); + zero_size_allowed, ACCESS_HELPER); case PTR_TO_MEM: + if (type_is_rdonly_mem(reg->type)) { + if (meta && meta->raw_mode) { + verbose(env, "R%d cannot write into %s\n", regno, + reg_type_str(env, reg->type)); + return -EACCES; + } + } return check_mem_region_access(env, regno, reg->off, access_size, reg->mem_size, zero_size_allowed); case PTR_TO_BUF: if (type_is_rdonly_mem(reg->type)) { - if (meta && meta->raw_mode) + if (meta && meta->raw_mode) { + verbose(env, "R%d cannot write into %s\n", regno, + reg_type_str(env, reg->type)); return -EACCES; + } max_access = &env->prog->aux->max_rdonly_access; } else { @@ -4919,8 +5252,7 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, * out. Only upper bounds can be learned because retval is an * int type and negative retvals are allowed. */ - if (meta) - meta->msize_max_value = reg->umax_value; + meta->msize_max_value = reg->umax_value; /* The register is SCALAR_VALUE; the access check * happens using its boundaries. @@ -4963,24 +5295,33 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size) { + bool may_be_null = type_may_be_null(reg->type); + struct bpf_reg_state saved_reg; + struct bpf_call_arg_meta meta; + int err; + if (register_is_null(reg)) return 0; - if (type_may_be_null(reg->type)) { - /* Assuming that the register contains a value check if the memory - * access is safe. Temporarily save and restore the register's state as - * the conversion shouldn't be visible to a caller. - */ - const struct bpf_reg_state saved_reg = *reg; - int rv; - + memset(&meta, 0, sizeof(meta)); + /* Assuming that the register contains a value check if the memory + * access is safe. Temporarily save and restore the register's state as + * the conversion shouldn't be visible to a caller. + */ + if (may_be_null) { + saved_reg = *reg; mark_ptr_not_null_reg(reg); - rv = check_helper_mem_access(env, regno, mem_size, true, NULL); - *reg = saved_reg; - return rv; } - return check_helper_mem_access(env, regno, mem_size, true, NULL); + err = check_helper_mem_access(env, regno, mem_size, true, &meta); + /* Check access for BPF_WRITE */ + meta.raw_mode = true; + err = err ?: check_helper_mem_access(env, regno, mem_size, true, &meta); + + if (may_be_null) + *reg = saved_reg; + + return err; } int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, @@ -4989,16 +5330,22 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1]; bool may_be_null = type_may_be_null(mem_reg->type); struct bpf_reg_state saved_reg; + struct bpf_call_arg_meta meta; int err; WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5); + memset(&meta, 0, sizeof(meta)); + if (may_be_null) { saved_reg = *mem_reg; mark_ptr_not_null_reg(mem_reg); } - err = check_mem_size_reg(env, reg, regno, true, NULL); + err = check_mem_size_reg(env, reg, regno, true, &meta); + /* Check access for BPF_WRITE */ + meta.raw_mode = true; + err = err ?: check_mem_size_reg(env, reg, regno, true, &meta); if (may_be_null) *mem_reg = saved_reg; @@ -5134,10 +5481,51 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, return 0; } -static bool arg_type_is_mem_ptr(enum bpf_arg_type type) +static int process_kptr_func(struct bpf_verifier_env *env, int regno, + struct bpf_call_arg_meta *meta) { - return base_type(type) == ARG_PTR_TO_MEM || - base_type(type) == ARG_PTR_TO_UNINIT_MEM; + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_map_value_off_desc *off_desc; + struct bpf_map *map_ptr = reg->map_ptr; + u32 kptr_off; + int ret; + + if (!tnum_is_const(reg->var_off)) { + verbose(env, + "R%d doesn't have constant offset. kptr has to be at the constant offset\n", + regno); + return -EINVAL; + } + if (!map_ptr->btf) { + verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n", + map_ptr->name); + return -EINVAL; + } + if (!map_value_has_kptrs(map_ptr)) { + ret = PTR_ERR_OR_ZERO(map_ptr->kptr_off_tab); + if (ret == -E2BIG) + verbose(env, "map '%s' has more than %d kptr\n", map_ptr->name, + BPF_MAP_VALUE_OFF_MAX); + else if (ret == -EEXIST) + verbose(env, "map '%s' has repeating kptr BTF tags\n", map_ptr->name); + else + verbose(env, "map '%s' has no valid kptr\n", map_ptr->name); + return -EINVAL; + } + + meta->map_ptr = map_ptr; + kptr_off = reg->off + reg->var_off.value; + off_desc = bpf_map_kptr_off_contains(map_ptr, kptr_off); + if (!off_desc) { + verbose(env, "off=%d doesn't point to kptr\n", kptr_off); + return -EACCES; + } + if (off_desc->type != BPF_KPTR_REF) { + verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off); + return -EACCES; + } + meta->kptr_off_desc = off_desc; + return 0; } static bool arg_type_is_mem_size(enum bpf_arg_type type) @@ -5157,6 +5545,16 @@ static bool arg_type_is_int_ptr(enum bpf_arg_type type) type == ARG_PTR_TO_LONG; } +static bool arg_type_is_release(enum bpf_arg_type type) +{ + return type & OBJ_RELEASE; +} + +static bool arg_type_is_dynptr(enum bpf_arg_type type) +{ + return base_type(type) == ARG_PTR_TO_DYNPTR; +} + static int int_ptr_type_to_size(enum bpf_arg_type type) { if (type == ARG_PTR_TO_INT) @@ -5269,11 +5667,11 @@ static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types, - [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types, [ARG_CONST_SIZE] = &scalar_types, [ARG_CONST_SIZE_OR_ZERO] = &scalar_types, [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types, @@ -5287,7 +5685,6 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, [ARG_PTR_TO_MEM] = &mem_types, - [ARG_PTR_TO_UNINIT_MEM] = &mem_types, [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types, [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, @@ -5296,11 +5693,14 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_STACK] = &stack_ptr_types, [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, [ARG_PTR_TO_TIMER] = &timer_types, + [ARG_PTR_TO_KPTR] = &kptr_types, + [ARG_PTR_TO_DYNPTR] = &stack_ptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, - const u32 *arg_btf_id) + const u32 *arg_btf_id, + struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_reg_type expected, type = reg->type; @@ -5345,6 +5745,13 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, found: if (reg->type == PTR_TO_BTF_ID) { + /* For bpf_sk_release, it needs to match against first member + * 'struct sock_common', hence make an exception for it. This + * allows bpf_sk_release to work for multiple socket types. + */ + bool strict_type_match = arg_type_is_release(arg_type) && + meta->func_id != BPF_FUNC_sk_release; + if (!arg_btf_id) { if (!compatible->btf_id) { verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); @@ -5353,8 +5760,12 @@ found: arg_btf_id = compatible->btf_id; } - if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, - btf_vmlinux, *arg_btf_id)) { + if (meta->func_id == BPF_FUNC_kptr_xchg) { + if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno)) + return -EACCES; + } else if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, + btf_vmlinux, *arg_btf_id, + strict_type_match)) { verbose(env, "R%d is of type %s but %s is expected\n", regno, kernel_type_name(reg->btf, reg->btf_id), kernel_type_name(btf_vmlinux, *arg_btf_id)); @@ -5367,15 +5778,19 @@ found: int check_func_arg_reg_off(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, - enum bpf_arg_type arg_type, - bool is_release_func) + enum bpf_arg_type arg_type) { - bool fixed_off_ok = false, release_reg; enum bpf_reg_type type = reg->type; + bool fixed_off_ok = false; switch ((u32)type) { - case SCALAR_VALUE: /* Pointer types where reg offset is explicitly allowed: */ + case PTR_TO_STACK: + if (arg_type_is_dynptr(arg_type) && reg->off % BPF_REG_SIZE) { + verbose(env, "cannot pass in dynptr at an offset\n"); + return -EINVAL; + } + fallthrough; case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_MAP_KEY: @@ -5385,11 +5800,11 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_MEM | MEM_ALLOC: case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: - case PTR_TO_STACK: + case SCALAR_VALUE: /* Some of the argument types nevertheless require a * zero register offset. */ - if (arg_type != ARG_PTR_TO_ALLOC_MEM) + if (base_type(arg_type) != ARG_PTR_TO_ALLOC_MEM) return 0; break; /* All the rest must be rejected, except PTR_TO_BTF_ID which allows @@ -5397,19 +5812,17 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, */ case PTR_TO_BTF_ID: /* When referenced PTR_TO_BTF_ID is passed to release function, - * it's fixed offset must be 0. We rely on the property that - * only one referenced register can be passed to BPF helpers and - * kfuncs. In the other cases, fixed offset can be non-zero. + * it's fixed offset must be 0. In the other cases, fixed offset + * can be non-zero. */ - release_reg = is_release_func && reg->ref_obj_id; - if (release_reg && reg->off) { + if (arg_type_is_release(arg_type) && reg->off) { verbose(env, "R%d must have zero offset when passed to release func\n", regno); return -EINVAL; } - /* For release_reg == true, fixed_off_ok must be false, but we - * already checked and rejected reg->off != 0 above, so set to - * true to allow fixed offset for all other cases. + /* For arg is release pointer, fixed_off_ok must be false, but + * we already checked and rejected reg->off != 0 above, so set + * to true to allow fixed offset for all other cases. */ fixed_off_ok = true; break; @@ -5419,6 +5832,14 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, return __check_ptr_off_reg(env, reg, regno, fixed_off_ok); } +static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi = get_spi(reg->off); + + return state->stack[spi].spilled_ptr.id; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, const struct bpf_func_proto *fn) @@ -5451,8 +5872,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return -EACCES; } - if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || - base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { + if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) { err = resolve_map_arg_type(env, meta, &arg_type); if (err) return err; @@ -5464,18 +5884,37 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, */ goto skip_type_check; - err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg]); + err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg], meta); if (err) return err; - err = check_func_arg_reg_off(env, reg, regno, arg_type, is_release_function(meta->func_id)); + err = check_func_arg_reg_off(env, reg, regno, arg_type); if (err) return err; skip_type_check: - /* check_func_arg_reg_off relies on only one referenced register being - * allowed for BPF helpers. - */ + if (arg_type_is_release(arg_type)) { + if (arg_type_is_dynptr(arg_type)) { + struct bpf_func_state *state = func(env, reg); + int spi = get_spi(reg->off); + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || + !state->stack[spi].spilled_ptr.id) { + verbose(env, "arg %d is an unacquired reference\n", regno); + return -EINVAL; + } + } else if (!reg->ref_obj_id && !register_is_null(reg)) { + verbose(env, "R%d must be referenced when passed to release function\n", + regno); + return -EINVAL; + } + if (meta->release_regno) { + verbose(env, "verifier internal error: more than one release argument\n"); + return -EFAULT; + } + meta->release_regno = regno; + } + if (reg->ref_obj_id) { if (meta->ref_obj_id) { verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", @@ -5528,8 +5967,7 @@ skip_type_check: err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, false, NULL); - } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || - base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { + } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) { if (type_may_be_null(arg_type) && register_is_null(reg)) return 0; @@ -5541,7 +5979,7 @@ skip_type_check: verbose(env, "invalid map_ptr to access map->value\n"); return -EACCES; } - meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE); + meta->raw_mode = arg_type & MEM_UNINIT; err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, false, meta); @@ -5568,15 +6006,49 @@ skip_type_check: return -EACCES; } else if (arg_type == ARG_PTR_TO_FUNC) { meta->subprogno = reg->subprogno; - } else if (arg_type_is_mem_ptr(arg_type)) { + } else if (base_type(arg_type) == ARG_PTR_TO_MEM) { /* The access to this pointer is only checked when we hit the * next is_mem_size argument below. */ - meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MEM); + meta->raw_mode = arg_type & MEM_UNINIT; } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta); + } else if (arg_type_is_dynptr(arg_type)) { + if (arg_type & MEM_UNINIT) { + if (!is_dynptr_reg_valid_uninit(env, reg)) { + verbose(env, "Dynptr has to be an uninitialized dynptr\n"); + return -EINVAL; + } + + /* We only support one dynptr being uninitialized at the moment, + * which is sufficient for the helper functions we have right now. + */ + if (meta->uninit_dynptr_regno) { + verbose(env, "verifier internal error: multiple uninitialized dynptr args\n"); + return -EFAULT; + } + + meta->uninit_dynptr_regno = regno; + } else if (!is_dynptr_reg_valid_init(env, reg, arg_type)) { + const char *err_extra = ""; + + switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { + case DYNPTR_TYPE_LOCAL: + err_extra = "local "; + break; + case DYNPTR_TYPE_RINGBUF: + err_extra = "ringbuf "; + break; + default: + break; + } + + verbose(env, "Expected an initialized %sdynptr as arg #%d\n", + err_extra, arg + 1); + return -EINVAL; + } } else if (arg_type_is_alloc_size(arg_type)) { if (!tnum_is_const(reg->var_off)) { verbose(env, "R%d is not a known constant'\n", @@ -5613,7 +6085,8 @@ skip_type_check: } err = check_map_access(env, regno, reg->off, - map->value_size - reg->off, false); + map->value_size - reg->off, false, + ACCESS_HELPER); if (err) return err; @@ -5629,6 +6102,9 @@ skip_type_check: verbose(env, "string is not zero-terminated\n"); return -EINVAL; } + } else if (arg_type == ARG_PTR_TO_KPTR) { + if (process_kptr_func(env, regno, meta)) + return -EACCES; } return err; @@ -5694,7 +6170,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_RINGBUF: if (func_id != BPF_FUNC_ringbuf_output && func_id != BPF_FUNC_ringbuf_reserve && - func_id != BPF_FUNC_ringbuf_query) + func_id != BPF_FUNC_ringbuf_query && + func_id != BPF_FUNC_ringbuf_reserve_dynptr && + func_id != BPF_FUNC_ringbuf_submit_dynptr && + func_id != BPF_FUNC_ringbuf_discard_dynptr) goto error; break; case BPF_MAP_TYPE_STACK_TRACE: @@ -5810,6 +6289,9 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_ringbuf_output: case BPF_FUNC_ringbuf_reserve: case BPF_FUNC_ringbuf_query: + case BPF_FUNC_ringbuf_reserve_dynptr: + case BPF_FUNC_ringbuf_submit_dynptr: + case BPF_FUNC_ringbuf_discard_dynptr: if (map->map_type != BPF_MAP_TYPE_RINGBUF) goto error; break; @@ -5864,6 +6346,12 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, map->map_type != BPF_MAP_TYPE_BLOOM_FILTER) goto error; break; + case BPF_FUNC_map_lookup_percpu_elem: + if (map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY && + map->map_type != BPF_MAP_TYPE_PERCPU_HASH && + map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH) + goto error; + break; case BPF_FUNC_sk_storage_get: case BPF_FUNC_sk_storage_delete: if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) @@ -5915,10 +6403,8 @@ static bool check_raw_mode_ok(const struct bpf_func_proto *fn) static bool check_args_pair_invalid(enum bpf_arg_type arg_curr, enum bpf_arg_type arg_next) { - return (arg_type_is_mem_ptr(arg_curr) && - !arg_type_is_mem_size(arg_next)) || - (!arg_type_is_mem_ptr(arg_curr) && - arg_type_is_mem_size(arg_next)); + return (base_type(arg_curr) == ARG_PTR_TO_MEM) != + arg_type_is_mem_size(arg_next); } static bool check_arg_pair_ok(const struct bpf_func_proto *fn) @@ -5929,7 +6415,7 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn) * helper function specification. */ if (arg_type_is_mem_size(fn->arg1_type) || - arg_type_is_mem_ptr(fn->arg5_type) || + base_type(fn->arg5_type) == ARG_PTR_TO_MEM || check_args_pair_invalid(fn->arg1_type, fn->arg2_type) || check_args_pair_invalid(fn->arg2_type, fn->arg3_type) || check_args_pair_invalid(fn->arg3_type, fn->arg4_type) || @@ -5971,17 +6457,18 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) int i; for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { - if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) return false; - if (fn->arg_type[i] != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) return false; } return true; } -static int check_func_proto(const struct bpf_func_proto *fn, int func_id) +static int check_func_proto(const struct bpf_func_proto *fn, int func_id, + struct bpf_call_arg_meta *meta) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && @@ -6476,7 +6963,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_pop_elem && func_id != BPF_FUNC_map_peek_elem && func_id != BPF_FUNC_for_each_map_elem && - func_id != BPF_FUNC_redirect_map) + func_id != BPF_FUNC_redirect_map && + func_id != BPF_FUNC_map_lookup_percpu_elem) return 0; if (map == NULL) { @@ -6665,7 +7153,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - err = check_func_proto(fn, func_id); + err = check_func_proto(fn, func_id, &meta); if (err) { verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); @@ -6698,8 +7186,35 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return err; } - if (is_release_function(func_id)) { - err = release_reference(env, meta.ref_obj_id); + regs = cur_regs(env); + + if (meta.uninit_dynptr_regno) { + /* we write BPF_DW bits (8 bytes) at a time */ + for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { + err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno, + i, BPF_DW, BPF_WRITE, -1, false); + if (err) + return err; + } + + err = mark_stack_slots_dynptr(env, ®s[meta.uninit_dynptr_regno], + fn->arg_type[meta.uninit_dynptr_regno - BPF_REG_1], + insn_idx); + if (err) + return err; + } + + if (meta.release_regno) { + err = -EINVAL; + if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) + err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); + else if (meta.ref_obj_id) + err = release_reference(env, meta.ref_obj_id); + /* meta.ref_obj_id can only be 0 if register that is meant to be + * released is NULL, which must be > R0. + */ + else if (register_is_null(®s[meta.release_regno])) + err = 0; if (err) { verbose(env, "func %s#%d reference has not been acquired before\n", func_id_name(func_id), func_id); @@ -6707,8 +7222,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } } - regs = cur_regs(env); - switch (func_id) { case BPF_FUNC_tail_call: err = check_reference_leak(env); @@ -6745,6 +7258,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_loop_callback_state); break; + case BPF_FUNC_dynptr_from_mem: + if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) { + verbose(env, "Unsupported reg type %s for bpf_dynptr_from_mem data\n", + reg_type_str(env, regs[BPF_REG_1].type)); + return -EACCES; + } } if (err) @@ -6832,21 +7351,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].btf_id = meta.ret_btf_id; } } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) { + struct btf *ret_btf; int ret_btf_id; mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; - ret_btf_id = *fn->ret_btf_id; + if (func_id == BPF_FUNC_kptr_xchg) { + ret_btf = meta.kptr_off_desc->kptr.btf; + ret_btf_id = meta.kptr_off_desc->kptr.btf_id; + } else { + ret_btf = btf_vmlinux; + ret_btf_id = *fn->ret_btf_id; + } if (ret_btf_id == 0) { verbose(env, "invalid return type %u of func %s#%d\n", base_type(ret_type), func_id_name(func_id), func_id); return -EINVAL; } - /* current BPF helper definitions are only coming from - * built-in code with type IDs from vmlinux BTF - */ - regs[BPF_REG_0].btf = btf_vmlinux; + regs[BPF_REG_0].btf = ret_btf; regs[BPF_REG_0].btf_id = ret_btf_id; } else { verbose(env, "unknown return type %u of func %s#%d\n", @@ -6869,6 +7392,21 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].id = id; /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = id; + } else if (func_id == BPF_FUNC_dynptr_data) { + int dynptr_id = 0, i; + + /* Find the id of the dynptr we're acquiring a reference to */ + for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { + if (arg_type_is_dynptr(fn->arg_type[i])) { + if (dynptr_id) { + verbose(env, "verifier internal error: multiple dynptr args in func\n"); + return -EFAULT; + } + dynptr_id = stack_slot_get_id(env, ®s[BPF_REG_1 + i]); + } + } + /* For release_reference() */ + regs[BPF_REG_0].ref_obj_id = dynptr_id; } do_refine_retval_range(regs, fn->ret_type, func_id, &meta); @@ -6951,7 +7489,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (!insn->imm) return 0; - desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off); + desc_btf = find_kfunc_desc_btf(env, insn->off); if (IS_ERR(desc_btf)) return PTR_ERR(desc_btf); @@ -7433,7 +7971,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, return -EACCES; break; case PTR_TO_MAP_VALUE: - if (check_map_access(env, dst, dst_reg->off, 1, false)) { + if (check_map_access(env, dst, dst_reg->off, 1, false, ACCESS_HELPER)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " "prohibited for !root\n", dst); return -EACCES; @@ -12822,7 +13360,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (!ctx_access) continue; - switch (env->insn_aux_data[i + delta].ptr_type) { + switch ((int)env->insn_aux_data[i + delta].ptr_type) { case PTR_TO_CTX: if (!ops->convert_ctx_access) continue; @@ -12839,6 +13377,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) convert_ctx_access = bpf_xdp_sock_convert_ctx_access; break; case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID | PTR_UNTRUSTED: if (type == BPF_READ) { insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); @@ -13524,7 +14063,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn->imm == BPF_FUNC_map_pop_elem || insn->imm == BPF_FUNC_map_peek_elem || insn->imm == BPF_FUNC_redirect_map || - insn->imm == BPF_FUNC_for_each_map_elem)) { + insn->imm == BPF_FUNC_for_each_map_elem || + insn->imm == BPF_FUNC_map_lookup_percpu_elem)) { aux = &env->insn_aux_data[i + delta]; if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; @@ -13573,6 +14113,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) bpf_callback_t callback_fn, void *callback_ctx, u64 flags))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_lookup_percpu_elem, + (void *(*)(struct bpf_map *map, void *key, u32 cpu))NULL)); patch_map_ops_generic: switch (insn->imm) { @@ -13600,6 +14142,9 @@ patch_map_ops_generic: case BPF_FUNC_for_each_map_elem: insn->imm = BPF_CALL_IMM(ops->map_for_each_callback); continue; + case BPF_FUNC_map_lookup_percpu_elem: + insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem); + continue; } goto patch_call_imm; |