diff options
Diffstat (limited to 'kernel')
228 files changed, 14204 insertions, 3612 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 9a20016d4900..af601b9bda0e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -12,7 +12,7 @@ obj-y = fork.o exec_domain.o panic.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o regset.o -obj-$(CONFIG_BPFILTER) += usermode_driver.o +obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o obj-$(CONFIG_MODULES) += kmod.o obj-$(CONFIG_MULTIUSER) += groups.o @@ -38,9 +38,6 @@ KASAN_SANITIZE_kcov.o := n KCSAN_SANITIZE_kcov.o := n CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector -# cond_syscall is currently not LTO compatible -CFLAGS_sys_ni.o = $(DISABLE_LTO) - obj-y += sched/ obj-y += locking/ obj-y += power/ @@ -111,6 +108,7 @@ obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_BPF) += bpf/ obj-$(CONFIG_KCSAN) += kcsan/ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o +obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call.o obj-$(CONFIG_PERF_EVENTS) += events/ @@ -133,6 +131,8 @@ KASAN_SANITIZE_stackleak.o := n KCSAN_SANITIZE_stackleak.o := n KCOV_INSTRUMENT_stackleak.o := n +obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o + $(obj)/configs.o: $(obj)/config_data.gz targets += config_data.gz diff --git a/kernel/acct.c b/kernel/acct.c index b0c5b3a9f5af..f175df8f6aa4 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -25,7 +25,7 @@ * Now we silently close acct_file on attempt to reopen. Cleaned sys_acct(). * XTerms and EMACS are manifestations of pure evil. 21/10/98, AV. * - * Fixed a nasty interaction with with sys_umount(). If the accointing + * Fixed a nasty interaction with sys_umount(). If the accounting * was suspeneded we failed to stop it on umount(). Messy. * Another one: remount to readonly didn't stop accounting. * Question: what should we do if we have CAP_SYS_ADMIN but not @@ -263,12 +263,12 @@ static DEFINE_MUTEX(acct_on_mutex); * sys_acct - enable/disable process accounting * @name: file name for accounting records or NULL to shutdown accounting * - * Returns 0 for success or negative errno values for failure. - * * sys_acct() is the only system call needed to implement process * accounting. It takes the name of the file where accounting records * should be written. If the filename is NULL, accounting will be * shutdown. + * + * Returns: 0 for success or negative errno values for failure. */ SYSCALL_DEFINE1(acct, const char __user *, name) { @@ -586,9 +586,7 @@ static void slow_acct_process(struct pid_namespace *ns) } /** - * acct_process - * - * handles process accounting for an exiting task + * acct_process - handles process accounting for an exiting task */ void acct_process(void) { diff --git a/kernel/audit.c b/kernel/audit.c index 7efaece534a9..68cee3bc8cfe 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -123,9 +123,9 @@ static u32 audit_backlog_limit = 64; static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; /* The identity of the user shutting down the audit system. */ -kuid_t audit_sig_uid = INVALID_UID; -pid_t audit_sig_pid = -1; -u32 audit_sig_sid = 0; +static kuid_t audit_sig_uid = INVALID_UID; +static pid_t audit_sig_pid = -1; +static u32 audit_sig_sid; /* Records can be lost in several ways: 0) [suppressed in audit_alloc] @@ -934,8 +934,7 @@ static void audit_free_reply(struct audit_reply *reply) if (!reply) return; - if (reply->skb) - kfree_skb(reply->skb); + kfree_skb(reply->skb); if (reply->net) put_net(reply->net); kfree(reply); diff --git a/kernel/audit.h b/kernel/audit.h index ddc22878433d..3b9c0945225a 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -327,10 +327,6 @@ static inline int audit_signal_info_syscall(struct task_struct *t) extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len); -extern pid_t audit_sig_pid; -extern kuid_t audit_sig_uid; -extern u32 audit_sig_sid; - extern int audit_filter(int msgtype, unsigned int listtype); extern void audit_ctl_lock(void); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a10e2997aa6c..333b3bcfc545 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -681,7 +681,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->values[i] = AUDIT_UID_UNSET; break; } - /* fall through - if set */ + fallthrough; /* if set */ default: data->values[i] = f->val; } diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e6eb9c0402da..bdc8cd1b6767 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -5,6 +5,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o +obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o @@ -12,6 +13,7 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o +obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o obj-$(CONFIG_BPF_SYSCALL) += offload.o obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o endif @@ -29,3 +31,4 @@ ifeq ($(CONFIG_BPF_JIT),y) obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif +obj-$(CONFIG_BPF_PRELOAD) += preload/ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 8ff419b632a6..c6c81eceb68f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -10,11 +10,13 @@ #include <linux/filter.h> #include <linux/perf_event.h> #include <uapi/linux/btf.h> +#include <linux/rcupdate_trace.h> #include "map_in_map.h" #define ARRAY_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK) + (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \ + BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -60,7 +62,11 @@ int array_map_alloc_check(union bpf_attr *attr) return -EINVAL; if (attr->map_type != BPF_MAP_TYPE_ARRAY && - attr->map_flags & BPF_F_MMAPABLE) + attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP)) + return -EINVAL; + + if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY && + attr->map_flags & BPF_F_PRESERVE_ELEMS) return -EINVAL; if (attr->value_size > KMALLOC_MAX_SIZE) @@ -208,7 +214,7 @@ static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm, } /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ -static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; @@ -217,6 +223,9 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; + if (map->map_flags & BPF_F_INNER_MAP) + return -EOPNOTSUPP; + *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); if (!map->bypass_spec_v1) { @@ -487,6 +496,15 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) vma->vm_pgoff + pgoff); } +static bool array_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1) +{ + if (!bpf_map_meta_equal(meta0, meta1)) + return false; + return meta0->map_flags & BPF_F_INNER_MAP ? true : + meta0->max_entries == meta1->max_entries; +} + struct bpf_iter_seq_array_map_info { struct bpf_map *map; void *percpu_value_buf; @@ -625,6 +643,7 @@ static const struct bpf_iter_seq_info iter_seq_info = { static int array_map_btf_id; const struct bpf_map_ops array_map_ops = { + .map_meta_equal = array_map_meta_equal, .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, @@ -647,6 +666,7 @@ const struct bpf_map_ops array_map_ops = { static int percpu_array_map_btf_id; const struct bpf_map_ops percpu_array_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, @@ -888,6 +908,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, struct bpf_prog *old, struct bpf_prog *new) { + u8 *old_addr, *new_addr, *old_bypass_addr; struct prog_poke_elem *elem; struct bpf_array_aux *aux; @@ -908,12 +929,13 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, * there could be danger of use after free otherwise. * 2) Initially when we start tracking aux, the program * is not JITed yet and also does not have a kallsyms - * entry. We skip these as poke->ip_stable is not - * active yet. The JIT will do the final fixup before - * setting it stable. The various poke->ip_stable are - * successively activated, so tail call updates can - * arrive from here while JIT is still finishing its - * final fixup for non-activated poke entries. + * entry. We skip these as poke->tailcall_target_stable + * is not active yet. The JIT will do the final fixup + * before setting it stable. The various + * poke->tailcall_target_stable are successively + * activated, so tail call updates can arrive from here + * while JIT is still finishing its final fixup for + * non-activated poke entries. * 3) On program teardown, the program's kallsym entry gets * removed out of RCU callback, but we can only untrack * from sleepable context, therefore bpf_arch_text_poke() @@ -930,7 +952,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, * 5) Any other error happening below from bpf_arch_text_poke() * is a unexpected bug. */ - if (!READ_ONCE(poke->ip_stable)) + if (!READ_ONCE(poke->tailcall_target_stable)) continue; if (poke->reason != BPF_POKE_REASON_TAIL_CALL) continue; @@ -938,12 +960,39 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, poke->tail_call.key != key) continue; - ret = bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP, - old ? (u8 *)old->bpf_func + - poke->adj_off : NULL, - new ? (u8 *)new->bpf_func + - poke->adj_off : NULL); - BUG_ON(ret < 0 && ret != -EINVAL); + old_bypass_addr = old ? NULL : poke->bypass_addr; + old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; + new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL; + + if (new) { + ret = bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, new_addr); + BUG_ON(ret < 0 && ret != -EINVAL); + if (!old) { + ret = bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + poke->bypass_addr, + NULL); + BUG_ON(ret < 0 && ret != -EINVAL); + } + } else { + ret = bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + old_bypass_addr, + poke->bypass_addr); + BUG_ON(ret < 0 && ret != -EINVAL); + /* let other CPUs finish the execution of program + * so that it will not possible to expose them + * to invalid nop, stack unwind, nop state + */ + if (!ret) + synchronize_rcu(); + ret = bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, NULL); + BUG_ON(ret < 0 && ret != -EINVAL); + } } } } @@ -1003,6 +1052,11 @@ static void prog_array_map_free(struct bpf_map *map) fd_array_map_free(map); } +/* prog_array->aux->{type,jited} is a runtime binding. + * Doing static check alone in the verifier is not enough. + * Thus, prog_array_map cannot be used as an inner_map + * and map_meta_equal is not implemented. + */ static int prog_array_map_btf_id; const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, @@ -1090,6 +1144,9 @@ static void perf_event_fd_array_release(struct bpf_map *map, struct bpf_event_entry *ee; int i; + if (map->map_flags & BPF_F_PRESERVE_ELEMS) + return; + rcu_read_lock(); for (i = 0; i < array->map.max_entries; i++) { ee = READ_ONCE(array->ptrs[i]); @@ -1099,11 +1156,19 @@ static void perf_event_fd_array_release(struct bpf_map *map, rcu_read_unlock(); } +static void perf_event_fd_array_map_free(struct bpf_map *map) +{ + if (map->map_flags & BPF_F_PRESERVE_ELEMS) + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + static int perf_event_array_map_btf_id; const struct bpf_map_ops perf_event_array_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, - .map_free = fd_array_map_free, + .map_free = perf_event_fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, @@ -1137,6 +1202,7 @@ static void cgroup_fd_array_free(struct bpf_map *map) static int cgroup_array_map_btf_id; const struct bpf_map_ops cgroup_array_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = cgroup_fd_array_free, @@ -1190,7 +1256,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) return READ_ONCE(*inner_map); } -static u32 array_of_map_gen_lookup(struct bpf_map *map, +static int array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c new file mode 100644 index 000000000000..6edff97ad594 --- /dev/null +++ b/kernel/bpf/bpf_inode_storage.c @@ -0,0 +1,272 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019 Facebook + * Copyright 2020 Google LLC. + */ + +#include <linux/rculist.h> +#include <linux/list.h> +#include <linux/hash.h> +#include <linux/types.h> +#include <linux/spinlock.h> +#include <linux/bpf.h> +#include <linux/bpf_local_storage.h> +#include <net/sock.h> +#include <uapi/linux/sock_diag.h> +#include <uapi/linux/btf.h> +#include <linux/bpf_lsm.h> +#include <linux/btf_ids.h> +#include <linux/fdtable.h> + +DEFINE_BPF_STORAGE_CACHE(inode_cache); + +static struct bpf_local_storage __rcu ** +inode_storage_ptr(void *owner) +{ + struct inode *inode = owner; + struct bpf_storage_blob *bsb; + + bsb = bpf_inode(inode); + if (!bsb) + return NULL; + return &bsb->storage; +} + +static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode, + struct bpf_map *map, + bool cacheit_lockit) +{ + struct bpf_local_storage *inode_storage; + struct bpf_local_storage_map *smap; + struct bpf_storage_blob *bsb; + + bsb = bpf_inode(inode); + if (!bsb) + return NULL; + + inode_storage = rcu_dereference(bsb->storage); + if (!inode_storage) + return NULL; + + smap = (struct bpf_local_storage_map *)map; + return bpf_local_storage_lookup(inode_storage, smap, cacheit_lockit); +} + +void bpf_inode_storage_free(struct inode *inode) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage *local_storage; + bool free_inode_storage = false; + struct bpf_storage_blob *bsb; + struct hlist_node *n; + + bsb = bpf_inode(inode); + if (!bsb) + return; + + rcu_read_lock(); + + local_storage = rcu_dereference(bsb->storage); + if (!local_storage) { + rcu_read_unlock(); + return; + } + + /* Netiher the bpf_prog nor the bpf-map's syscall + * could be modifying the local_storage->list now. + * Thus, no elem can be added-to or deleted-from the + * local_storage->list by the bpf_prog or by the bpf-map's syscall. + * + * It is racing with bpf_local_storage_map_free() alone + * when unlinking elem from the local_storage->list and + * the map's bucket->list. + */ + raw_spin_lock_bh(&local_storage->lock); + hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { + /* Always unlink from map before unlinking from + * local_storage. + */ + bpf_selem_unlink_map(selem); + free_inode_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, false); + } + raw_spin_unlock_bh(&local_storage->lock); + rcu_read_unlock(); + + /* free_inoode_storage should always be true as long as + * local_storage->list was non-empty. + */ + if (free_inode_storage) + kfree_rcu(local_storage, rcu); +} + +static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_local_storage_data *sdata; + struct file *f; + int fd; + + fd = *(int *)key; + f = fget_raw(fd); + if (!f) + return NULL; + + sdata = inode_storage_lookup(f->f_inode, map, true); + fput(f); + return sdata ? sdata->data : NULL; +} + +static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_local_storage_data *sdata; + struct file *f; + int fd; + + fd = *(int *)key; + f = fget_raw(fd); + if (!f || !inode_storage_ptr(f->f_inode)) + return -EBADF; + + sdata = bpf_local_storage_update(f->f_inode, + (struct bpf_local_storage_map *)map, + value, map_flags); + fput(f); + return PTR_ERR_OR_ZERO(sdata); +} + +static int inode_storage_delete(struct inode *inode, struct bpf_map *map) +{ + struct bpf_local_storage_data *sdata; + + sdata = inode_storage_lookup(inode, map, false); + if (!sdata) + return -ENOENT; + + bpf_selem_unlink(SELEM(sdata)); + + return 0; +} + +static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) +{ + struct file *f; + int fd, err; + + fd = *(int *)key; + f = fget_raw(fd); + if (!f) + return -EBADF; + + err = inode_storage_delete(f->f_inode, map); + fput(f); + return err; +} + +BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, + void *, value, u64, flags) +{ + struct bpf_local_storage_data *sdata; + + if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) + return (unsigned long)NULL; + + /* explicitly check that the inode_storage_ptr is not + * NULL as inode_storage_lookup returns NULL in this case and + * bpf_local_storage_update expects the owner to have a + * valid storage pointer. + */ + if (!inode_storage_ptr(inode)) + return (unsigned long)NULL; + + sdata = inode_storage_lookup(inode, map, true); + if (sdata) + return (unsigned long)sdata->data; + + /* This helper must only called from where the inode is gurranteed + * to have a refcount and cannot be freed. + */ + if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { + sdata = bpf_local_storage_update( + inode, (struct bpf_local_storage_map *)map, value, + BPF_NOEXIST); + return IS_ERR(sdata) ? (unsigned long)NULL : + (unsigned long)sdata->data; + } + + return (unsigned long)NULL; +} + +BPF_CALL_2(bpf_inode_storage_delete, + struct bpf_map *, map, struct inode *, inode) +{ + /* This helper must only called from where the inode is gurranteed + * to have a refcount and cannot be freed. + */ + return inode_storage_delete(inode, map); +} + +static int notsupp_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -ENOTSUPP; +} + +static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr) +{ + struct bpf_local_storage_map *smap; + + smap = bpf_local_storage_map_alloc(attr); + if (IS_ERR(smap)) + return ERR_CAST(smap); + + smap->cache_idx = bpf_local_storage_cache_idx_get(&inode_cache); + return &smap->map; +} + +static void inode_storage_map_free(struct bpf_map *map) +{ + struct bpf_local_storage_map *smap; + + smap = (struct bpf_local_storage_map *)map; + bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx); + bpf_local_storage_map_free(smap); +} + +static int inode_storage_map_btf_id; +const struct bpf_map_ops inode_storage_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = bpf_local_storage_map_alloc_check, + .map_alloc = inode_storage_map_alloc, + .map_free = inode_storage_map_free, + .map_get_next_key = notsupp_get_next_key, + .map_lookup_elem = bpf_fd_inode_storage_lookup_elem, + .map_update_elem = bpf_fd_inode_storage_update_elem, + .map_delete_elem = bpf_fd_inode_storage_delete_elem, + .map_check_btf = bpf_local_storage_map_check_btf, + .map_btf_name = "bpf_local_storage_map", + .map_btf_id = &inode_storage_map_btf_id, + .map_owner_storage_ptr = inode_storage_ptr, +}; + +BTF_ID_LIST_SINGLE(bpf_inode_storage_btf_ids, struct, inode) + +const struct bpf_func_proto bpf_inode_storage_get_proto = { + .func = bpf_inode_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_inode_storage_btf_ids[0], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_func_proto bpf_inode_storage_delete_proto = { + .func = bpf_inode_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_inode_storage_btf_ids[0], +}; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index b6715964b685..8f10e30ea0b0 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -67,6 +67,9 @@ static void bpf_iter_done_stop(struct seq_file *seq) iter_priv->done_stop = true; } +/* maximum visited objects before bailing out */ +#define MAX_ITER_OBJECTS 1000000 + /* bpf_seq_read, a customized and simpler version for bpf iterator. * no_llseek is assumed for this file. * The following are differences from seq_read(): @@ -79,14 +82,14 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, { struct seq_file *seq = file->private_data; size_t n, offs, copied = 0; - int err = 0; + int err = 0, num_objs = 0; void *p; mutex_lock(&seq->lock); if (!seq->buf) { - seq->size = PAGE_SIZE; - seq->buf = kmalloc(seq->size, GFP_KERNEL); + seq->size = PAGE_SIZE << 3; + seq->buf = kvmalloc(seq->size, GFP_KERNEL); if (!seq->buf) { err = -ENOMEM; goto done; @@ -135,6 +138,7 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, while (1) { loff_t pos = seq->index; + num_objs++; offs = seq->count; p = seq->op->next(seq, p, &seq->index); if (pos == seq->index) { @@ -153,6 +157,15 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, if (seq->count >= size) break; + if (num_objs >= MAX_ITER_OBJECTS) { + if (offs == 0) { + err = -EAGAIN; + seq->op->stop(seq, p); + goto done; + } + break; + } + err = seq->op->show(seq, p); if (err > 0) { bpf_iter_dec_seq_num(seq); @@ -377,10 +390,68 @@ out_unlock: return ret; } +static void bpf_iter_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + bpf_iter_show_fdinfo_t show_fdinfo; + + seq_printf(seq, + "target_name:\t%s\n", + iter_link->tinfo->reg_info->target); + + show_fdinfo = iter_link->tinfo->reg_info->show_fdinfo; + if (show_fdinfo) + show_fdinfo(&iter_link->aux, seq); +} + +static int bpf_iter_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + char __user *ubuf = u64_to_user_ptr(info->iter.target_name); + bpf_iter_fill_link_info_t fill_link_info; + u32 ulen = info->iter.target_name_len; + const char *target_name; + u32 target_len; + + if (!ulen ^ !ubuf) + return -EINVAL; + + target_name = iter_link->tinfo->reg_info->target; + target_len = strlen(target_name); + info->iter.target_name_len = target_len + 1; + + if (ubuf) { + if (ulen >= target_len + 1) { + if (copy_to_user(ubuf, target_name, target_len + 1)) + return -EFAULT; + } else { + char zero = '\0'; + + if (copy_to_user(ubuf, target_name, ulen - 1)) + return -EFAULT; + if (put_user(zero, ubuf + ulen - 1)) + return -EFAULT; + return -ENOSPC; + } + } + + fill_link_info = iter_link->tinfo->reg_info->fill_link_info; + if (fill_link_info) + return fill_link_info(&iter_link->aux, info); + + return 0; +} + static const struct bpf_link_ops bpf_iter_link_lops = { .release = bpf_iter_link_release, .dealloc = bpf_iter_link_dealloc, .update_prog = bpf_iter_link_replace, + .show_fdinfo = bpf_iter_link_show_fdinfo, + .fill_link_info = bpf_iter_link_fill_link_info, }; bool bpf_link_is_iter(struct bpf_link *link) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c new file mode 100644 index 000000000000..5d3a7af9ba9b --- /dev/null +++ b/kernel/bpf/bpf_local_storage.c @@ -0,0 +1,600 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ +#include <linux/rculist.h> +#include <linux/list.h> +#include <linux/hash.h> +#include <linux/types.h> +#include <linux/spinlock.h> +#include <linux/bpf.h> +#include <linux/btf_ids.h> +#include <linux/bpf_local_storage.h> +#include <net/sock.h> +#include <uapi/linux/sock_diag.h> +#include <uapi/linux/btf.h> + +#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE) + +static struct bpf_local_storage_map_bucket * +select_bucket(struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem) +{ + return &smap->buckets[hash_ptr(selem, smap->bucket_log)]; +} + +static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) +{ + struct bpf_map *map = &smap->map; + + if (!map->ops->map_local_storage_charge) + return 0; + + return map->ops->map_local_storage_charge(smap, owner, size); +} + +static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner, + u32 size) +{ + struct bpf_map *map = &smap->map; + + if (map->ops->map_local_storage_uncharge) + map->ops->map_local_storage_uncharge(smap, owner, size); +} + +static struct bpf_local_storage __rcu ** +owner_storage(struct bpf_local_storage_map *smap, void *owner) +{ + struct bpf_map *map = &smap->map; + + return map->ops->map_owner_storage_ptr(owner); +} + +static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem) +{ + return !hlist_unhashed(&selem->snode); +} + +static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) +{ + return !hlist_unhashed(&selem->map_node); +} + +struct bpf_local_storage_elem * +bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, + void *value, bool charge_mem) +{ + struct bpf_local_storage_elem *selem; + + if (charge_mem && mem_charge(smap, owner, smap->elem_size)) + return NULL; + + selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (selem) { + if (value) + memcpy(SDATA(selem)->data, value, smap->map.value_size); + return selem; + } + + if (charge_mem) + mem_uncharge(smap, owner, smap->elem_size); + + return NULL; +} + +/* local_storage->lock must be held and selem->local_storage == local_storage. + * The caller must ensure selem->smap is still valid to be + * dereferenced for its smap->elem_size and smap->cache_idx. + */ +bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem, + bool uncharge_mem) +{ + struct bpf_local_storage_map *smap; + bool free_local_storage; + void *owner; + + smap = rcu_dereference(SDATA(selem)->smap); + owner = local_storage->owner; + + /* All uncharging on the owner must be done first. + * The owner may be freed once the last selem is unlinked + * from local_storage. + */ + if (uncharge_mem) + mem_uncharge(smap, owner, smap->elem_size); + + free_local_storage = hlist_is_singular_node(&selem->snode, + &local_storage->list); + if (free_local_storage) { + mem_uncharge(smap, owner, sizeof(struct bpf_local_storage)); + local_storage->owner = NULL; + + /* After this RCU_INIT, owner may be freed and cannot be used */ + RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); + + /* local_storage is not freed now. local_storage->lock is + * still held and raw_spin_unlock_bh(&local_storage->lock) + * will be done by the caller. + * + * Although the unlock will be done under + * rcu_read_lock(), it is more intutivie to + * read if kfree_rcu(local_storage, rcu) is done + * after the raw_spin_unlock_bh(&local_storage->lock). + * + * Hence, a "bool free_local_storage" is returned + * to the caller which then calls the kfree_rcu() + * after unlock. + */ + } + hlist_del_init_rcu(&selem->snode); + if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) == + SDATA(selem)) + RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); + + kfree_rcu(selem, rcu); + + return free_local_storage; +} + +static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) +{ + struct bpf_local_storage *local_storage; + bool free_local_storage = false; + + if (unlikely(!selem_linked_to_storage(selem))) + /* selem has already been unlinked from sk */ + return; + + local_storage = rcu_dereference(selem->local_storage); + raw_spin_lock_bh(&local_storage->lock); + if (likely(selem_linked_to_storage(selem))) + free_local_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, true); + raw_spin_unlock_bh(&local_storage->lock); + + if (free_local_storage) + kfree_rcu(local_storage, rcu); +} + +void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem) +{ + RCU_INIT_POINTER(selem->local_storage, local_storage); + hlist_add_head_rcu(&selem->snode, &local_storage->list); +} + +void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) +{ + struct bpf_local_storage_map *smap; + struct bpf_local_storage_map_bucket *b; + + if (unlikely(!selem_linked_to_map(selem))) + /* selem has already be unlinked from smap */ + return; + + smap = rcu_dereference(SDATA(selem)->smap); + b = select_bucket(smap, selem); + raw_spin_lock_bh(&b->lock); + if (likely(selem_linked_to_map(selem))) + hlist_del_init_rcu(&selem->map_node); + raw_spin_unlock_bh(&b->lock); +} + +void bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem) +{ + struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem); + + raw_spin_lock_bh(&b->lock); + RCU_INIT_POINTER(SDATA(selem)->smap, smap); + hlist_add_head_rcu(&selem->map_node, &b->list); + raw_spin_unlock_bh(&b->lock); +} + +void bpf_selem_unlink(struct bpf_local_storage_elem *selem) +{ + /* Always unlink from map before unlinking from local_storage + * because selem will be freed after successfully unlinked from + * the local_storage. + */ + bpf_selem_unlink_map(selem); + __bpf_selem_unlink_storage(selem); +} + +struct bpf_local_storage_data * +bpf_local_storage_lookup(struct bpf_local_storage *local_storage, + struct bpf_local_storage_map *smap, + bool cacheit_lockit) +{ + struct bpf_local_storage_data *sdata; + struct bpf_local_storage_elem *selem; + + /* Fast path (cache hit) */ + sdata = rcu_dereference(local_storage->cache[smap->cache_idx]); + if (sdata && rcu_access_pointer(sdata->smap) == smap) + return sdata; + + /* Slow path (cache miss) */ + hlist_for_each_entry_rcu(selem, &local_storage->list, snode) + if (rcu_access_pointer(SDATA(selem)->smap) == smap) + break; + + if (!selem) + return NULL; + + sdata = SDATA(selem); + if (cacheit_lockit) { + /* spinlock is needed to avoid racing with the + * parallel delete. Otherwise, publishing an already + * deleted sdata to the cache will become a use-after-free + * problem in the next bpf_local_storage_lookup(). + */ + raw_spin_lock_bh(&local_storage->lock); + if (selem_linked_to_storage(selem)) + rcu_assign_pointer(local_storage->cache[smap->cache_idx], + sdata); + raw_spin_unlock_bh(&local_storage->lock); + } + + return sdata; +} + +static int check_flags(const struct bpf_local_storage_data *old_sdata, + u64 map_flags) +{ + if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) + /* elem already exists */ + return -EEXIST; + + if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) + /* elem doesn't exist, cannot update it */ + return -ENOENT; + + return 0; +} + +int bpf_local_storage_alloc(void *owner, + struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *first_selem) +{ + struct bpf_local_storage *prev_storage, *storage; + struct bpf_local_storage **owner_storage_ptr; + int err; + + err = mem_charge(smap, owner, sizeof(*storage)); + if (err) + return err; + + storage = kzalloc(sizeof(*storage), GFP_ATOMIC | __GFP_NOWARN); + if (!storage) { + err = -ENOMEM; + goto uncharge; + } + + INIT_HLIST_HEAD(&storage->list); + raw_spin_lock_init(&storage->lock); + storage->owner = owner; + + bpf_selem_link_storage_nolock(storage, first_selem); + bpf_selem_link_map(smap, first_selem); + + owner_storage_ptr = + (struct bpf_local_storage **)owner_storage(smap, owner); + /* Publish storage to the owner. + * Instead of using any lock of the kernel object (i.e. owner), + * cmpxchg will work with any kernel object regardless what + * the running context is, bh, irq...etc. + * + * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage) + * is protected by the storage->lock. Hence, when freeing + * the owner->storage, the storage->lock must be held before + * setting owner->storage ptr to NULL. + */ + prev_storage = cmpxchg(owner_storage_ptr, NULL, storage); + if (unlikely(prev_storage)) { + bpf_selem_unlink_map(first_selem); + err = -EAGAIN; + goto uncharge; + + /* Note that even first_selem was linked to smap's + * bucket->list, first_selem can be freed immediately + * (instead of kfree_rcu) because + * bpf_local_storage_map_free() does a + * synchronize_rcu() before walking the bucket->list. + * Hence, no one is accessing selem from the + * bucket->list under rcu_read_lock(). + */ + } + + return 0; + +uncharge: + kfree(storage); + mem_uncharge(smap, owner, sizeof(*storage)); + return err; +} + +/* sk cannot be going away because it is linking new elem + * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0). + * Otherwise, it will become a leak (and other memory issues + * during map destruction). + */ +struct bpf_local_storage_data * +bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, + void *value, u64 map_flags) +{ + struct bpf_local_storage_data *old_sdata = NULL; + struct bpf_local_storage_elem *selem; + struct bpf_local_storage *local_storage; + int err; + + /* BPF_EXIST and BPF_NOEXIST cannot be both set */ + if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || + /* BPF_F_LOCK can only be used in a value with spin_lock */ + unlikely((map_flags & BPF_F_LOCK) && + !map_value_has_spin_lock(&smap->map))) + return ERR_PTR(-EINVAL); + + local_storage = rcu_dereference(*owner_storage(smap, owner)); + if (!local_storage || hlist_empty(&local_storage->list)) { + /* Very first elem for the owner */ + err = check_flags(NULL, map_flags); + if (err) + return ERR_PTR(err); + + selem = bpf_selem_alloc(smap, owner, value, true); + if (!selem) + return ERR_PTR(-ENOMEM); + + err = bpf_local_storage_alloc(owner, smap, selem); + if (err) { + kfree(selem); + mem_uncharge(smap, owner, smap->elem_size); + return ERR_PTR(err); + } + + return SDATA(selem); + } + + if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) { + /* Hoping to find an old_sdata to do inline update + * such that it can avoid taking the local_storage->lock + * and changing the lists. + */ + old_sdata = + bpf_local_storage_lookup(local_storage, smap, false); + err = check_flags(old_sdata, map_flags); + if (err) + return ERR_PTR(err); + if (old_sdata && selem_linked_to_storage(SELEM(old_sdata))) { + copy_map_value_locked(&smap->map, old_sdata->data, + value, false); + return old_sdata; + } + } + + raw_spin_lock_bh(&local_storage->lock); + + /* Recheck local_storage->list under local_storage->lock */ + if (unlikely(hlist_empty(&local_storage->list))) { + /* A parallel del is happening and local_storage is going + * away. It has just been checked before, so very + * unlikely. Return instead of retry to keep things + * simple. + */ + err = -EAGAIN; + goto unlock_err; + } + + old_sdata = bpf_local_storage_lookup(local_storage, smap, false); + err = check_flags(old_sdata, map_flags); + if (err) + goto unlock_err; + + if (old_sdata && (map_flags & BPF_F_LOCK)) { + copy_map_value_locked(&smap->map, old_sdata->data, value, + false); + selem = SELEM(old_sdata); + goto unlock; + } + + /* local_storage->lock is held. Hence, we are sure + * we can unlink and uncharge the old_sdata successfully + * later. Hence, instead of charging the new selem now + * and then uncharge the old selem later (which may cause + * a potential but unnecessary charge failure), avoid taking + * a charge at all here (the "!old_sdata" check) and the + * old_sdata will not be uncharged later during + * bpf_selem_unlink_storage_nolock(). + */ + selem = bpf_selem_alloc(smap, owner, value, !old_sdata); + if (!selem) { + err = -ENOMEM; + goto unlock_err; + } + + /* First, link the new selem to the map */ + bpf_selem_link_map(smap, selem); + + /* Second, link (and publish) the new selem to local_storage */ + bpf_selem_link_storage_nolock(local_storage, selem); + + /* Third, remove old selem, SELEM(old_sdata) */ + if (old_sdata) { + bpf_selem_unlink_map(SELEM(old_sdata)); + bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), + false); + } + +unlock: + raw_spin_unlock_bh(&local_storage->lock); + return SDATA(selem); + +unlock_err: + raw_spin_unlock_bh(&local_storage->lock); + return ERR_PTR(err); +} + +u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) +{ + u64 min_usage = U64_MAX; + u16 i, res = 0; + + spin_lock(&cache->idx_lock); + + for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) { + if (cache->idx_usage_counts[i] < min_usage) { + min_usage = cache->idx_usage_counts[i]; + res = i; + + /* Found a free cache_idx */ + if (!min_usage) + break; + } + } + cache->idx_usage_counts[res]++; + + spin_unlock(&cache->idx_lock); + + return res; +} + +void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, + u16 idx) +{ + spin_lock(&cache->idx_lock); + cache->idx_usage_counts[idx]--; + spin_unlock(&cache->idx_lock); +} + +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map_bucket *b; + unsigned int i; + + /* Note that this map might be concurrently cloned from + * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone + * RCU read section to finish before proceeding. New RCU + * read sections should be prevented via bpf_map_inc_not_zero. + */ + synchronize_rcu(); + + /* bpf prog and the userspace can no longer access this map + * now. No new selem (of this map) can be added + * to the owner->storage or to the map bucket's list. + * + * The elem of this map can be cleaned up here + * or when the storage is freed e.g. + * by bpf_sk_storage_free() during __sk_destruct(). + */ + for (i = 0; i < (1U << smap->bucket_log); i++) { + b = &smap->buckets[i]; + + rcu_read_lock(); + /* No one is adding to b->list now */ + while ((selem = hlist_entry_safe( + rcu_dereference_raw(hlist_first_rcu(&b->list)), + struct bpf_local_storage_elem, map_node))) { + bpf_selem_unlink(selem); + cond_resched_rcu(); + } + rcu_read_unlock(); + } + + /* While freeing the storage we may still need to access the map. + * + * e.g. when bpf_sk_storage_free() has unlinked selem from the map + * which then made the above while((selem = ...)) loop + * exit immediately. + * + * However, while freeing the storage one still needs to access the + * smap->elem_size to do the uncharging in + * bpf_selem_unlink_storage_nolock(). + * + * Hence, wait another rcu grace period for the storage to be freed. + */ + synchronize_rcu(); + + kvfree(smap->buckets); + kfree(smap); +} + +int bpf_local_storage_map_alloc_check(union bpf_attr *attr) +{ + if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK || + !(attr->map_flags & BPF_F_NO_PREALLOC) || + attr->max_entries || + attr->key_size != sizeof(int) || !attr->value_size || + /* Enforce BTF for userspace sk dumping */ + !attr->btf_key_type_id || !attr->btf_value_type_id) + return -EINVAL; + + if (!bpf_capable()) + return -EPERM; + + if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE) + return -E2BIG; + + return 0; +} + +struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) +{ + struct bpf_local_storage_map *smap; + unsigned int i; + u32 nbuckets; + u64 cost; + int ret; + + smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN); + if (!smap) + return ERR_PTR(-ENOMEM); + bpf_map_init_from_attr(&smap->map, attr); + + nbuckets = roundup_pow_of_two(num_possible_cpus()); + /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ + nbuckets = max_t(u32, 2, nbuckets); + smap->bucket_log = ilog2(nbuckets); + cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); + + ret = bpf_map_charge_init(&smap->map.memory, cost); + if (ret < 0) { + kfree(smap); + return ERR_PTR(ret); + } + + smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, + GFP_USER | __GFP_NOWARN); + if (!smap->buckets) { + bpf_map_charge_finish(&smap->map.memory); + kfree(smap); + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < nbuckets; i++) { + INIT_HLIST_HEAD(&smap->buckets[i].list); + raw_spin_lock_init(&smap->buckets[i].lock); + } + + smap->elem_size = + sizeof(struct bpf_local_storage_elem) + attr->value_size; + + return smap; +} + +int bpf_local_storage_map_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + u32 int_data; + + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + return -EINVAL; + + int_data = *(u32 *)(key_type + 1); + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) + return -EINVAL; + + return 0; +} diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index fb278144e9fd..78ea8a7bd27f 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -11,6 +11,8 @@ #include <linux/bpf_lsm.h> #include <linux/kallsyms.h> #include <linux/bpf_verifier.h> +#include <net/bpf_sk_storage.h> +#include <linux/bpf_local_storage.h> /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. @@ -45,10 +47,27 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, return 0; } +static const struct bpf_func_proto * +bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_inode_storage_get: + return &bpf_inode_storage_get_proto; + case BPF_FUNC_inode_storage_delete: + return &bpf_inode_storage_delete_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; + default: + return tracing_prog_func_proto(func_id, prog); + } +} + const struct bpf_prog_ops lsm_prog_ops = { }; const struct bpf_verifier_ops lsm_verifier_ops = { - .get_func_proto = tracing_prog_func_proto, + .get_func_proto = bpf_lsm_func_proto, .is_valid_access = btf_ctx_access, }; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 969c5d47f81f..4c3b543bb33b 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -298,8 +298,7 @@ static int check_zero_holes(const struct btf_type *t, void *data) return -EINVAL; mtype = btf_type_by_id(btf_vmlinux, member->type); - mtype = btf_resolve_size(btf_vmlinux, mtype, &msize, - NULL, NULL); + mtype = btf_resolve_size(btf_vmlinux, mtype, &msize); if (IS_ERR(mtype)) return PTR_ERR(mtype); prev_mend = moff + msize; @@ -396,8 +395,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, u32 msize; mtype = btf_type_by_id(btf_vmlinux, member->type); - mtype = btf_resolve_size(btf_vmlinux, mtype, &msize, - NULL, NULL); + mtype = btf_resolve_size(btf_vmlinux, mtype, &msize); if (IS_ERR(mtype)) { err = PTR_ERR(mtype); goto reset_unlock; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 91afdd4c82e3..ed7d02e8bc93 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -21,6 +21,8 @@ #include <linux/btf_ids.h> #include <linux/skmsg.h> #include <linux/perf_event.h> +#include <linux/bsearch.h> +#include <linux/btf_ids.h> #include <net/sock.h> /* BTF (BPF Type Format) is the meta data format which describes @@ -186,11 +188,6 @@ i < btf_type_vlen(struct_type); \ i++, member++) -#define for_each_vsi(i, struct_type, member) \ - for (i = 0, member = btf_type_var_secinfo(struct_type); \ - i < btf_type_vlen(struct_type); \ - i++, member++) - #define for_each_vsi_from(i, from, struct_type, member) \ for (i = from, member = btf_type_var_secinfo(struct_type) + from; \ i < btf_type_vlen(struct_type); \ @@ -282,6 +279,91 @@ static const char *btf_type_str(const struct btf_type *t) return btf_kind_str[BTF_INFO_KIND(t->info)]; } +/* Chunk size we use in safe copy of data to be shown. */ +#define BTF_SHOW_OBJ_SAFE_SIZE 32 + +/* + * This is the maximum size of a base type value (equivalent to a + * 128-bit int); if we are at the end of our safe buffer and have + * less than 16 bytes space we can't be assured of being able + * to copy the next type safely, so in such cases we will initiate + * a new copy. + */ +#define BTF_SHOW_OBJ_BASE_TYPE_SIZE 16 + +/* Type name size */ +#define BTF_SHOW_NAME_SIZE 80 + +/* + * Common data to all BTF show operations. Private show functions can add + * their own data to a structure containing a struct btf_show and consult it + * in the show callback. See btf_type_show() below. + * + * One challenge with showing nested data is we want to skip 0-valued + * data, but in order to figure out whether a nested object is all zeros + * we need to walk through it. As a result, we need to make two passes + * when handling structs, unions and arrays; the first path simply looks + * for nonzero data, while the second actually does the display. The first + * pass is signalled by show->state.depth_check being set, and if we + * encounter a non-zero value we set show->state.depth_to_show to + * the depth at which we encountered it. When we have completed the + * first pass, we will know if anything needs to be displayed if + * depth_to_show > depth. See btf_[struct,array]_show() for the + * implementation of this. + * + * Another problem is we want to ensure the data for display is safe to + * access. To support this, the anonymous "struct {} obj" tracks the data + * object and our safe copy of it. We copy portions of the data needed + * to the object "copy" buffer, but because its size is limited to + * BTF_SHOW_OBJ_COPY_LEN bytes, multiple copies may be required as we + * traverse larger objects for display. + * + * The various data type show functions all start with a call to + * btf_show_start_type() which returns a pointer to the safe copy + * of the data needed (or if BTF_SHOW_UNSAFE is specified, to the + * raw data itself). btf_show_obj_safe() is responsible for + * using copy_from_kernel_nofault() to update the safe data if necessary + * as we traverse the object's data. skbuff-like semantics are + * used: + * + * - obj.head points to the start of the toplevel object for display + * - obj.size is the size of the toplevel object + * - obj.data points to the current point in the original data at + * which our safe data starts. obj.data will advance as we copy + * portions of the data. + * + * In most cases a single copy will suffice, but larger data structures + * such as "struct task_struct" will require many copies. The logic in + * btf_show_obj_safe() handles the logic that determines if a new + * copy_from_kernel_nofault() is needed. + */ +struct btf_show { + u64 flags; + void *target; /* target of show operation (seq file, buffer) */ + void (*showfn)(struct btf_show *show, const char *fmt, va_list args); + const struct btf *btf; + /* below are used during iteration */ + struct { + u8 depth; + u8 depth_to_show; + u8 depth_check; + u8 array_member:1, + array_terminated:1; + u16 array_encoding; + u32 type_id; + int status; /* non-zero for error */ + const struct btf_type *type; + const struct btf_member *member; + char name[BTF_SHOW_NAME_SIZE]; /* space for member name/type */ + } state; + struct { + u32 size; + void *head; + void *data; + u8 safe[BTF_SHOW_OBJ_SAFE_SIZE]; + } obj; +}; + struct btf_kind_operations { s32 (*check_meta)(struct btf_verifier_env *env, const struct btf_type *t, @@ -298,9 +380,9 @@ struct btf_kind_operations { const struct btf_type *member_type); void (*log_details)(struct btf_verifier_env *env, const struct btf_type *t); - void (*seq_show)(const struct btf *btf, const struct btf_type *t, + void (*show)(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offsets, - struct seq_file *m); + struct btf_show *show); }; static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS]; @@ -353,16 +435,6 @@ static bool btf_type_nosize_or_null(const struct btf_type *t) return !t || btf_type_nosize(t); } -/* union is only a special case of struct: - * all its offsetof(member) == 0 - */ -static bool btf_type_is_struct(const struct btf_type *t) -{ - u8 kind = BTF_INFO_KIND(t->info); - - return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; -} - static bool __btf_type_is_struct(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; @@ -373,11 +445,6 @@ static bool btf_type_is_array(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; } -static bool btf_type_is_var(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; -} - static bool btf_type_is_datasec(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; @@ -526,11 +593,6 @@ static const struct btf_var *btf_type_var(const struct btf_type *t) return (const struct btf_var *)(t + 1); } -static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t) -{ - return (const struct btf_var_secinfo *)(t + 1); -} - static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) { return kind_ops[BTF_INFO_KIND(t->info)]; @@ -677,6 +739,488 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, return true; } +/* Similar to btf_type_skip_modifiers() but does not skip typedefs. */ +static const struct btf_type *btf_type_skip_qualifiers(const struct btf *btf, + u32 id) +{ + const struct btf_type *t = btf_type_by_id(btf, id); + + while (btf_type_is_modifier(t) && + BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) { + id = t->type; + t = btf_type_by_id(btf, t->type); + } + + return t; +} + +#define BTF_SHOW_MAX_ITER 10 + +#define BTF_KIND_BIT(kind) (1ULL << kind) + +/* + * Populate show->state.name with type name information. + * Format of type name is + * + * [.member_name = ] (type_name) + */ +static const char *btf_show_name(struct btf_show *show) +{ + /* BTF_MAX_ITER array suffixes "[]" */ + const char *array_suffixes = "[][][][][][][][][][]"; + const char *array_suffix = &array_suffixes[strlen(array_suffixes)]; + /* BTF_MAX_ITER pointer suffixes "*" */ + const char *ptr_suffixes = "**********"; + const char *ptr_suffix = &ptr_suffixes[strlen(ptr_suffixes)]; + const char *name = NULL, *prefix = "", *parens = ""; + const struct btf_member *m = show->state.member; + const struct btf_type *t = show->state.type; + const struct btf_array *array; + u32 id = show->state.type_id; + const char *member = NULL; + bool show_member = false; + u64 kinds = 0; + int i; + + show->state.name[0] = '\0'; + + /* + * Don't show type name if we're showing an array member; + * in that case we show the array type so don't need to repeat + * ourselves for each member. + */ + if (show->state.array_member) + return ""; + + /* Retrieve member name, if any. */ + if (m) { + member = btf_name_by_offset(show->btf, m->name_off); + show_member = strlen(member) > 0; + id = m->type; + } + + /* + * Start with type_id, as we have resolved the struct btf_type * + * via btf_modifier_show() past the parent typedef to the child + * struct, int etc it is defined as. In such cases, the type_id + * still represents the starting type while the struct btf_type * + * in our show->state points at the resolved type of the typedef. + */ + t = btf_type_by_id(show->btf, id); + if (!t) + return ""; + + /* + * The goal here is to build up the right number of pointer and + * array suffixes while ensuring the type name for a typedef + * is represented. Along the way we accumulate a list of + * BTF kinds we have encountered, since these will inform later + * display; for example, pointer types will not require an + * opening "{" for struct, we will just display the pointer value. + * + * We also want to accumulate the right number of pointer or array + * indices in the format string while iterating until we get to + * the typedef/pointee/array member target type. + * + * We start by pointing at the end of pointer and array suffix + * strings; as we accumulate pointers and arrays we move the pointer + * or array string backwards so it will show the expected number of + * '*' or '[]' for the type. BTF_SHOW_MAX_ITER of nesting of pointers + * and/or arrays and typedefs are supported as a precaution. + * + * We also want to get typedef name while proceeding to resolve + * type it points to so that we can add parentheses if it is a + * "typedef struct" etc. + */ + for (i = 0; i < BTF_SHOW_MAX_ITER; i++) { + + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_TYPEDEF: + if (!name) + name = btf_name_by_offset(show->btf, + t->name_off); + kinds |= BTF_KIND_BIT(BTF_KIND_TYPEDEF); + id = t->type; + break; + case BTF_KIND_ARRAY: + kinds |= BTF_KIND_BIT(BTF_KIND_ARRAY); + parens = "["; + if (!t) + return ""; + array = btf_type_array(t); + if (array_suffix > array_suffixes) + array_suffix -= 2; + id = array->type; + break; + case BTF_KIND_PTR: + kinds |= BTF_KIND_BIT(BTF_KIND_PTR); + if (ptr_suffix > ptr_suffixes) + ptr_suffix -= 1; + id = t->type; + break; + default: + id = 0; + break; + } + if (!id) + break; + t = btf_type_skip_qualifiers(show->btf, id); + } + /* We may not be able to represent this type; bail to be safe */ + if (i == BTF_SHOW_MAX_ITER) + return ""; + + if (!name) + name = btf_name_by_offset(show->btf, t->name_off); + + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + prefix = BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT ? + "struct" : "union"; + /* if it's an array of struct/union, parens is already set */ + if (!(kinds & (BTF_KIND_BIT(BTF_KIND_ARRAY)))) + parens = "{"; + break; + case BTF_KIND_ENUM: + prefix = "enum"; + break; + default: + break; + } + + /* pointer does not require parens */ + if (kinds & BTF_KIND_BIT(BTF_KIND_PTR)) + parens = ""; + /* typedef does not require struct/union/enum prefix */ + if (kinds & BTF_KIND_BIT(BTF_KIND_TYPEDEF)) + prefix = ""; + + if (!name) + name = ""; + + /* Even if we don't want type name info, we want parentheses etc */ + if (show->flags & BTF_SHOW_NONAME) + snprintf(show->state.name, sizeof(show->state.name), "%s", + parens); + else + snprintf(show->state.name, sizeof(show->state.name), + "%s%s%s(%s%s%s%s%s%s)%s", + /* first 3 strings comprise ".member = " */ + show_member ? "." : "", + show_member ? member : "", + show_member ? " = " : "", + /* ...next is our prefix (struct, enum, etc) */ + prefix, + strlen(prefix) > 0 && strlen(name) > 0 ? " " : "", + /* ...this is the type name itself */ + name, + /* ...suffixed by the appropriate '*', '[]' suffixes */ + strlen(ptr_suffix) > 0 ? " " : "", ptr_suffix, + array_suffix, parens); + + return show->state.name; +} + +static const char *__btf_show_indent(struct btf_show *show) +{ + const char *indents = " "; + const char *indent = &indents[strlen(indents)]; + + if ((indent - show->state.depth) >= indents) + return indent - show->state.depth; + return indents; +} + +static const char *btf_show_indent(struct btf_show *show) +{ + return show->flags & BTF_SHOW_COMPACT ? "" : __btf_show_indent(show); +} + +static const char *btf_show_newline(struct btf_show *show) +{ + return show->flags & BTF_SHOW_COMPACT ? "" : "\n"; +} + +static const char *btf_show_delim(struct btf_show *show) +{ + if (show->state.depth == 0) + return ""; + + if ((show->flags & BTF_SHOW_COMPACT) && show->state.type && + BTF_INFO_KIND(show->state.type->info) == BTF_KIND_UNION) + return "|"; + + return ","; +} + +__printf(2, 3) static void btf_show(struct btf_show *show, const char *fmt, ...) +{ + va_list args; + + if (!show->state.depth_check) { + va_start(args, fmt); + show->showfn(show, fmt, args); + va_end(args); + } +} + +/* Macros are used here as btf_show_type_value[s]() prepends and appends + * format specifiers to the format specifier passed in; these do the work of + * adding indentation, delimiters etc while the caller simply has to specify + * the type value(s) in the format specifier + value(s). + */ +#define btf_show_type_value(show, fmt, value) \ + do { \ + if ((value) != 0 || (show->flags & BTF_SHOW_ZERO) || \ + show->state.depth == 0) { \ + btf_show(show, "%s%s" fmt "%s%s", \ + btf_show_indent(show), \ + btf_show_name(show), \ + value, btf_show_delim(show), \ + btf_show_newline(show)); \ + if (show->state.depth > show->state.depth_to_show) \ + show->state.depth_to_show = show->state.depth; \ + } \ + } while (0) + +#define btf_show_type_values(show, fmt, ...) \ + do { \ + btf_show(show, "%s%s" fmt "%s%s", btf_show_indent(show), \ + btf_show_name(show), \ + __VA_ARGS__, btf_show_delim(show), \ + btf_show_newline(show)); \ + if (show->state.depth > show->state.depth_to_show) \ + show->state.depth_to_show = show->state.depth; \ + } while (0) + +/* How much is left to copy to safe buffer after @data? */ +static int btf_show_obj_size_left(struct btf_show *show, void *data) +{ + return show->obj.head + show->obj.size - data; +} + +/* Is object pointed to by @data of @size already copied to our safe buffer? */ +static bool btf_show_obj_is_safe(struct btf_show *show, void *data, int size) +{ + return data >= show->obj.data && + (data + size) < (show->obj.data + BTF_SHOW_OBJ_SAFE_SIZE); +} + +/* + * If object pointed to by @data of @size falls within our safe buffer, return + * the equivalent pointer to the same safe data. Assumes + * copy_from_kernel_nofault() has already happened and our safe buffer is + * populated. + */ +static void *__btf_show_obj_safe(struct btf_show *show, void *data, int size) +{ + if (btf_show_obj_is_safe(show, data, size)) + return show->obj.safe + (data - show->obj.data); + return NULL; +} + +/* + * Return a safe-to-access version of data pointed to by @data. + * We do this by copying the relevant amount of information + * to the struct btf_show obj.safe buffer using copy_from_kernel_nofault(). + * + * If BTF_SHOW_UNSAFE is specified, just return data as-is; no + * safe copy is needed. + * + * Otherwise we need to determine if we have the required amount + * of data (determined by the @data pointer and the size of the + * largest base type we can encounter (represented by + * BTF_SHOW_OBJ_BASE_TYPE_SIZE). Having that much data ensures + * that we will be able to print some of the current object, + * and if more is needed a copy will be triggered. + * Some objects such as structs will not fit into the buffer; + * in such cases additional copies when we iterate over their + * members may be needed. + * + * btf_show_obj_safe() is used to return a safe buffer for + * btf_show_start_type(); this ensures that as we recurse into + * nested types we always have safe data for the given type. + * This approach is somewhat wasteful; it's possible for example + * that when iterating over a large union we'll end up copying the + * same data repeatedly, but the goal is safety not performance. + * We use stack data as opposed to per-CPU buffers because the + * iteration over a type can take some time, and preemption handling + * would greatly complicate use of the safe buffer. + */ +static void *btf_show_obj_safe(struct btf_show *show, + const struct btf_type *t, + void *data) +{ + const struct btf_type *rt; + int size_left, size; + void *safe = NULL; + + if (show->flags & BTF_SHOW_UNSAFE) + return data; + + rt = btf_resolve_size(show->btf, t, &size); + if (IS_ERR(rt)) { + show->state.status = PTR_ERR(rt); + return NULL; + } + + /* + * Is this toplevel object? If so, set total object size and + * initialize pointers. Otherwise check if we still fall within + * our safe object data. + */ + if (show->state.depth == 0) { + show->obj.size = size; + show->obj.head = data; + } else { + /* + * If the size of the current object is > our remaining + * safe buffer we _may_ need to do a new copy. However + * consider the case of a nested struct; it's size pushes + * us over the safe buffer limit, but showing any individual + * struct members does not. In such cases, we don't need + * to initiate a fresh copy yet; however we definitely need + * at least BTF_SHOW_OBJ_BASE_TYPE_SIZE bytes left + * in our buffer, regardless of the current object size. + * The logic here is that as we resolve types we will + * hit a base type at some point, and we need to be sure + * the next chunk of data is safely available to display + * that type info safely. We cannot rely on the size of + * the current object here because it may be much larger + * than our current buffer (e.g. task_struct is 8k). + * All we want to do here is ensure that we can print the + * next basic type, which we can if either + * - the current type size is within the safe buffer; or + * - at least BTF_SHOW_OBJ_BASE_TYPE_SIZE bytes are left in + * the safe buffer. + */ + safe = __btf_show_obj_safe(show, data, + min(size, + BTF_SHOW_OBJ_BASE_TYPE_SIZE)); + } + + /* + * We need a new copy to our safe object, either because we haven't + * yet copied and are intializing safe data, or because the data + * we want falls outside the boundaries of the safe object. + */ + if (!safe) { + size_left = btf_show_obj_size_left(show, data); + if (size_left > BTF_SHOW_OBJ_SAFE_SIZE) + size_left = BTF_SHOW_OBJ_SAFE_SIZE; + show->state.status = copy_from_kernel_nofault(show->obj.safe, + data, size_left); + if (!show->state.status) { + show->obj.data = data; + safe = show->obj.safe; + } + } + + return safe; +} + +/* + * Set the type we are starting to show and return a safe data pointer + * to be used for showing the associated data. + */ +static void *btf_show_start_type(struct btf_show *show, + const struct btf_type *t, + u32 type_id, void *data) +{ + show->state.type = t; + show->state.type_id = type_id; + show->state.name[0] = '\0'; + + return btf_show_obj_safe(show, t, data); +} + +static void btf_show_end_type(struct btf_show *show) +{ + show->state.type = NULL; + show->state.type_id = 0; + show->state.name[0] = '\0'; +} + +static void *btf_show_start_aggr_type(struct btf_show *show, + const struct btf_type *t, + u32 type_id, void *data) +{ + void *safe_data = btf_show_start_type(show, t, type_id, data); + + if (!safe_data) + return safe_data; + + btf_show(show, "%s%s%s", btf_show_indent(show), + btf_show_name(show), + btf_show_newline(show)); + show->state.depth++; + return safe_data; +} + +static void btf_show_end_aggr_type(struct btf_show *show, + const char *suffix) +{ + show->state.depth--; + btf_show(show, "%s%s%s%s", btf_show_indent(show), suffix, + btf_show_delim(show), btf_show_newline(show)); + btf_show_end_type(show); +} + +static void btf_show_start_member(struct btf_show *show, + const struct btf_member *m) +{ + show->state.member = m; +} + +static void btf_show_start_array_member(struct btf_show *show) +{ + show->state.array_member = 1; + btf_show_start_member(show, NULL); +} + +static void btf_show_end_member(struct btf_show *show) +{ + show->state.member = NULL; +} + +static void btf_show_end_array_member(struct btf_show *show) +{ + show->state.array_member = 0; + btf_show_end_member(show); +} + +static void *btf_show_start_array_type(struct btf_show *show, + const struct btf_type *t, + u32 type_id, + u16 array_encoding, + void *data) +{ + show->state.array_encoding = array_encoding; + show->state.array_terminated = 0; + return btf_show_start_aggr_type(show, t, type_id, data); +} + +static void btf_show_end_array_type(struct btf_show *show) +{ + show->state.array_encoding = 0; + show->state.array_terminated = 0; + btf_show_end_aggr_type(show, "]"); +} + +static void *btf_show_start_struct_type(struct btf_show *show, + const struct btf_type *t, + u32 type_id, + void *data) +{ + return btf_show_start_aggr_type(show, t, type_id, data); +} + +static void btf_show_end_struct_type(struct btf_show *show) +{ + btf_show_end_aggr_type(show, "}"); +} + __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, const char *fmt, ...) { @@ -1079,23 +1623,27 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) * *type_size: (x * y * sizeof(u32)). Hence, *type_size always * corresponds to the return type. * *elem_type: u32 + * *elem_id: id of u32 * *total_nelems: (x * y). Hence, individual elem size is * (*type_size / *total_nelems) + * *type_id: id of type if it's changed within the function, 0 if not * * type: is not an array (e.g. const struct X) * return type: type "struct X" * *type_size: sizeof(struct X) * *elem_type: same as return type ("struct X") + * *elem_id: 0 * *total_nelems: 1 + * *type_id: id of type if it's changed within the function, 0 if not */ -const struct btf_type * -btf_resolve_size(const struct btf *btf, const struct btf_type *type, - u32 *type_size, const struct btf_type **elem_type, - u32 *total_nelems) +static const struct btf_type * +__btf_resolve_size(const struct btf *btf, const struct btf_type *type, + u32 *type_size, const struct btf_type **elem_type, + u32 *elem_id, u32 *total_nelems, u32 *type_id) { const struct btf_type *array_type = NULL; - const struct btf_array *array; - u32 i, size, nelems = 1; + const struct btf_array *array = NULL; + u32 i, size, nelems = 1, id = 0; for (i = 0; i < MAX_RESOLVE_DEPTH; i++) { switch (BTF_INFO_KIND(type->info)) { @@ -1116,6 +1664,7 @@ btf_resolve_size(const struct btf *btf, const struct btf_type *type, case BTF_KIND_VOLATILE: case BTF_KIND_CONST: case BTF_KIND_RESTRICT: + id = type->type; type = btf_type_by_id(btf, type->type); break; @@ -1146,10 +1695,21 @@ resolved: *total_nelems = nelems; if (elem_type) *elem_type = type; + if (elem_id) + *elem_id = array ? array->type : 0; + if (type_id && id) + *type_id = id; return array_type ? : type; } +const struct btf_type * +btf_resolve_size(const struct btf *btf, const struct btf_type *type, + u32 *type_size) +{ + return __btf_resolve_size(btf, type, type_size, NULL, NULL, NULL, NULL); +} + /* The input param "type_id" must point to a needs_resolve type */ static const struct btf_type *btf_type_id_resolve(const struct btf *btf, u32 *type_id) @@ -1250,11 +1810,11 @@ static int btf_df_resolve(struct btf_verifier_env *env, return -EINVAL; } -static void btf_df_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offsets, - struct seq_file *m) +static void btf_df_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offsets, + struct btf_show *show) { - seq_printf(m, "<unsupported kind:%u>", BTF_INFO_KIND(t->info)); + btf_show(show, "<unsupported kind:%u>", BTF_INFO_KIND(t->info)); } static int btf_int_check_member(struct btf_verifier_env *env, @@ -1427,7 +1987,7 @@ static void btf_int_log(struct btf_verifier_env *env, btf_int_encoding_str(BTF_INT_ENCODING(int_data))); } -static void btf_int128_print(struct seq_file *m, void *data) +static void btf_int128_print(struct btf_show *show, void *data) { /* data points to a __int128 number. * Suppose @@ -1446,9 +2006,10 @@ static void btf_int128_print(struct seq_file *m, void *data) lower_num = *(u64 *)data; #endif if (upper_num == 0) - seq_printf(m, "0x%llx", lower_num); + btf_show_type_value(show, "0x%llx", lower_num); else - seq_printf(m, "0x%llx%016llx", upper_num, lower_num); + btf_show_type_values(show, "0x%llx%016llx", upper_num, + lower_num); } static void btf_int128_shift(u64 *print_num, u16 left_shift_bits, @@ -1492,8 +2053,8 @@ static void btf_int128_shift(u64 *print_num, u16 left_shift_bits, #endif } -static void btf_bitfield_seq_show(void *data, u8 bits_offset, - u8 nr_bits, struct seq_file *m) +static void btf_bitfield_show(void *data, u8 bits_offset, + u8 nr_bits, struct btf_show *show) { u16 left_shift_bits, right_shift_bits; u8 nr_copy_bytes; @@ -1513,14 +2074,14 @@ static void btf_bitfield_seq_show(void *data, u8 bits_offset, right_shift_bits = BITS_PER_U128 - nr_bits; btf_int128_shift(print_num, left_shift_bits, right_shift_bits); - btf_int128_print(m, print_num); + btf_int128_print(show, print_num); } -static void btf_int_bits_seq_show(const struct btf *btf, - const struct btf_type *t, - void *data, u8 bits_offset, - struct seq_file *m) +static void btf_int_bits_show(const struct btf *btf, + const struct btf_type *t, + void *data, u8 bits_offset, + struct btf_show *show) { u32 int_data = btf_type_int(t); u8 nr_bits = BTF_INT_BITS(int_data); @@ -1533,55 +2094,77 @@ static void btf_int_bits_seq_show(const struct btf *btf, total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); data += BITS_ROUNDDOWN_BYTES(total_bits_offset); bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); - btf_bitfield_seq_show(data, bits_offset, nr_bits, m); + btf_bitfield_show(data, bits_offset, nr_bits, show); } -static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offset, - struct seq_file *m) +static void btf_int_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) { u32 int_data = btf_type_int(t); u8 encoding = BTF_INT_ENCODING(int_data); bool sign = encoding & BTF_INT_SIGNED; u8 nr_bits = BTF_INT_BITS(int_data); + void *safe_data; + + safe_data = btf_show_start_type(show, t, type_id, data); + if (!safe_data) + return; if (bits_offset || BTF_INT_OFFSET(int_data) || BITS_PER_BYTE_MASKED(nr_bits)) { - btf_int_bits_seq_show(btf, t, data, bits_offset, m); - return; + btf_int_bits_show(btf, t, safe_data, bits_offset, show); + goto out; } switch (nr_bits) { case 128: - btf_int128_print(m, data); + btf_int128_print(show, safe_data); break; case 64: if (sign) - seq_printf(m, "%lld", *(s64 *)data); + btf_show_type_value(show, "%lld", *(s64 *)safe_data); else - seq_printf(m, "%llu", *(u64 *)data); + btf_show_type_value(show, "%llu", *(u64 *)safe_data); break; case 32: if (sign) - seq_printf(m, "%d", *(s32 *)data); + btf_show_type_value(show, "%d", *(s32 *)safe_data); else - seq_printf(m, "%u", *(u32 *)data); + btf_show_type_value(show, "%u", *(u32 *)safe_data); break; case 16: if (sign) - seq_printf(m, "%d", *(s16 *)data); + btf_show_type_value(show, "%d", *(s16 *)safe_data); else - seq_printf(m, "%u", *(u16 *)data); + btf_show_type_value(show, "%u", *(u16 *)safe_data); break; case 8: + if (show->state.array_encoding == BTF_INT_CHAR) { + /* check for null terminator */ + if (show->state.array_terminated) + break; + if (*(char *)data == '\0') { + show->state.array_terminated = 1; + break; + } + if (isprint(*(char *)data)) { + btf_show_type_value(show, "'%c'", + *(char *)safe_data); + break; + } + } if (sign) - seq_printf(m, "%d", *(s8 *)data); + btf_show_type_value(show, "%d", *(s8 *)safe_data); else - seq_printf(m, "%u", *(u8 *)data); + btf_show_type_value(show, "%u", *(u8 *)safe_data); break; default: - btf_int_bits_seq_show(btf, t, data, bits_offset, m); + btf_int_bits_show(btf, t, safe_data, bits_offset, show); + break; } +out: + btf_show_end_type(show); } static const struct btf_kind_operations int_ops = { @@ -1590,7 +2173,7 @@ static const struct btf_kind_operations int_ops = { .check_member = btf_int_check_member, .check_kflag_member = btf_int_check_kflag_member, .log_details = btf_int_log, - .seq_show = btf_int_seq_show, + .show = btf_int_show, }; static int btf_modifier_check_member(struct btf_verifier_env *env, @@ -1854,34 +2437,44 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, return 0; } -static void btf_modifier_seq_show(const struct btf *btf, - const struct btf_type *t, - u32 type_id, void *data, - u8 bits_offset, struct seq_file *m) +static void btf_modifier_show(const struct btf *btf, + const struct btf_type *t, + u32 type_id, void *data, + u8 bits_offset, struct btf_show *show) { if (btf->resolved_ids) t = btf_type_id_resolve(btf, &type_id); else t = btf_type_skip_modifiers(btf, type_id, NULL); - btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); + btf_type_ops(t)->show(btf, t, type_id, data, bits_offset, show); } -static void btf_var_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offset, - struct seq_file *m) +static void btf_var_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) { t = btf_type_id_resolve(btf, &type_id); - btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); + btf_type_ops(t)->show(btf, t, type_id, data, bits_offset, show); } -static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offset, - struct seq_file *m) +static void btf_ptr_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) { - /* It is a hashed value */ - seq_printf(m, "%p", *(void **)data); + void *safe_data; + + safe_data = btf_show_start_type(show, t, type_id, data); + if (!safe_data) + return; + + /* It is a hashed value unless BTF_SHOW_PTR_RAW is specified */ + if (show->flags & BTF_SHOW_PTR_RAW) + btf_show_type_value(show, "0x%px", *(void **)safe_data); + else + btf_show_type_value(show, "0x%p", *(void **)safe_data); + btf_show_end_type(show); } static void btf_ref_type_log(struct btf_verifier_env *env, @@ -1896,7 +2489,7 @@ static struct btf_kind_operations modifier_ops = { .check_member = btf_modifier_check_member, .check_kflag_member = btf_modifier_check_kflag_member, .log_details = btf_ref_type_log, - .seq_show = btf_modifier_seq_show, + .show = btf_modifier_show, }; static struct btf_kind_operations ptr_ops = { @@ -1905,7 +2498,7 @@ static struct btf_kind_operations ptr_ops = { .check_member = btf_ptr_check_member, .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_ref_type_log, - .seq_show = btf_ptr_seq_show, + .show = btf_ptr_show, }; static s32 btf_fwd_check_meta(struct btf_verifier_env *env, @@ -1946,7 +2539,7 @@ static struct btf_kind_operations fwd_ops = { .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_fwd_type_log, - .seq_show = btf_df_seq_show, + .show = btf_df_show, }; static int btf_array_check_member(struct btf_verifier_env *env, @@ -2105,28 +2698,90 @@ static void btf_array_log(struct btf_verifier_env *env, array->type, array->index_type, array->nelems); } -static void btf_array_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offset, - struct seq_file *m) +static void __btf_array_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) { const struct btf_array *array = btf_type_array(t); const struct btf_kind_operations *elem_ops; const struct btf_type *elem_type; - u32 i, elem_size, elem_type_id; + u32 i, elem_size = 0, elem_type_id; + u16 encoding = 0; elem_type_id = array->type; - elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); + elem_type = btf_type_skip_modifiers(btf, elem_type_id, NULL); + if (elem_type && btf_type_has_size(elem_type)) + elem_size = elem_type->size; + + if (elem_type && btf_type_is_int(elem_type)) { + u32 int_type = btf_type_int(elem_type); + + encoding = BTF_INT_ENCODING(int_type); + + /* + * BTF_INT_CHAR encoding never seems to be set for + * char arrays, so if size is 1 and element is + * printable as a char, we'll do that. + */ + if (elem_size == 1) + encoding = BTF_INT_CHAR; + } + + if (!btf_show_start_array_type(show, t, type_id, encoding, data)) + return; + + if (!elem_type) + goto out; elem_ops = btf_type_ops(elem_type); - seq_puts(m, "["); + for (i = 0; i < array->nelems; i++) { - if (i) - seq_puts(m, ","); - elem_ops->seq_show(btf, elem_type, elem_type_id, data, - bits_offset, m); + btf_show_start_array_member(show); + + elem_ops->show(btf, elem_type, elem_type_id, data, + bits_offset, show); data += elem_size; + + btf_show_end_array_member(show); + + if (show->state.array_terminated) + break; + } +out: + btf_show_end_array_type(show); +} + +static void btf_array_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) +{ + const struct btf_member *m = show->state.member; + + /* + * First check if any members would be shown (are non-zero). + * See comments above "struct btf_show" definition for more + * details on how this works at a high-level. + */ + if (show->state.depth > 0 && !(show->flags & BTF_SHOW_ZERO)) { + if (!show->state.depth_check) { + show->state.depth_check = show->state.depth + 1; + show->state.depth_to_show = 0; + } + __btf_array_show(btf, t, type_id, data, bits_offset, show); + show->state.member = m; + + if (show->state.depth_check != show->state.depth + 1) + return; + show->state.depth_check = 0; + + if (show->state.depth_to_show <= show->state.depth) + return; + /* + * Reaching here indicates we have recursed and found + * non-zero array member(s). + */ } - seq_puts(m, "]"); + __btf_array_show(btf, t, type_id, data, bits_offset, show); } static struct btf_kind_operations array_ops = { @@ -2135,7 +2790,7 @@ static struct btf_kind_operations array_ops = { .check_member = btf_array_check_member, .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_array_log, - .seq_show = btf_array_seq_show, + .show = btf_array_show, }; static int btf_struct_check_member(struct btf_verifier_env *env, @@ -2358,15 +3013,18 @@ int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) return off; } -static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offset, - struct seq_file *m) +static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) { - const char *seq = BTF_INFO_KIND(t->info) == BTF_KIND_UNION ? "|" : ","; const struct btf_member *member; + void *safe_data; u32 i; - seq_puts(m, "{"); + safe_data = btf_show_start_struct_type(show, t, type_id, data); + if (!safe_data) + return; + for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); @@ -2375,23 +3033,65 @@ static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, u32 bytes_offset; u8 bits8_offset; - if (i) - seq_puts(m, seq); + btf_show_start_member(show, member); member_offset = btf_member_bit_offset(t, member); bitfield_size = btf_member_bitfield_size(t, member); bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); bits8_offset = BITS_PER_BYTE_MASKED(member_offset); if (bitfield_size) { - btf_bitfield_seq_show(data + bytes_offset, bits8_offset, - bitfield_size, m); + safe_data = btf_show_start_type(show, member_type, + member->type, + data + bytes_offset); + if (safe_data) + btf_bitfield_show(safe_data, + bits8_offset, + bitfield_size, show); + btf_show_end_type(show); } else { ops = btf_type_ops(member_type); - ops->seq_show(btf, member_type, member->type, - data + bytes_offset, bits8_offset, m); + ops->show(btf, member_type, member->type, + data + bytes_offset, bits8_offset, show); } + + btf_show_end_member(show); } - seq_puts(m, "}"); + + btf_show_end_struct_type(show); +} + +static void btf_struct_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) +{ + const struct btf_member *m = show->state.member; + + /* + * First check if any members would be shown (are non-zero). + * See comments above "struct btf_show" definition for more + * details on how this works at a high-level. + */ + if (show->state.depth > 0 && !(show->flags & BTF_SHOW_ZERO)) { + if (!show->state.depth_check) { + show->state.depth_check = show->state.depth + 1; + show->state.depth_to_show = 0; + } + __btf_struct_show(btf, t, type_id, data, bits_offset, show); + /* Restore saved member data here */ + show->state.member = m; + if (show->state.depth_check != show->state.depth + 1) + return; + show->state.depth_check = 0; + + if (show->state.depth_to_show <= show->state.depth) + return; + /* + * Reaching here indicates we have recursed and found + * non-zero child values. + */ + } + + __btf_struct_show(btf, t, type_id, data, bits_offset, show); } static struct btf_kind_operations struct_ops = { @@ -2400,7 +3100,7 @@ static struct btf_kind_operations struct_ops = { .check_member = btf_struct_check_member, .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_struct_log, - .seq_show = btf_struct_seq_show, + .show = btf_struct_show, }; static int btf_enum_check_member(struct btf_verifier_env *env, @@ -2531,24 +3231,35 @@ static void btf_enum_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } -static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offset, - struct seq_file *m) +static void btf_enum_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) { const struct btf_enum *enums = btf_type_enum(t); u32 i, nr_enums = btf_type_vlen(t); - int v = *(int *)data; + void *safe_data; + int v; + + safe_data = btf_show_start_type(show, t, type_id, data); + if (!safe_data) + return; + + v = *(int *)safe_data; for (i = 0; i < nr_enums; i++) { - if (v == enums[i].val) { - seq_printf(m, "%s", - __btf_name_by_offset(btf, - enums[i].name_off)); - return; - } + if (v != enums[i].val) + continue; + + btf_show_type_value(show, "%s", + __btf_name_by_offset(btf, + enums[i].name_off)); + + btf_show_end_type(show); + return; } - seq_printf(m, "%d", v); + btf_show_type_value(show, "%d", v); + btf_show_end_type(show); } static struct btf_kind_operations enum_ops = { @@ -2557,7 +3268,7 @@ static struct btf_kind_operations enum_ops = { .check_member = btf_enum_check_member, .check_kflag_member = btf_enum_check_kflag_member, .log_details = btf_enum_log, - .seq_show = btf_enum_seq_show, + .show = btf_enum_show, }; static s32 btf_func_proto_check_meta(struct btf_verifier_env *env, @@ -2644,7 +3355,7 @@ static struct btf_kind_operations func_proto_ops = { .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_func_proto_log, - .seq_show = btf_df_seq_show, + .show = btf_df_show, }; static s32 btf_func_check_meta(struct btf_verifier_env *env, @@ -2678,7 +3389,7 @@ static struct btf_kind_operations func_ops = { .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_ref_type_log, - .seq_show = btf_df_seq_show, + .show = btf_df_show, }; static s32 btf_var_check_meta(struct btf_verifier_env *env, @@ -2742,7 +3453,7 @@ static const struct btf_kind_operations var_ops = { .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_var_log, - .seq_show = btf_var_seq_show, + .show = btf_var_show, }; static s32 btf_datasec_check_meta(struct btf_verifier_env *env, @@ -2868,24 +3579,28 @@ static void btf_datasec_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } -static void btf_datasec_seq_show(const struct btf *btf, - const struct btf_type *t, u32 type_id, - void *data, u8 bits_offset, - struct seq_file *m) +static void btf_datasec_show(const struct btf *btf, + const struct btf_type *t, u32 type_id, + void *data, u8 bits_offset, + struct btf_show *show) { const struct btf_var_secinfo *vsi; const struct btf_type *var; u32 i; - seq_printf(m, "section (\"%s\") = {", __btf_name_by_offset(btf, t->name_off)); + if (!btf_show_start_type(show, t, type_id, data)) + return; + + btf_show_type_value(show, "section (\"%s\") = {", + __btf_name_by_offset(btf, t->name_off)); for_each_vsi(i, t, vsi) { var = btf_type_by_id(btf, vsi->type); if (i) - seq_puts(m, ","); - btf_type_ops(var)->seq_show(btf, var, vsi->type, - data + vsi->offset, bits_offset, m); + btf_show(show, ","); + btf_type_ops(var)->show(btf, var, vsi->type, + data + vsi->offset, bits_offset, show); } - seq_puts(m, "}"); + btf_show_end_type(show); } static const struct btf_kind_operations datasec_ops = { @@ -2894,7 +3609,7 @@ static const struct btf_kind_operations datasec_ops = { .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_datasec_log, - .seq_show = btf_datasec_seq_show, + .show = btf_datasec_show, }; static int btf_func_proto_check(struct btf_verifier_env *env, @@ -3688,7 +4403,7 @@ errout: struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) { - struct bpf_prog *tgt_prog = prog->aux->linked_prog; + struct bpf_prog *tgt_prog = prog->aux->dst_prog; if (tgt_prog) { return tgt_prog->aux->btf; @@ -3715,7 +4430,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { const struct btf_type *t = prog->aux->attach_func_proto; - struct bpf_prog *tgt_prog = prog->aux->linked_prog; + struct bpf_prog *tgt_prog = prog->aux->dst_prog; struct btf *btf = bpf_prog_get_target_btf(prog); const char *tname = prog->aux->attach_func_name; struct bpf_verifier_log *log = info->log; @@ -3842,7 +4557,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->reg_type = PTR_TO_BTF_ID; if (tgt_prog) { - ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); + enum bpf_prog_type tgt_type; + + if (tgt_prog->type == BPF_PROG_TYPE_EXT) + tgt_type = tgt_prog->aux->saved_dst_prog_type; + else + tgt_type = tgt_prog->type; + + ret = btf_translate_to_vmlinux(log, btf, t, tgt_type, arg); if (ret > 0) { info->btf_id = ret; return true; @@ -3870,16 +4592,22 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; } -int btf_struct_access(struct bpf_verifier_log *log, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, - u32 *next_btf_id) +enum bpf_struct_walk_result { + /* < 0 error */ + WALK_SCALAR = 0, + WALK_PTR, + WALK_STRUCT, +}; + +static int btf_struct_walk(struct bpf_verifier_log *log, + const struct btf_type *t, int off, int size, + u32 *next_btf_id) { u32 i, moff, mtrue_end, msize = 0, total_nelems = 0; const struct btf_type *mtype, *elem_type = NULL; const struct btf_member *member; const char *tname, *mname; - u32 vlen; + u32 vlen, elem_id, mid; again: tname = __btf_name_by_offset(btf_vmlinux, t->name_off); @@ -3915,14 +4643,13 @@ again: /* Only allow structure for now, can be relaxed for * other types later. */ - elem_type = btf_type_skip_modifiers(btf_vmlinux, - array_elem->type, NULL); - if (!btf_type_is_struct(elem_type)) + t = btf_type_skip_modifiers(btf_vmlinux, array_elem->type, + NULL); + if (!btf_type_is_struct(t)) goto error; - off = (off - moff) % elem_type->size; - return btf_struct_access(log, elem_type, off, size, atype, - next_btf_id); + off = (off - moff) % t->size; + goto again; error: bpf_log(log, "access beyond struct %s at off %u size %u\n", @@ -3951,7 +4678,7 @@ error: */ if (off <= moff && BITS_ROUNDUP_BYTES(end_bit) <= off + size) - return SCALAR_VALUE; + return WALK_SCALAR; /* off may be accessing a following member * @@ -3973,11 +4700,13 @@ error: break; /* type of the field */ + mid = member->type; mtype = btf_type_by_id(btf_vmlinux, member->type); mname = __btf_name_by_offset(btf_vmlinux, member->name_off); - mtype = btf_resolve_size(btf_vmlinux, mtype, &msize, - &elem_type, &total_nelems); + mtype = __btf_resolve_size(btf_vmlinux, mtype, &msize, + &elem_type, &elem_id, &total_nelems, + &mid); if (IS_ERR(mtype)) { bpf_log(log, "field %s doesn't have size\n", mname); return -EFAULT; @@ -3991,7 +4720,7 @@ error: if (btf_type_is_array(mtype)) { u32 elem_idx; - /* btf_resolve_size() above helps to + /* __btf_resolve_size() above helps to * linearize a multi-dimensional array. * * The logic here is treating an array @@ -4039,6 +4768,7 @@ error: elem_idx = (off - moff) / msize; moff += elem_idx * msize; mtype = elem_type; + mid = elem_id; } /* the 'off' we're looking for is either equal to start @@ -4048,6 +4778,12 @@ error: /* our field must be inside that union or struct */ t = mtype; + /* return if the offset matches the member offset */ + if (off == moff) { + *next_btf_id = mid; + return WALK_STRUCT; + } + /* adjust offset we're looking for */ off -= moff; goto again; @@ -4063,11 +4799,10 @@ error: mname, moff, tname, off, size); return -EACCES; } - stype = btf_type_skip_modifiers(btf_vmlinux, mtype->type, &id); if (btf_type_is_struct(stype)) { *next_btf_id = id; - return PTR_TO_BTF_ID; + return WALK_PTR; } } @@ -4084,23 +4819,82 @@ error: return -EACCES; } - return SCALAR_VALUE; + return WALK_SCALAR; } bpf_log(log, "struct %s doesn't have field at offset %d\n", tname, off); return -EINVAL; } -int btf_resolve_helper_id(struct bpf_verifier_log *log, - const struct bpf_func_proto *fn, int arg) +int btf_struct_access(struct bpf_verifier_log *log, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype __maybe_unused, + u32 *next_btf_id) +{ + int err; + u32 id; + + do { + err = btf_struct_walk(log, t, off, size, &id); + + switch (err) { + case WALK_PTR: + /* If we found the pointer or scalar on t+off, + * we're done. + */ + *next_btf_id = id; + return PTR_TO_BTF_ID; + case WALK_SCALAR: + return SCALAR_VALUE; + case WALK_STRUCT: + /* We found nested struct, so continue the search + * by diving in it. At this point the offset is + * aligned with the new type, so set it to 0. + */ + t = btf_type_by_id(btf_vmlinux, id); + off = 0; + break; + default: + /* It's either error or unknown return value.. + * scream and leave. + */ + if (WARN_ONCE(err > 0, "unknown btf_struct_walk return value")) + return -EINVAL; + return err; + } + } while (t); + + return -EINVAL; +} + +bool btf_struct_ids_match(struct bpf_verifier_log *log, + int off, u32 id, u32 need_type_id) { - int id; + const struct btf_type *type; + int err; - if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID || !btf_vmlinux) - return -EINVAL; - id = fn->btf_id[arg]; - if (!id || id > btf_vmlinux->nr_types) - return -EINVAL; - return id; + /* Are we already done? */ + if (need_type_id == id && off == 0) + return true; + +again: + type = btf_type_by_id(btf_vmlinux, id); + if (!type) + return false; + err = btf_struct_walk(log, type, off, 1, &id); + if (err != WALK_STRUCT) + return false; + + /* We found nested struct object. If it matches + * the requested ID, we're done. Otherwise let's + * continue the search with offset 0 in the new + * type. + */ + if (need_type_id != id) { + off = 0; + goto again; + } + + return true; } static int __get_type_size(struct btf *btf, u32 btf_id, @@ -4298,7 +5092,7 @@ static int btf_check_func_type_match(struct bpf_verifier_log *log, } /* Compare BTFs of given program with BTF of target program */ -int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, +int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf2, const struct btf_type *t2) { struct btf *btf1 = prog->aux->btf; @@ -4306,7 +5100,7 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, u32 btf_id = 0; if (!prog->aux->func_info) { - bpf_log(&env->log, "Program extension requires BTF\n"); + bpf_log(log, "Program extension requires BTF\n"); return -EINVAL; } @@ -4318,7 +5112,7 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, if (!t1 || !btf_type_is_func(t1)) return -EFAULT; - return btf_check_func_type_match(&env->log, btf1, t1, btf2, t2); + return btf_check_func_type_match(log, btf1, t1, btf2, t2); } /* Compare BTF of a function with given bpf_reg_state. @@ -4469,7 +5263,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, return -EFAULT; } if (prog_type == BPF_PROG_TYPE_EXT) - prog_type = prog->aux->linked_prog->type; + prog_type = prog->aux->dst_prog->type; t = btf_type_by_id(btf, t->type); if (!t || !btf_type_is_func_proto(t)) { @@ -4516,12 +5310,93 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, return 0; } +static void btf_type_show(const struct btf *btf, u32 type_id, void *obj, + struct btf_show *show) +{ + const struct btf_type *t = btf_type_by_id(btf, type_id); + + show->btf = btf; + memset(&show->state, 0, sizeof(show->state)); + memset(&show->obj, 0, sizeof(show->obj)); + + btf_type_ops(t)->show(btf, t, type_id, obj, 0, show); +} + +static void btf_seq_show(struct btf_show *show, const char *fmt, + va_list args) +{ + seq_vprintf((struct seq_file *)show->target, fmt, args); +} + +int btf_type_seq_show_flags(const struct btf *btf, u32 type_id, + void *obj, struct seq_file *m, u64 flags) +{ + struct btf_show sseq; + + sseq.target = m; + sseq.showfn = btf_seq_show; + sseq.flags = flags; + + btf_type_show(btf, type_id, obj, &sseq); + + return sseq.state.status; +} + void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m) { - const struct btf_type *t = btf_type_by_id(btf, type_id); + (void) btf_type_seq_show_flags(btf, type_id, obj, m, + BTF_SHOW_NONAME | BTF_SHOW_COMPACT | + BTF_SHOW_ZERO | BTF_SHOW_UNSAFE); +} + +struct btf_show_snprintf { + struct btf_show show; + int len_left; /* space left in string */ + int len; /* length we would have written */ +}; + +static void btf_snprintf_show(struct btf_show *show, const char *fmt, + va_list args) +{ + struct btf_show_snprintf *ssnprintf = (struct btf_show_snprintf *)show; + int len; - btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m); + len = vsnprintf(show->target, ssnprintf->len_left, fmt, args); + + if (len < 0) { + ssnprintf->len_left = 0; + ssnprintf->len = len; + } else if (len > ssnprintf->len_left) { + /* no space, drive on to get length we would have written */ + ssnprintf->len_left = 0; + ssnprintf->len += len; + } else { + ssnprintf->len_left -= len; + ssnprintf->len += len; + show->target += len; + } +} + +int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, + char *buf, int len, u64 flags) +{ + struct btf_show_snprintf ssnprintf; + + ssnprintf.show.target = buf; + ssnprintf.show.flags = flags; + ssnprintf.show.showfn = btf_snprintf_show; + ssnprintf.len_left = len; + ssnprintf.len = 0; + + btf_type_show(btf, type_id, obj, (struct btf_show *)&ssnprintf); + + /* If we encontered an error, return it. */ + if (ssnprintf.show.state.status) + return ssnprintf.show.state.status; + + /* Otherwise return length we would have written */ + return ssnprintf.len; } #ifdef CONFIG_PROC_FS @@ -4661,3 +5536,15 @@ u32 btf_id(const struct btf *btf) { return btf->id; } + +static int btf_id_cmp_func(const void *a, const void *b) +{ + const int *pa = a, *pb = b; + + return *pa - *pb; +} + +bool btf_id_set_contains(const struct btf_id_set *set, u32 id) +{ + return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; +} diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 83ff127ef7ae..6ec088a96302 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1226,7 +1226,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { */ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - void **buf, size_t *pcount, loff_t *ppos, + char **buf, size_t *pcount, loff_t *ppos, enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { @@ -1794,7 +1794,7 @@ static bool cg_sockopt_is_valid_access(int off, int size, return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT; case offsetof(struct bpf_sockopt, optname): - /* fallthrough */ + fallthrough; case offsetof(struct bpf_sockopt, level): if (size != size_default) return false; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ed0b3578867c..9268d77898b7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -25,7 +25,7 @@ #include <linux/moduleloader.h> #include <linux/bpf.h> #include <linux/btf.h> -#include <linux/frame.h> +#include <linux/objtool.h> #include <linux/rbtree_latch.h> #include <linux/kallsyms.h> #include <linux/rcupdate.h> @@ -98,6 +98,8 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->jit_requested = ebpf_jit_enabled(); INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); + mutex_init(&fp->aux->used_maps_mutex); + mutex_init(&fp->aux->dst_mutex); return fp; } @@ -253,6 +255,8 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, void __bpf_prog_free(struct bpf_prog *fp) { if (fp->aux) { + mutex_destroy(&fp->aux->used_maps_mutex); + mutex_destroy(&fp->aux->dst_mutex); free_percpu(fp->aux->stats); kfree(fp->aux->poke_tab); kfree(fp->aux); @@ -773,7 +777,8 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, if (size > poke_tab_max) return -ENOSPC; - if (poke->ip || poke->ip_stable || poke->adj_off) + if (poke->tailcall_target || poke->tailcall_target_stable || + poke->tailcall_bypass || poke->adj_off || poke->bypass_addr) return -EINVAL; switch (poke->reason) { @@ -1747,8 +1752,9 @@ bool bpf_prog_array_compatible(struct bpf_array *array, static int bpf_check_tail_call(const struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; - int i; + int i, ret = 0; + mutex_lock(&aux->used_maps_mutex); for (i = 0; i < aux->used_map_cnt; i++) { struct bpf_map *map = aux->used_maps[i]; struct bpf_array *array; @@ -1757,11 +1763,15 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) continue; array = container_of(map, struct bpf_array, map); - if (!bpf_prog_array_compatible(array, fp)) - return -EINVAL; + if (!bpf_prog_array_compatible(array, fp)) { + ret = -EINVAL; + goto out; + } } - return 0; +out: + mutex_unlock(&aux->used_maps_mutex); + return ret; } static void bpf_prog_select_func(struct bpf_prog *fp) @@ -2130,7 +2140,8 @@ static void bpf_prog_free_deferred(struct work_struct *work) if (aux->prog->has_callchain_buf) put_callchain_buffers(); #endif - bpf_trampoline_put(aux->trampoline); + if (aux->dst_trampoline) + bpf_trampoline_put(aux->dst_trampoline); for (i = 0; i < aux->func_cnt; i++) bpf_jit_free(aux->func[i]); if (aux->func_cnt) { @@ -2146,8 +2157,8 @@ void bpf_prog_free(struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; - if (aux->linked_prog) - bpf_prog_put(aux->linked_prog); + if (aux->dst_prog) + bpf_prog_put(aux->dst_prog); INIT_WORK(&aux->work, bpf_prog_free_deferred); schedule_work(&aux->work); } @@ -2208,6 +2219,8 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; +const struct bpf_func_proto bpf_snprintf_btf_proto __weak; +const struct bpf_func_proto bpf_seq_printf_btf_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index f1c46529929b..c61a23b564aa 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -79,8 +79,6 @@ struct bpf_cpu_map { static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list); -static int bq_flush_to_queue(struct xdp_bulk_queue *bq); - static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { u32 value_size = attr->value_size; @@ -157,8 +155,7 @@ static void cpu_map_kthread_stop(struct work_struct *work) kthread_stop(rcpu->kthread); } -static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, - struct xdp_frame *xdpf, +static struct sk_buff *cpu_map_build_skb(struct xdp_frame *xdpf, struct sk_buff *skb) { unsigned int hard_start_headroom; @@ -279,7 +276,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, break; default: bpf_warn_invalid_xdp_action(act); - /* fallthrough */ + fallthrough; case XDP_DROP: xdp_return_frame(xdpf); stats->drop++; @@ -367,7 +364,7 @@ static int cpu_map_kthread_run(void *data) struct sk_buff *skb = skbs[i]; int ret; - skb = cpu_map_build_skb(rcpu, xdpf, skb); + skb = cpu_map_build_skb(xdpf, skb); if (!skb) { xdp_return_frame(xdpf); continue; @@ -658,6 +655,7 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) static int cpu_map_btf_id; const struct bpf_map_ops cpu_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = cpu_map_alloc, .map_free = cpu_map_free, .map_delete_elem = cpu_map_delete_elem, @@ -669,7 +667,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_btf_id = &cpu_map_btf_id, }; -static int bq_flush_to_queue(struct xdp_bulk_queue *bq) +static void bq_flush_to_queue(struct xdp_bulk_queue *bq) { struct bpf_cpu_map_entry *rcpu = bq->obj; unsigned int processed = 0, drops = 0; @@ -678,7 +676,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq) int i; if (unlikely(!bq->count)) - return 0; + return; q = rcpu->queue; spin_lock(&q->producer_lock); @@ -701,13 +699,12 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq) /* Feedback loop via tracepoints */ trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); - return 0; } /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) +static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); @@ -728,8 +725,6 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) if (!bq->flush_node.prev) list_add(&bq->flush_node, flush_list); - - return 0; } int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 10abb06065bb..2b5ca93c17de 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -341,14 +341,14 @@ bool dev_map_can_have_prog(struct bpf_map *map) return false; } -static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) +static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; int sent = 0, drops = 0, err = 0; int i; if (unlikely(!bq->count)) - return 0; + return; for (i = 0; i < bq->count; i++) { struct xdp_frame *xdpf = bq->q[i]; @@ -369,7 +369,7 @@ out: trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err); bq->dev_rx = NULL; __list_del_clearprev(&bq->flush_node); - return 0; + return; error: /* If ndo_xdp_xmit fails with an errno, no frames have been * xmit'ed and it's our responsibility to them free all. @@ -421,8 +421,8 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, - struct net_device *dev_rx) +static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, + struct net_device *dev_rx) { struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); @@ -441,8 +441,6 @@ static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, if (!bq->flush_node.prev) list_add(&bq->flush_node, flush_list); - - return 0; } static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, @@ -462,7 +460,8 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, if (unlikely(!xdpf)) return -EOVERFLOW; - return bq_enqueue(dev, xdpf, dev_rx); + bq_enqueue(dev, xdpf, dev_rx); + return 0; } static struct xdp_buff *dev_map_run_prog(struct net_device *dev, @@ -751,6 +750,7 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, static int dev_map_btf_id; const struct bpf_map_ops dev_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = dev_map_alloc, .map_free = dev_map_free, .map_get_next_key = dev_map_get_next_key, @@ -764,6 +764,7 @@ const struct bpf_map_ops dev_map_ops = { static int dev_map_hash_map_btf_id; const struct bpf_map_ops dev_map_hash_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = dev_map_alloc, .map_free = dev_map_free, .map_get_next_key = dev_map_hash_get_next_key, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 78dfff6a501b..1815e97d4c9c 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -9,6 +9,7 @@ #include <linux/rculist_nulls.h> #include <linux/random.h> #include <uapi/linux/btf.h> +#include <linux/rcupdate_trace.h> #include "percpu_freelist.h" #include "bpf_lru_list.h" #include "map_in_map.h" @@ -577,8 +578,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) struct htab_elem *l; u32 hash, key_size; - /* Must be called with rcu_read_lock. */ - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -612,7 +612,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) * bpf_prog * __htab_map_lookup_elem */ -static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; @@ -651,7 +651,7 @@ static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) return __htab_lru_map_lookup_elem(map, key, false); } -static u32 htab_lru_map_gen_lookup(struct bpf_map *map, +static int htab_lru_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; @@ -941,7 +941,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1032,7 +1032,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1220,7 +1220,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret = -ENOENT; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1252,7 +1252,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret = -ENOENT; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1622,7 +1622,6 @@ struct bpf_iter_seq_hash_map_info { struct bpf_map *map; struct bpf_htab *htab; void *percpu_value_buf; // non-zero means percpu hash - unsigned long flags; u32 bucket_id; u32 skip_elems; }; @@ -1632,7 +1631,6 @@ bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, struct htab_elem *prev_elem) { const struct bpf_htab *htab = info->htab; - unsigned long flags = info->flags; u32 skip_elems = info->skip_elems; u32 bucket_id = info->bucket_id; struct hlist_nulls_head *head; @@ -1656,19 +1654,18 @@ bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, /* not found, unlock and go to the next bucket */ b = &htab->buckets[bucket_id++]; - htab_unlock_bucket(htab, b, flags); + rcu_read_unlock(); skip_elems = 0; } for (i = bucket_id; i < htab->n_buckets; i++) { b = &htab->buckets[i]; - flags = htab_lock_bucket(htab, b); + rcu_read_lock(); count = 0; head = &b->head; hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { if (count >= skip_elems) { - info->flags = flags; info->bucket_id = i; info->skip_elems = count; return elem; @@ -1676,7 +1673,7 @@ bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, count++; } - htab_unlock_bucket(htab, b, flags); + rcu_read_unlock(); skip_elems = 0; } @@ -1754,14 +1751,10 @@ static int bpf_hash_map_seq_show(struct seq_file *seq, void *v) static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v) { - struct bpf_iter_seq_hash_map_info *info = seq->private; - if (!v) (void)__bpf_hash_map_seq_show(seq, NULL); else - htab_unlock_bucket(info->htab, - &info->htab->buckets[info->bucket_id], - info->flags); + rcu_read_unlock(); } static int bpf_iter_init_hash_map(void *priv_data, @@ -1810,6 +1803,7 @@ static const struct bpf_iter_seq_info iter_seq_info = { static int htab_map_btf_id; const struct bpf_map_ops htab_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, @@ -1827,6 +1821,7 @@ const struct bpf_map_ops htab_map_ops = { static int htab_lru_map_btf_id; const struct bpf_map_ops htab_lru_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, @@ -1947,6 +1942,7 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, static int htab_percpu_map_btf_id; const struct bpf_map_ops htab_percpu_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, @@ -1963,6 +1959,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { static int htab_lru_percpu_map_btf_id; const struct bpf_map_ops htab_lru_percpu_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, @@ -2073,7 +2070,7 @@ static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) return READ_ONCE(*inner_map); } -static u32 htab_of_map_gen_lookup(struct bpf_map *map, +static int htab_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index be43ab3e619f..25520f5eeaf6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -601,6 +601,56 @@ const struct bpf_func_proto bpf_event_output_data_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size, + const void __user *, user_ptr) +{ + int ret = copy_from_user(dst, user_ptr, size); + + if (unlikely(ret)) { + memset(dst, 0, size); + ret = -EFAULT; + } + + return ret; +} + +const struct bpf_func_proto bpf_copy_from_user_proto = { + .func = bpf_copy_from_user, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) +{ + if (cpu >= nr_cpu_ids) + return (unsigned long)NULL; + + return (unsigned long)per_cpu_ptr((const void __percpu *)ptr, cpu); +} + +const struct bpf_func_proto bpf_per_cpu_ptr_proto = { + .func = bpf_per_cpu_ptr, + .gpl_only = false, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) +{ + return (unsigned long)this_cpu_ptr((const void __percpu *)percpu_ptr); +} + +const struct bpf_func_proto bpf_this_cpu_ptr_proto = { + .func = bpf_this_cpu_ptr, + .gpl_only = false, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID, + .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; @@ -661,8 +711,16 @@ bpf_base_func_proto(enum bpf_func_id func_id) if (!perfmon_capable()) return NULL; return bpf_get_trace_printk_proto(); + case BPF_FUNC_snprintf_btf: + if (!perfmon_capable()) + return NULL; + return &bpf_snprintf_btf_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; + case BPF_FUNC_bpf_per_cpu_ptr: + return &bpf_per_cpu_ptr_proto; + case BPF_FUNC_bpf_this_cpu_ptr: + return &bpf_this_cpu_ptr_proto; default: break; } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index fb878ba3f22f..dd4b7fd60ee7 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -20,6 +20,7 @@ #include <linux/filter.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> +#include "preload/bpf_preload.h" enum bpf_type { BPF_TYPE_UNSPEC = 0, @@ -226,10 +227,12 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) else prev_key = key; + rcu_read_lock(); if (map->ops->map_get_next_key(map, prev_key, key)) { map_iter(m)->done = true; - return NULL; + key = NULL; } + rcu_read_unlock(); return key; } @@ -369,9 +372,10 @@ static struct dentry * bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future - * extensions. + * extensions. That allows popoulate_bpffs() create special files. */ - if (strchr(dentry->d_name.name, '.')) + if ((dir->i_mode & S_IALLUGO) && + strchr(dentry->d_name.name, '.')) return ERR_PTR(-EPERM); return simple_lookup(dir, dentry, flags); @@ -409,6 +413,27 @@ static const struct inode_operations bpf_dir_iops = { .unlink = simple_unlink, }; +/* pin iterator link into bpffs */ +static int bpf_iter_link_pin_kernel(struct dentry *parent, + const char *name, struct bpf_link *link) +{ + umode_t mode = S_IFREG | S_IRUSR; + struct dentry *dentry; + int ret; + + inode_lock(parent->d_inode); + dentry = lookup_one_len(name, parent, strlen(name)); + if (IS_ERR(dentry)) { + inode_unlock(parent->d_inode); + return PTR_ERR(dentry); + } + ret = bpf_mkobj_ops(dentry, mode, link, &bpf_link_iops, + &bpf_iter_fops); + dput(dentry); + inode_unlock(parent->d_inode); + return ret; +} + static int bpf_obj_do_pin(const char __user *pathname, void *raw, enum bpf_type type) { @@ -638,6 +663,91 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; } +struct bpf_preload_ops *bpf_preload_ops; +EXPORT_SYMBOL_GPL(bpf_preload_ops); + +static bool bpf_preload_mod_get(void) +{ + /* If bpf_preload.ko wasn't loaded earlier then load it now. + * When bpf_preload is built into vmlinux the module's __init + * function will populate it. + */ + if (!bpf_preload_ops) { + request_module("bpf_preload"); + if (!bpf_preload_ops) + return false; + } + /* And grab the reference, so the module doesn't disappear while the + * kernel is interacting with the kernel module and its UMD. + */ + if (!try_module_get(bpf_preload_ops->owner)) { + pr_err("bpf_preload module get failed.\n"); + return false; + } + return true; +} + +static void bpf_preload_mod_put(void) +{ + if (bpf_preload_ops) + /* now user can "rmmod bpf_preload" if necessary */ + module_put(bpf_preload_ops->owner); +} + +static DEFINE_MUTEX(bpf_preload_lock); + +static int populate_bpffs(struct dentry *parent) +{ + struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {}; + struct bpf_link *links[BPF_PRELOAD_LINKS] = {}; + int err = 0, i; + + /* grab the mutex to make sure the kernel interactions with bpf_preload + * UMD are serialized + */ + mutex_lock(&bpf_preload_lock); + + /* if bpf_preload.ko wasn't built into vmlinux then load it */ + if (!bpf_preload_mod_get()) + goto out; + + if (!bpf_preload_ops->info.tgid) { + /* preload() will start UMD that will load BPF iterator programs */ + err = bpf_preload_ops->preload(objs); + if (err) + goto out_put; + for (i = 0; i < BPF_PRELOAD_LINKS; i++) { + links[i] = bpf_link_by_id(objs[i].link_id); + if (IS_ERR(links[i])) { + err = PTR_ERR(links[i]); + goto out_put; + } + } + for (i = 0; i < BPF_PRELOAD_LINKS; i++) { + err = bpf_iter_link_pin_kernel(parent, + objs[i].link_name, links[i]); + if (err) + goto out_put; + /* do not unlink successfully pinned links even + * if later link fails to pin + */ + links[i] = NULL; + } + /* finish() will tell UMD process to exit */ + err = bpf_preload_ops->finish(); + if (err) + goto out_put; + } +out_put: + bpf_preload_mod_put(); +out: + mutex_unlock(&bpf_preload_lock); + for (i = 0; i < BPF_PRELOAD_LINKS && err; i++) + if (!IS_ERR_OR_NULL(links[i])) + bpf_link_put(links[i]); + return err; +} + static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr bpf_rfiles[] = { { "" } }; @@ -654,8 +764,8 @@ static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) inode = sb->s_root->d_inode; inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; + populate_bpffs(sb->s_root); inode->i_mode |= S_ISVTX | opts->mode; - return 0; } @@ -705,6 +815,8 @@ static int __init bpf_init(void) { int ret; + mutex_init(&bpf_preload_lock); + ret = sysfs_create_mount_point(fs_kobj, "bpf"); if (ret) return ret; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 44474bf3ab7a..00e32f2ec3e6 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -732,6 +732,7 @@ static int trie_check_btf(const struct bpf_map *map, static int trie_map_btf_id; const struct bpf_map_ops trie_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = trie_alloc, .map_free = trie_free, .map_get_next_key = trie_get_next_key, diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 17738c93bec8..39ab0b68cade 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -17,23 +17,17 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) if (IS_ERR(inner_map)) return inner_map; - /* prog_array->aux->{type,jited} is a runtime binding. - * Doing static check alone in the verifier is not enough. - */ - if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || - inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || - inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE || - inner_map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { - fdput(f); - return ERR_PTR(-ENOTSUPP); - } - /* Does not support >1 level map-in-map */ if (inner_map->inner_map_meta) { fdput(f); return ERR_PTR(-EINVAL); } + if (!inner_map->ops->map_meta_equal) { + fdput(f); + return ERR_PTR(-ENOTSUPP); + } + if (map_value_has_spin_lock(inner_map)) { fdput(f); return ERR_PTR(-ENOTSUPP); @@ -81,15 +75,14 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, return meta0->map_type == meta1->map_type && meta0->key_size == meta1->key_size && meta0->value_size == meta1->value_size && - meta0->map_flags == meta1->map_flags && - meta0->max_entries == meta1->max_entries; + meta0->map_flags == meta1->map_flags; } void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file /* not used */, int ufd) { - struct bpf_map *inner_map; + struct bpf_map *inner_map, *inner_map_meta; struct fd f; f = fdget(ufd); @@ -97,7 +90,8 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, if (IS_ERR(inner_map)) return inner_map; - if (bpf_map_meta_equal(map->inner_map_meta, inner_map)) + inner_map_meta = map->inner_map_meta; + if (inner_map_meta->ops->map_meta_equal(inner_map_meta, inner_map)) bpf_map_inc(inner_map); else inner_map = ERR_PTR(-EINVAL); diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index a507bf6ef8b9..bcb7534afb3c 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h @@ -11,8 +11,6 @@ struct bpf_map; struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); void bpf_map_meta_free(struct bpf_map *map_meta); -bool bpf_map_meta_equal(const struct bpf_map *meta0, - const struct bpf_map *meta1); void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, int ufd); void bpf_map_fd_put_ptr(void *ptr); diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index af86048e5afd..6a9542af4212 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -149,6 +149,19 @@ static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux) bpf_map_put_with_uref(aux->map); } +void bpf_iter_map_show_fdinfo(const struct bpf_iter_aux_info *aux, + struct seq_file *seq) +{ + seq_printf(seq, "map_id:\t%u\n", aux->map->id); +} + +int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux, + struct bpf_link_info *info) +{ + info->iter.map.map_id = aux->map->id; + return 0; +} + DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, struct bpf_map *map, void *key, void *value) @@ -156,6 +169,8 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = { .target = "bpf_map_elem", .attach_target = bpf_iter_attach_map, .detach_target = bpf_iter_detach_map, + .show_fdinfo = bpf_iter_map_show_fdinfo, + .fill_link_info = bpf_iter_map_fill_link_info, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map_elem, key), diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index b367430e611c..3d897de89061 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -17,6 +17,8 @@ int pcpu_freelist_init(struct pcpu_freelist *s) raw_spin_lock_init(&head->lock); head->first = NULL; } + raw_spin_lock_init(&s->extralist.lock); + s->extralist.first = NULL; return 0; } @@ -40,12 +42,50 @@ static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, raw_spin_unlock(&head->lock); } +static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) +{ + if (!raw_spin_trylock(&s->extralist.lock)) + return false; + + pcpu_freelist_push_node(&s->extralist, node); + raw_spin_unlock(&s->extralist.lock); + return true; +} + +static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) +{ + int cpu, orig_cpu; + + orig_cpu = cpu = raw_smp_processor_id(); + while (1) { + struct pcpu_freelist_head *head; + + head = per_cpu_ptr(s->freelist, cpu); + if (raw_spin_trylock(&head->lock)) { + pcpu_freelist_push_node(head, node); + raw_spin_unlock(&head->lock); + return; + } + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = 0; + + /* cannot lock any per cpu lock, try extralist */ + if (cpu == orig_cpu && + pcpu_freelist_try_push_extra(s, node)) + return; + } +} + void __pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { - struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist); - - ___pcpu_freelist_push(head, node); + if (in_nmi()) + ___pcpu_freelist_push_nmi(s, node); + else + ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node); } void pcpu_freelist_push(struct pcpu_freelist *s, @@ -81,7 +121,7 @@ again: } } -struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) +static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; @@ -102,8 +142,59 @@ struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) if (cpu >= nr_cpu_ids) cpu = 0; if (cpu == orig_cpu) - return NULL; + break; + } + + /* per cpu lists are all empty, try extralist */ + raw_spin_lock(&s->extralist.lock); + node = s->extralist.first; + if (node) + s->extralist.first = node->next; + raw_spin_unlock(&s->extralist.lock); + return node; +} + +static struct pcpu_freelist_node * +___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) +{ + struct pcpu_freelist_head *head; + struct pcpu_freelist_node *node; + int orig_cpu, cpu; + + orig_cpu = cpu = raw_smp_processor_id(); + while (1) { + head = per_cpu_ptr(s->freelist, cpu); + if (raw_spin_trylock(&head->lock)) { + node = head->first; + if (node) { + head->first = node->next; + raw_spin_unlock(&head->lock); + return node; + } + raw_spin_unlock(&head->lock); + } + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = 0; + if (cpu == orig_cpu) + break; } + + /* cannot pop from per cpu lists, try extralist */ + if (!raw_spin_trylock(&s->extralist.lock)) + return NULL; + node = s->extralist.first; + if (node) + s->extralist.first = node->next; + raw_spin_unlock(&s->extralist.lock); + return node; +} + +struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) +{ + if (in_nmi()) + return ___pcpu_freelist_pop_nmi(s); + return ___pcpu_freelist_pop(s); } struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h index fbf8a8a28979..3c76553cfe57 100644 --- a/kernel/bpf/percpu_freelist.h +++ b/kernel/bpf/percpu_freelist.h @@ -13,6 +13,7 @@ struct pcpu_freelist_head { struct pcpu_freelist { struct pcpu_freelist_head __percpu *freelist; + struct pcpu_freelist_head extralist; }; struct pcpu_freelist_node { diff --git a/kernel/bpf/preload/.gitignore b/kernel/bpf/preload/.gitignore new file mode 100644 index 000000000000..856a4c5ad0dd --- /dev/null +++ b/kernel/bpf/preload/.gitignore @@ -0,0 +1,4 @@ +/FEATURE-DUMP.libbpf +/bpf_helper_defs.h +/feature +/bpf_preload_umd diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig new file mode 100644 index 000000000000..ace49111d3a3 --- /dev/null +++ b/kernel/bpf/preload/Kconfig @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: GPL-2.0-only +config USERMODE_DRIVER + bool + default n + +menuconfig BPF_PRELOAD + bool "Preload BPF file system with kernel specific program and map iterators" + depends on BPF + # The dependency on !COMPILE_TEST prevents it from being enabled + # in allmodconfig or allyesconfig configurations + depends on !COMPILE_TEST + select USERMODE_DRIVER + help + This builds kernel module with several embedded BPF programs that are + pinned into BPF FS mount point as human readable files that are + useful in debugging and introspection of BPF programs and maps. + +if BPF_PRELOAD +config BPF_PRELOAD_UMD + tristate "bpf_preload kernel module with user mode driver" + depends on CC_CAN_LINK + depends on m || CC_CAN_LINK_STATIC + default m + help + This builds bpf_preload kernel module with embedded user mode driver. +endif diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile new file mode 100644 index 000000000000..23ee310b6eb4 --- /dev/null +++ b/kernel/bpf/preload/Makefile @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: GPL-2.0 + +LIBBPF_SRCS = $(srctree)/tools/lib/bpf/ +LIBBPF_A = $(obj)/libbpf.a +LIBBPF_OUT = $(abspath $(obj)) + +$(LIBBPF_A): + $(Q)$(MAKE) -C $(LIBBPF_SRCS) OUTPUT=$(LIBBPF_OUT)/ $(LIBBPF_OUT)/libbpf.a + +userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \ + -I $(srctree)/tools/lib/ -Wno-unused-result + +userprogs := bpf_preload_umd + +clean-files := $(userprogs) bpf_helper_defs.h FEATURE-DUMP.libbpf staticobjs/ feature/ + +bpf_preload_umd-objs := iterators/iterators.o +bpf_preload_umd-userldlibs := $(LIBBPF_A) -lelf -lz + +$(obj)/bpf_preload_umd: $(LIBBPF_A) + +$(obj)/bpf_preload_umd_blob.o: $(obj)/bpf_preload_umd + +obj-$(CONFIG_BPF_PRELOAD_UMD) += bpf_preload.o +bpf_preload-objs += bpf_preload_kern.o bpf_preload_umd_blob.o diff --git a/kernel/bpf/preload/bpf_preload.h b/kernel/bpf/preload/bpf_preload.h new file mode 100644 index 000000000000..2f9932276f2e --- /dev/null +++ b/kernel/bpf/preload/bpf_preload.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BPF_PRELOAD_H +#define _BPF_PRELOAD_H + +#include <linux/usermode_driver.h> +#include "iterators/bpf_preload_common.h" + +struct bpf_preload_ops { + struct umd_info info; + int (*preload)(struct bpf_preload_info *); + int (*finish)(void); + struct module *owner; +}; +extern struct bpf_preload_ops *bpf_preload_ops; +#define BPF_PRELOAD_LINKS 2 +#endif diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c new file mode 100644 index 000000000000..79c5772465f1 --- /dev/null +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/init.h> +#include <linux/module.h> +#include <linux/pid.h> +#include <linux/fs.h> +#include <linux/sched/signal.h> +#include "bpf_preload.h" + +extern char bpf_preload_umd_start; +extern char bpf_preload_umd_end; + +static int preload(struct bpf_preload_info *obj); +static int finish(void); + +static struct bpf_preload_ops umd_ops = { + .info.driver_name = "bpf_preload", + .preload = preload, + .finish = finish, + .owner = THIS_MODULE, +}; + +static int preload(struct bpf_preload_info *obj) +{ + int magic = BPF_PRELOAD_START; + loff_t pos = 0; + int i, err; + ssize_t n; + + err = fork_usermode_driver(&umd_ops.info); + if (err) + return err; + + /* send the start magic to let UMD proceed with loading BPF progs */ + n = kernel_write(umd_ops.info.pipe_to_umh, + &magic, sizeof(magic), &pos); + if (n != sizeof(magic)) + return -EPIPE; + + /* receive bpf_link IDs and names from UMD */ + pos = 0; + for (i = 0; i < BPF_PRELOAD_LINKS; i++) { + n = kernel_read(umd_ops.info.pipe_from_umh, + &obj[i], sizeof(*obj), &pos); + if (n != sizeof(*obj)) + return -EPIPE; + } + return 0; +} + +static int finish(void) +{ + int magic = BPF_PRELOAD_END; + struct pid *tgid; + loff_t pos = 0; + ssize_t n; + + /* send the last magic to UMD. It will do a normal exit. */ + n = kernel_write(umd_ops.info.pipe_to_umh, + &magic, sizeof(magic), &pos); + if (n != sizeof(magic)) + return -EPIPE; + tgid = umd_ops.info.tgid; + wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); + umd_ops.info.tgid = NULL; + return 0; +} + +static int __init load_umd(void) +{ + int err; + + err = umd_load_blob(&umd_ops.info, &bpf_preload_umd_start, + &bpf_preload_umd_end - &bpf_preload_umd_start); + if (err) + return err; + bpf_preload_ops = &umd_ops; + return err; +} + +static void __exit fini_umd(void) +{ + bpf_preload_ops = NULL; + /* kill UMD in case it's still there due to earlier error */ + kill_pid(umd_ops.info.tgid, SIGKILL, 1); + umd_ops.info.tgid = NULL; + umd_unload_blob(&umd_ops.info); +} +late_initcall(load_umd); +module_exit(fini_umd); +MODULE_LICENSE("GPL"); diff --git a/kernel/bpf/preload/bpf_preload_umd_blob.S b/kernel/bpf/preload/bpf_preload_umd_blob.S new file mode 100644 index 000000000000..f1f40223b5c3 --- /dev/null +++ b/kernel/bpf/preload/bpf_preload_umd_blob.S @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + .section .init.rodata, "a" + .global bpf_preload_umd_start +bpf_preload_umd_start: + .incbin "kernel/bpf/preload/bpf_preload_umd" + .global bpf_preload_umd_end +bpf_preload_umd_end: diff --git a/kernel/bpf/preload/iterators/.gitignore b/kernel/bpf/preload/iterators/.gitignore new file mode 100644 index 000000000000..ffdb70230c8b --- /dev/null +++ b/kernel/bpf/preload/iterators/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +/.output diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile new file mode 100644 index 000000000000..28fa8c1440f4 --- /dev/null +++ b/kernel/bpf/preload/iterators/Makefile @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: GPL-2.0 +OUTPUT := .output +CLANG ?= clang +LLC ?= llc +LLVM_STRIP ?= llvm-strip +DEFAULT_BPFTOOL := $(OUTPUT)/sbin/bpftool +BPFTOOL ?= $(DEFAULT_BPFTOOL) +LIBBPF_SRC := $(abspath ../../../../tools/lib/bpf) +BPFOBJ := $(OUTPUT)/libbpf.a +BPF_INCLUDE := $(OUTPUT) +INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../../../tools/lib) \ + -I$(abspath ../../../../tools/include/uapi) +CFLAGS := -g -Wall + +abs_out := $(abspath $(OUTPUT)) +ifeq ($(V),1) +Q = +msg = +else +Q = @ +msg = @printf ' %-8s %s%s\n' "$(1)" "$(notdir $(2))" "$(if $(3), $(3))"; +MAKEFLAGS += --no-print-directory +submake_extras := feature_display=0 +endif + +.DELETE_ON_ERROR: + +.PHONY: all clean + +all: iterators.skel.h + +clean: + $(call msg,CLEAN) + $(Q)rm -rf $(OUTPUT) iterators + +iterators.skel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL) + $(call msg,GEN-SKEL,$@) + $(Q)$(BPFTOOL) gen skeleton $< > $@ + + +$(OUTPUT)/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT) + $(call msg,BPF,$@) + $(Q)$(CLANG) -g -O2 -target bpf $(INCLUDES) \ + -c $(filter %.c,$^) -o $@ && \ + $(LLVM_STRIP) -g $@ + +$(OUTPUT): + $(call msg,MKDIR,$@) + $(Q)mkdir -p $(OUTPUT) + +$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT) + $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) \ + OUTPUT=$(abspath $(dir $@))/ $(abspath $@) + +$(DEFAULT_BPFTOOL): + $(Q)$(MAKE) $(submake_extras) -C ../../../../tools/bpf/bpftool \ + prefix= OUTPUT=$(abs_out)/ DESTDIR=$(abs_out) install diff --git a/kernel/bpf/preload/iterators/README b/kernel/bpf/preload/iterators/README new file mode 100644 index 000000000000..7fd6d39a9ad2 --- /dev/null +++ b/kernel/bpf/preload/iterators/README @@ -0,0 +1,4 @@ +WARNING: +If you change "iterators.bpf.c" do "make -j" in this directory to rebuild "iterators.skel.h". +Make sure to have clang 10 installed. +See Documentation/bpf/bpf_devel_QA.rst diff --git a/kernel/bpf/preload/iterators/bpf_preload_common.h b/kernel/bpf/preload/iterators/bpf_preload_common.h new file mode 100644 index 000000000000..8464d1a48c05 --- /dev/null +++ b/kernel/bpf/preload/iterators/bpf_preload_common.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BPF_PRELOAD_COMMON_H +#define _BPF_PRELOAD_COMMON_H + +#define BPF_PRELOAD_START 0x5555 +#define BPF_PRELOAD_END 0xAAAA + +struct bpf_preload_info { + char link_name[16]; + int link_id; +}; + +#endif diff --git a/kernel/bpf/preload/iterators/iterators.bpf.c b/kernel/bpf/preload/iterators/iterators.bpf.c new file mode 100644 index 000000000000..52aa7b38e8b8 --- /dev/null +++ b/kernel/bpf/preload/iterators/iterators.bpf.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> + +#pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record) +struct seq_file; +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +}; + +struct bpf_map { + __u32 id; + char name[16]; + __u32 max_entries; +}; + +struct bpf_iter__bpf_map { + struct bpf_iter_meta *meta; + struct bpf_map *map; +}; + +struct btf_type { + __u32 name_off; +}; + +struct btf_header { + __u32 str_len; +}; + +struct btf { + const char *strings; + struct btf_type **types; + struct btf_header hdr; +}; + +struct bpf_prog_aux { + __u32 id; + char name[16]; + const char *attach_func_name; + struct bpf_prog *dst_prog; + struct bpf_func_info *func_info; + struct btf *btf; +}; + +struct bpf_prog { + struct bpf_prog_aux *aux; +}; + +struct bpf_iter__bpf_prog { + struct bpf_iter_meta *meta; + struct bpf_prog *prog; +}; +#pragma clang attribute pop + +static const char *get_name(struct btf *btf, long btf_id, const char *fallback) +{ + struct btf_type **types, *t; + unsigned int name_off; + const char *str; + + if (!btf) + return fallback; + str = btf->strings; + types = btf->types; + bpf_probe_read_kernel(&t, sizeof(t), types + btf_id); + name_off = BPF_CORE_READ(t, name_off); + if (name_off >= btf->hdr.str_len) + return fallback; + return str + name_off; +} + +SEC("iter/bpf_map") +int dump_bpf_map(struct bpf_iter__bpf_map *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + __u64 seq_num = ctx->meta->seq_num; + struct bpf_map *map = ctx->map; + + if (!map) + return 0; + + if (seq_num == 0) + BPF_SEQ_PRINTF(seq, " id name max_entries\n"); + + BPF_SEQ_PRINTF(seq, "%4u %-16s%6d\n", map->id, map->name, map->max_entries); + return 0; +} + +SEC("iter/bpf_prog") +int dump_bpf_prog(struct bpf_iter__bpf_prog *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + __u64 seq_num = ctx->meta->seq_num; + struct bpf_prog *prog = ctx->prog; + struct bpf_prog_aux *aux; + + if (!prog) + return 0; + + aux = prog->aux; + if (seq_num == 0) + BPF_SEQ_PRINTF(seq, " id name attached\n"); + + BPF_SEQ_PRINTF(seq, "%4u %-16s %s %s\n", aux->id, + get_name(aux->btf, aux->func_info[0].type_id, aux->name), + aux->attach_func_name, aux->dst_prog->aux->name); + return 0; +} +char LICENSE[] SEC("license") = "GPL"; diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c new file mode 100644 index 000000000000..b7ff87939172 --- /dev/null +++ b/kernel/bpf/preload/iterators/iterators.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include <argp.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/resource.h> +#include <bpf/libbpf.h> +#include <bpf/bpf.h> +#include <sys/mount.h> +#include "iterators.skel.h" +#include "bpf_preload_common.h" + +int to_kernel = -1; +int from_kernel = 0; + +static int send_link_to_kernel(struct bpf_link *link, const char *link_name) +{ + struct bpf_preload_info obj = {}; + struct bpf_link_info info = {}; + __u32 info_len = sizeof(info); + int err; + + err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &info_len); + if (err) + return err; + obj.link_id = info.id; + if (strlen(link_name) >= sizeof(obj.link_name)) + return -E2BIG; + strcpy(obj.link_name, link_name); + if (write(to_kernel, &obj, sizeof(obj)) != sizeof(obj)) + return -EPIPE; + return 0; +} + +int main(int argc, char **argv) +{ + struct rlimit rlim = { RLIM_INFINITY, RLIM_INFINITY }; + struct iterators_bpf *skel; + int err, magic; + int debug_fd; + + debug_fd = open("/dev/console", O_WRONLY | O_NOCTTY | O_CLOEXEC); + if (debug_fd < 0) + return 1; + to_kernel = dup(1); + close(1); + dup(debug_fd); + /* now stdin and stderr point to /dev/console */ + + read(from_kernel, &magic, sizeof(magic)); + if (magic != BPF_PRELOAD_START) { + printf("bad start magic %d\n", magic); + return 1; + } + setrlimit(RLIMIT_MEMLOCK, &rlim); + /* libbpf opens BPF object and loads it into the kernel */ + skel = iterators_bpf__open_and_load(); + if (!skel) { + /* iterators.skel.h is little endian. + * libbpf doesn't support automatic little->big conversion + * of BPF bytecode yet. + * The program load will fail in such case. + */ + printf("Failed load could be due to wrong endianness\n"); + return 1; + } + err = iterators_bpf__attach(skel); + if (err) + goto cleanup; + + /* send two bpf_link IDs with names to the kernel */ + err = send_link_to_kernel(skel->links.dump_bpf_map, "maps.debug"); + if (err) + goto cleanup; + err = send_link_to_kernel(skel->links.dump_bpf_prog, "progs.debug"); + if (err) + goto cleanup; + + /* The kernel will proceed with pinnging the links in bpffs. + * UMD will wait on read from pipe. + */ + read(from_kernel, &magic, sizeof(magic)); + if (magic != BPF_PRELOAD_END) { + printf("bad final magic %d\n", magic); + err = -EINVAL; + } +cleanup: + iterators_bpf__destroy(skel); + + return err != 0; +} diff --git a/kernel/bpf/preload/iterators/iterators.skel.h b/kernel/bpf/preload/iterators/iterators.skel.h new file mode 100644 index 000000000000..cf9a6a94b3a4 --- /dev/null +++ b/kernel/bpf/preload/iterators/iterators.skel.h @@ -0,0 +1,412 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* THIS FILE IS AUTOGENERATED! */ +#ifndef __ITERATORS_BPF_SKEL_H__ +#define __ITERATORS_BPF_SKEL_H__ + +#include <stdlib.h> +#include <bpf/libbpf.h> + +struct iterators_bpf { + struct bpf_object_skeleton *skeleton; + struct bpf_object *obj; + struct { + struct bpf_map *rodata; + } maps; + struct { + struct bpf_program *dump_bpf_map; + struct bpf_program *dump_bpf_prog; + } progs; + struct { + struct bpf_link *dump_bpf_map; + struct bpf_link *dump_bpf_prog; + } links; + struct iterators_bpf__rodata { + char dump_bpf_map____fmt[35]; + char dump_bpf_map____fmt_1[14]; + char dump_bpf_prog____fmt[32]; + char dump_bpf_prog____fmt_2[17]; + } *rodata; +}; + +static void +iterators_bpf__destroy(struct iterators_bpf *obj) +{ + if (!obj) + return; + if (obj->skeleton) + bpf_object__destroy_skeleton(obj->skeleton); + free(obj); +} + +static inline int +iterators_bpf__create_skeleton(struct iterators_bpf *obj); + +static inline struct iterators_bpf * +iterators_bpf__open_opts(const struct bpf_object_open_opts *opts) +{ + struct iterators_bpf *obj; + + obj = (struct iterators_bpf *)calloc(1, sizeof(*obj)); + if (!obj) + return NULL; + if (iterators_bpf__create_skeleton(obj)) + goto err; + if (bpf_object__open_skeleton(obj->skeleton, opts)) + goto err; + + return obj; +err: + iterators_bpf__destroy(obj); + return NULL; +} + +static inline struct iterators_bpf * +iterators_bpf__open(void) +{ + return iterators_bpf__open_opts(NULL); +} + +static inline int +iterators_bpf__load(struct iterators_bpf *obj) +{ + return bpf_object__load_skeleton(obj->skeleton); +} + +static inline struct iterators_bpf * +iterators_bpf__open_and_load(void) +{ + struct iterators_bpf *obj; + + obj = iterators_bpf__open(); + if (!obj) + return NULL; + if (iterators_bpf__load(obj)) { + iterators_bpf__destroy(obj); + return NULL; + } + return obj; +} + +static inline int +iterators_bpf__attach(struct iterators_bpf *obj) +{ + return bpf_object__attach_skeleton(obj->skeleton); +} + +static inline void +iterators_bpf__detach(struct iterators_bpf *obj) +{ + return bpf_object__detach_skeleton(obj->skeleton); +} + +static inline int +iterators_bpf__create_skeleton(struct iterators_bpf *obj) +{ + struct bpf_object_skeleton *s; + + s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s)); + if (!s) + return -1; + obj->skeleton = s; + + s->sz = sizeof(*s); + s->name = "iterators_bpf"; + s->obj = &obj->obj; + + /* maps */ + s->map_cnt = 1; + s->map_skel_sz = sizeof(*s->maps); + s->maps = (struct bpf_map_skeleton *)calloc(s->map_cnt, s->map_skel_sz); + if (!s->maps) + goto err; + + s->maps[0].name = "iterator.rodata"; + s->maps[0].map = &obj->maps.rodata; + s->maps[0].mmaped = (void **)&obj->rodata; + + /* programs */ + s->prog_cnt = 2; + s->prog_skel_sz = sizeof(*s->progs); + s->progs = (struct bpf_prog_skeleton *)calloc(s->prog_cnt, s->prog_skel_sz); + if (!s->progs) + goto err; + + s->progs[0].name = "dump_bpf_map"; + s->progs[0].prog = &obj->progs.dump_bpf_map; + s->progs[0].link = &obj->links.dump_bpf_map; + + s->progs[1].name = "dump_bpf_prog"; + s->progs[1].prog = &obj->progs.dump_bpf_prog; + s->progs[1].link = &obj->links.dump_bpf_prog; + + s->data_sz = 7176; + s->data = (void *)"\ +\x7f\x45\x4c\x46\x02\x01\x01\0\0\0\0\0\0\0\0\0\x01\0\xf7\0\x01\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\x48\x18\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\x40\0\x0f\0\ +\x0e\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\ +\x1a\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\ +\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x61\x71\0\ +\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\0\0\0\0\0\ +\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\x7b\x1a\xf8\ +\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\ +\0\x18\x02\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x0e\0\0\0\xb7\x05\0\0\x18\ +\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x79\x12\0\0\0\0\ +\0\0\x79\x26\0\0\0\0\0\0\x79\x11\x08\0\0\0\0\0\x15\x01\x3b\0\0\0\0\0\x79\x17\0\ +\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\ +\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\x31\0\0\0\0\0\0\0\0\0\ +\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\x6a\xc8\ +\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\x04\0\0\0\ +\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\x78\x30\0\0\ +\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x61\x11\ +\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\0\0\0\0\0\ +\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\ +\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\ +\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\ +\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\ +\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\ +\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\ +\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\ +\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\xc8\xff\0\0\0\ +\0\x18\x02\0\0\x51\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x11\0\0\0\xb7\x05\0\0\x20\ +\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x20\x20\x69\x64\ +\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\ +\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\ +\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\ +\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\ +\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\x47\x50\x4c\0\x9f\ +\xeb\x01\0\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\x09\x05\0\0\0\0\0\0\0\0\0\ +\x02\x02\0\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\ +\0\0\0\x04\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\ +\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\ +\0\0\0\x20\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xaf\0\0\0\x03\0\0\x04\x18\0\ +\0\0\xbd\0\0\0\x09\0\0\0\0\0\0\0\xc1\0\0\0\x0b\0\0\0\x40\0\0\0\xcc\0\0\0\x0b\0\ +\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xd4\0\0\0\0\0\0\x07\0\0\0\0\xdd\0\0\ +\0\0\0\0\x08\x0c\0\0\0\xe3\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\xa4\x01\0\0\x03\ +\0\0\x04\x18\0\0\0\xac\x01\0\0\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\ +\0\xb4\x01\0\0\x0e\0\0\0\xa0\0\0\0\xc0\x01\0\0\0\0\0\x08\x0f\0\0\0\xc6\x01\0\0\ +\0\0\0\x01\x04\0\0\0\x20\0\0\0\xd3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\ +\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xd8\x01\0\0\0\0\0\x01\x04\ +\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x3c\x02\0\0\x02\0\0\x04\x10\0\0\0\ +\x13\0\0\0\x03\0\0\0\0\0\0\0\x4f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\ +\x18\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x54\x02\0\0\x01\0\ +\0\x0c\x16\0\0\0\xa0\x02\0\0\x01\0\0\x04\x08\0\0\0\xa9\x02\0\0\x19\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\x02\x1a\0\0\0\xfa\x02\0\0\x06\0\0\x04\x38\0\0\0\xac\x01\0\0\ +\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\0\x07\x03\0\0\x1b\0\0\0\xc0\0\ +\0\0\x18\x03\0\0\x15\0\0\0\0\x01\0\0\x21\x03\0\0\x1d\0\0\0\x40\x01\0\0\x2b\x03\ +\0\0\x1e\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\ +\0\0\0\0\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x75\x03\0\0\x02\0\ +\0\x04\x08\0\0\0\x83\x03\0\0\x0e\0\0\0\0\0\0\0\x8c\x03\0\0\x0e\0\0\0\x20\0\0\0\ +\x2b\x03\0\0\x03\0\0\x04\x18\0\0\0\x96\x03\0\0\x1b\0\0\0\0\0\0\0\x9e\x03\0\0\ +\x21\0\0\0\x40\0\0\0\xa4\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\ +\0\0\0\0\0\0\0\0\x02\x24\0\0\0\xa8\x03\0\0\x01\0\0\x04\x04\0\0\0\xb3\x03\0\0\ +\x0e\0\0\0\0\0\0\0\x1c\x04\0\0\x01\0\0\x04\x04\0\0\0\x25\x04\0\0\x0e\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x9b\x04\0\0\0\0\0\ +\x0e\x25\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\ +\xaf\x04\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\ +\x12\0\0\0\x20\0\0\0\xc5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\ +\0\0\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xda\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xf1\x04\0\0\0\0\0\x0e\ +\x2d\0\0\0\x01\0\0\0\xf9\x04\0\0\x04\0\0\x0f\0\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\ +\0\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\ +\0\0\x11\0\0\0\x01\x05\0\0\x01\0\0\x0f\0\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\ +\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\ +\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\ +\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\ +\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x2f\x61\x6c\x72\x75\x61\x2f\x62\x75\x69\x6c\ +\x64\x2f\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\ +\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\ +\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\ +\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\ +\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\ +\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\ +\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\ +\x65\0\x5f\x5f\x75\x36\x34\0\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\x20\x75\x6e\ +\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\ +\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\ +\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\ +\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\ +\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\ +\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\ +\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\ +\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\ +\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\ +\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\ +\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\ +\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\ +\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\ +\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\ +\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\ +\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\ +\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\ +\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\ +\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\ +\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\ +\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\ +\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\ +\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\ +\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\ +\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\ +\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\ +\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\ +\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\ +\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\ +\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\ +\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\ +\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\ +\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\ +\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\ +\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\ +\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\ +\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\ +\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\ +\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\ +\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\ +\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\ +\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\ +\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\ +\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\ +\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\ +\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\ +\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\ +\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ +\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ +\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\ +\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\ +\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\ +\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x9f\ +\xeb\x01\0\x20\0\0\0\0\0\0\0\x24\0\0\0\x24\0\0\0\x44\x02\0\0\x68\x02\0\0\xa4\ +\x01\0\0\x08\0\0\0\x31\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\x62\x02\0\0\x01\0\0\0\ +\0\0\0\0\x17\0\0\0\x10\0\0\0\x31\0\0\0\x09\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\ +\x1e\x40\x01\0\x08\0\0\0\x42\0\0\0\x87\0\0\0\x24\x40\x01\0\x10\0\0\0\x42\0\0\0\ +\xfe\0\0\0\x1d\x48\x01\0\x18\0\0\0\x42\0\0\0\x1f\x01\0\0\x06\x50\x01\0\x20\0\0\ +\0\x42\0\0\0\x2e\x01\0\0\x1d\x44\x01\0\x28\0\0\0\x42\0\0\0\x53\x01\0\0\x06\x5c\ +\x01\0\x38\0\0\0\x42\0\0\0\x66\x01\0\0\x03\x60\x01\0\x70\0\0\0\x42\0\0\0\xec\ +\x01\0\0\x02\x68\x01\0\xf0\0\0\0\x42\0\0\0\x3a\x02\0\0\x01\x70\x01\0\x62\x02\0\ +\0\x1a\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\x1e\x84\x01\0\x08\0\0\0\x42\0\0\0\x87\ +\0\0\0\x24\x84\x01\0\x10\0\0\0\x42\0\0\0\x70\x02\0\0\x1f\x8c\x01\0\x18\0\0\0\ +\x42\0\0\0\x94\x02\0\0\x06\x98\x01\0\x20\0\0\0\x42\0\0\0\xad\x02\0\0\x0e\xa4\ +\x01\0\x28\0\0\0\x42\0\0\0\x2e\x01\0\0\x1d\x88\x01\0\x30\0\0\0\x42\0\0\0\x53\ +\x01\0\0\x06\xa8\x01\0\x40\0\0\0\x42\0\0\0\xbf\x02\0\0\x03\xac\x01\0\x80\0\0\0\ +\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xb8\0\0\0\x42\0\0\0\x6a\x03\0\0\x06\x08\ +\x01\0\xd0\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\x42\0\0\0\xbb\x03\0\0\x0f\ +\x14\x01\0\xe0\0\0\0\x42\0\0\0\xd0\x03\0\0\x2d\x18\x01\0\xf0\0\0\0\x42\0\0\0\ +\x07\x04\0\0\x0d\x10\x01\0\0\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x08\x01\0\0\x42\ +\0\0\0\xd0\x03\0\0\x02\x18\x01\0\x20\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\ +\0\x38\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x40\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\ +\x1c\x01\0\x58\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\0\x60\x01\0\0\x42\0\0\ +\0\x5c\x04\0\0\x1b\x20\x01\0\x68\x01\0\0\x42\0\0\0\x5c\x04\0\0\x06\x20\x01\0\ +\x70\x01\0\0\x42\0\0\0\x7f\x04\0\0\x0d\x28\x01\0\x78\x01\0\0\x42\0\0\0\0\0\0\0\ +\0\0\0\0\x80\x01\0\0\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xf8\x01\0\0\x42\0\0\0\ +\x3a\x02\0\0\x01\xc4\x01\0\x10\0\0\0\x31\0\0\0\x07\0\0\0\0\0\0\0\x02\0\0\0\x3e\ +\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\0\0\xfa\0\ +\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\0\ +\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xfa\0\0\0\0\0\0\0\xa0\0\0\0\x0d\0\0\0\x2a\x01\ +\0\0\0\0\0\0\x62\x02\0\0\x12\0\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\ +\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xfa\0\0\0\0\0\0\0\x20\0\0\0\ +\x18\0\0\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x80\0\0\0\ +\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\xa8\0\0\0\ +\x1a\0\0\0\x62\x03\0\0\0\0\0\0\xb0\0\0\0\x1a\0\0\0\x66\x03\0\0\0\0\0\0\xc0\0\0\ +\0\x1f\0\0\0\x94\x03\0\0\0\0\0\0\xd8\0\0\0\x20\0\0\0\xfa\0\0\0\0\0\0\0\xf0\0\0\ +\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\0\x50\x01\ +\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\x60\x01\0\0\x20\0\0\0\x56\x04\0\0\0\0\0\0\x88\ +\x01\0\0\x1a\0\0\0\x2a\x01\0\0\0\0\0\0\x98\x01\0\0\x1a\0\0\0\x97\x04\0\0\0\0\0\ +\0\xa0\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x91\0\0\0\x04\0\xf1\xff\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xe6\0\0\ +\0\0\0\x02\0\x70\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\0\0\x02\0\xf0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\xdf\0\0\0\0\0\x03\0\x78\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\xd1\0\0\0\0\0\x03\0\x80\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xca\0\0\0\0\0\x03\0\ +\xf8\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x14\0\0\0\x01\0\x04\0\0\0\0\0\0\0\0\0\x23\ +\0\0\0\0\0\0\0\x04\x01\0\0\x01\0\x04\0\x23\0\0\0\0\0\0\0\x0e\0\0\0\0\0\0\0\x28\ +\0\0\0\x01\0\x04\0\x31\0\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\xed\0\0\0\x01\0\x04\0\ +\x51\0\0\0\0\0\0\0\x11\0\0\0\0\0\0\0\0\0\0\0\x03\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\x03\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\ +\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xc2\0\0\0\x11\0\x05\0\0\0\0\0\0\0\0\0\ +\x04\0\0\0\0\0\0\0\x3d\0\0\0\x12\0\x02\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\x5b\ +\0\0\0\x12\0\x03\0\0\0\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\x48\0\0\0\0\0\0\0\x01\0\ +\0\0\x0d\0\0\0\xc8\0\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\x50\0\0\0\0\0\0\0\x01\0\0\ +\0\x0d\0\0\0\xd0\x01\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\xf0\x03\0\0\0\0\0\0\x0a\0\ +\0\0\x0d\0\0\0\xfc\x03\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x08\x04\0\0\0\0\0\0\x0a\ +\0\0\0\x0d\0\0\0\x14\x04\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x2c\x04\0\0\0\0\0\0\0\ +\0\0\0\x0e\0\0\0\x2c\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x3c\0\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x50\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x60\0\0\0\0\0\0\0\0\0\0\0\x0b\0\ +\0\0\x70\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\ +\x90\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xb0\0\ +\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xd0\0\0\0\0\ +\0\0\0\0\0\0\0\x0b\0\0\0\xe8\0\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\0\0\0\0\0\0\0\ +\0\0\0\0\x0c\0\0\0\x08\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x01\0\0\0\0\0\0\0\ +\0\0\0\x0c\0\0\0\x28\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x01\0\0\0\0\0\0\0\0\ +\0\0\x0c\0\0\0\x48\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x01\0\0\0\0\0\0\0\0\0\ +\0\x0c\0\0\0\x68\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x01\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x88\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x98\x01\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\xa8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xb8\x01\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\xc8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xd8\x01\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\xe8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\x01\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x28\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x02\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x48\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x02\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x68\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x02\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x94\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa4\x02\0\0\0\0\0\0\0\0\0\0\ +\x0b\0\0\0\xb4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc4\x02\0\0\0\0\0\0\0\0\0\0\ +\x0b\0\0\0\xd4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xe4\x02\0\0\0\0\0\0\0\0\0\0\ +\x0b\0\0\0\xf4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x0c\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x1c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x2c\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x3c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4c\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x5c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x6c\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x7c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x8c\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x9c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xac\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\xbc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xcc\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\xdc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xec\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\xfc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x0c\x04\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x1c\x04\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4d\x4e\x40\x41\x42\x43\x4c\0\ +\x2e\x74\x65\x78\x74\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\x2e\x65\x78\x74\0\x64\ +\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\ +\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\ +\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x2e\x72\x65\x6c\x69\x74\x65\ +\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\ +\x72\x6f\x67\0\x2e\x72\x65\x6c\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\ +\x67\0\x2e\x6c\x6c\x76\x6d\x5f\x61\x64\x64\x72\x73\x69\x67\0\x6c\x69\x63\x65\ +\x6e\x73\x65\0\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\ +\x2e\x73\x74\x72\x74\x61\x62\0\x2e\x73\x79\x6d\x74\x61\x62\0\x2e\x72\x6f\x64\ +\x61\x74\x61\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\0\x4c\x49\x43\x45\x4e\x53\x45\0\ +\x4c\x42\x42\x31\x5f\x37\0\x4c\x42\x42\x31\x5f\x36\0\x4c\x42\x42\x30\x5f\x34\0\ +\x4c\x42\x42\x31\x5f\x33\0\x4c\x42\x42\x30\x5f\x33\0\x64\x75\x6d\x70\x5f\x62\ +\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x64\x75\x6d\ +\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x01\0\0\ +\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x4e\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\x6d\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\x40\x01\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\xb1\0\0\0\x01\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x48\x03\0\ +\0\0\0\0\0\x62\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\x89\0\0\0\x01\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xaa\x03\0\0\0\0\0\0\x04\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xbd\0\0\0\x01\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xae\x03\0\0\0\0\0\0\x3d\x09\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x01\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x0c\0\0\0\0\0\0\x2c\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa9\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\x18\x11\0\0\0\0\0\0\x98\x01\0\0\0\0\0\0\x0e\0\0\0\x0e\0\0\0\x08\0\0\ +\0\0\0\0\0\x18\0\0\0\0\0\0\0\x4a\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\xb0\x12\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x02\0\0\0\x08\0\0\0\0\0\0\0\ +\x10\0\0\0\0\0\0\0\x69\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd0\x12\ +\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x03\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\ +\0\0\0\0\xb9\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xf0\x12\0\0\0\0\0\ +\0\x50\0\0\0\0\0\0\0\x08\0\0\0\x06\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\ +\x07\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\x13\0\0\0\0\0\0\xe0\ +\x03\0\0\0\0\0\0\x08\0\0\0\x07\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x7b\0\ +\0\0\x03\x4c\xff\x6f\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\0\0\0\0\x07\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa1\0\0\0\x03\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x27\x17\0\0\0\0\0\0\x1a\x01\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"; + + return 0; +err: + bpf_object__destroy_skeleton(s); + return -1; +} + +#endif /* __ITERATORS_BPF_SKEL_H__ */ diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 44184f82916a..0ee2347ba510 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -257,6 +257,7 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, static int queue_map_btf_id; const struct bpf_map_ops queue_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = queue_stack_map_alloc_check, .map_alloc = queue_stack_map_alloc, .map_free = queue_stack_map_free, @@ -273,6 +274,7 @@ const struct bpf_map_ops queue_map_ops = { static int stack_map_btf_id; const struct bpf_map_ops stack_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = queue_stack_map_alloc_check, .map_alloc = queue_stack_map_alloc, .map_free = queue_stack_map_free, diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 90b29c5b1da7..a55cd542f2ce 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -191,7 +191,7 @@ int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, rcu_read_lock(); sk = reuseport_array_lookup_elem(map, key); if (sk) { - *(u64 *)value = sock_gen_cookie(sk); + *(u64 *)value = __sock_gen_cookie(sk); err = 0; } else { err = -ENOENT; @@ -351,6 +351,7 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key, static int reuseport_array_map_btf_id; const struct bpf_map_ops reuseport_array_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = reuseport_array_alloc_check, .map_alloc = reuseport_array_alloc, .map_free = reuseport_array_free, diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 002f8a5c9e51..31cb04a4dd2d 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -287,6 +287,7 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, static int ringbuf_map_btf_id; const struct bpf_map_ops ringbuf_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, .map_mmap = ringbuf_map_mmap, diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 4fd830a62be2..06065fa27124 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -213,11 +213,13 @@ static int stack_map_get_build_id_32(void *page_addr, phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); - for (i = 0; i < ehdr->e_phnum; ++i) - if (phdr[i].p_type == PT_NOTE) - return stack_map_parse_build_id(page_addr, build_id, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz); + for (i = 0; i < ehdr->e_phnum; ++i) { + if (phdr[i].p_type == PT_NOTE && + !stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz)) + return 0; + } return -EINVAL; } @@ -236,11 +238,13 @@ static int stack_map_get_build_id_64(void *page_addr, phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); - for (i = 0; i < ehdr->e_phnum; ++i) - if (phdr[i].p_type == PT_NOTE) - return stack_map_parse_build_id(page_addr, build_id, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz); + for (i = 0; i < ehdr->e_phnum; ++i) { + if (phdr[i].p_type == PT_NOTE && + !stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz)) + return 0; + } return -EINVAL; } @@ -661,18 +665,17 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, return __bpf_get_stack(regs, task, NULL, buf, size, flags); } -BTF_ID_LIST(bpf_get_task_stack_btf_ids) -BTF_ID(struct, task_struct) +BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct) const struct bpf_func_proto bpf_get_task_stack_proto = { .func = bpf_get_task_stack, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_get_task_stack_btf_ids[0], .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, - .btf_id = bpf_get_task_stack_btf_ids, }; BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, @@ -835,6 +838,7 @@ static void stack_map_free(struct bpf_map *map) static int stack_trace_map_btf_id; const struct bpf_map_ops stack_trace_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = stack_map_alloc, .map_free = stack_map_free, .map_get_next_key = stack_map_get_next_key, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 86299a292214..8f50c9c19f1b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4,6 +4,7 @@ #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <linux/bpf_lirc.h> +#include <linux/bpf_verifier.h> #include <linux/btf.h> #include <linux/syscalls.h> #include <linux/slab.h> @@ -29,6 +30,7 @@ #include <linux/bpf_lsm.h> #include <linux/poll.h> #include <linux/bpf-netns.h> +#include <linux/rcupdate_trace.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -90,6 +92,7 @@ int bpf_check_uarg_tail_zero(void __user *uaddr, } const struct bpf_map_ops bpf_map_offload_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = bpf_map_offload_map_alloc, .map_free = bpf_map_offload_map_free, .map_check_btf = map_check_no_btf, @@ -157,10 +160,11 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, if (bpf_map_is_dev_bound(map)) { return bpf_map_offload_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || - map->map_type == BPF_MAP_TYPE_SOCKHASH || - map->map_type == BPF_MAP_TYPE_SOCKMAP || map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { return map->ops->map_update_elem(map, key, value, flags); + } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH || + map->map_type == BPF_MAP_TYPE_SOCKMAP) { + return sock_map_update_elem_sys(map, key, value, flags); } else if (IS_FD_PROG_ARRAY(map)) { return bpf_fd_array_map_update_elem(map, f.file, key, value, flags); @@ -768,7 +772,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && - map->map_type != BPF_MAP_TYPE_SK_STORAGE) + map->map_type != BPF_MAP_TYPE_SK_STORAGE && + map->map_type != BPF_MAP_TYPE_INODE_STORAGE) return -ENOTSUPP; if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > map->value_size) { @@ -1728,10 +1733,14 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) btf_put(prog->aux->btf); bpf_prog_free_linfo(prog); - if (deferred) - call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); - else + if (deferred) { + if (prog->aux->sleepable) + call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu); + else + call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); + } else { __bpf_prog_put_rcu(&prog->aux->rcu); + } } static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) @@ -2029,7 +2038,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_PROG_TYPE_EXT: if (expected_attach_type) return -EINVAL; - /* fallthrough */ + fallthrough; default: return 0; } @@ -2101,6 +2110,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT | BPF_F_TEST_STATE_FREQ | + BPF_F_SLEEPABLE | BPF_F_TEST_RND_HI32)) return -EINVAL; @@ -2145,17 +2155,18 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) prog->expected_attach_type = attr->expected_attach_type; prog->aux->attach_btf_id = attr->attach_btf_id; if (attr->attach_prog_fd) { - struct bpf_prog *tgt_prog; + struct bpf_prog *dst_prog; - tgt_prog = bpf_prog_get(attr->attach_prog_fd); - if (IS_ERR(tgt_prog)) { - err = PTR_ERR(tgt_prog); + dst_prog = bpf_prog_get(attr->attach_prog_fd); + if (IS_ERR(dst_prog)) { + err = PTR_ERR(dst_prog); goto free_prog_nouncharge; } - prog->aux->linked_prog = tgt_prog; + prog->aux->dst_prog = dst_prog; } prog->aux->offload_requested = !!attr->prog_ifindex; + prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; err = security_bpf_prog_alloc(prog->aux); if (err) @@ -2488,11 +2499,23 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) struct bpf_tracing_link { struct bpf_link link; enum bpf_attach_type attach_type; + struct bpf_trampoline *trampoline; + struct bpf_prog *tgt_prog; }; static void bpf_tracing_link_release(struct bpf_link *link) { - WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog)); + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog, + tr_link->trampoline)); + + bpf_trampoline_put(tr_link->trampoline); + + /* tgt_prog is NULL if target is a kernel function */ + if (tr_link->tgt_prog) + bpf_prog_put(tr_link->tgt_prog); } static void bpf_tracing_link_dealloc(struct bpf_link *link) @@ -2532,10 +2555,15 @@ static const struct bpf_link_ops bpf_tracing_link_lops = { .fill_link_info = bpf_tracing_link_fill_link_info, }; -static int bpf_tracing_prog_attach(struct bpf_prog *prog) +static int bpf_tracing_prog_attach(struct bpf_prog *prog, + int tgt_prog_fd, + u32 btf_id) { struct bpf_link_primer link_primer; + struct bpf_prog *tgt_prog = NULL; + struct bpf_trampoline *tr = NULL; struct bpf_tracing_link *link; + u64 key = 0; int err; switch (prog->type) { @@ -2564,6 +2592,28 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) goto out_put_prog; } + if (!!tgt_prog_fd != !!btf_id) { + err = -EINVAL; + goto out_put_prog; + } + + if (tgt_prog_fd) { + /* For now we only allow new targets for BPF_PROG_TYPE_EXT */ + if (prog->type != BPF_PROG_TYPE_EXT) { + err = -EINVAL; + goto out_put_prog; + } + + tgt_prog = bpf_prog_get(tgt_prog_fd); + if (IS_ERR(tgt_prog)) { + err = PTR_ERR(tgt_prog); + tgt_prog = NULL; + goto out_put_prog; + } + + key = bpf_trampoline_compute_key(tgt_prog, btf_id); + } + link = kzalloc(sizeof(*link), GFP_USER); if (!link) { err = -ENOMEM; @@ -2573,20 +2623,100 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) &bpf_tracing_link_lops, prog); link->attach_type = prog->expected_attach_type; - err = bpf_link_prime(&link->link, &link_primer); - if (err) { - kfree(link); - goto out_put_prog; + mutex_lock(&prog->aux->dst_mutex); + + /* There are a few possible cases here: + * + * - if prog->aux->dst_trampoline is set, the program was just loaded + * and not yet attached to anything, so we can use the values stored + * in prog->aux + * + * - if prog->aux->dst_trampoline is NULL, the program has already been + * attached to a target and its initial target was cleared (below) + * + * - if tgt_prog != NULL, the caller specified tgt_prog_fd + + * target_btf_id using the link_create API. + * + * - if tgt_prog == NULL when this function was called using the old + * raw_tracepoint_open API, and we need a target from prog->aux + * + * The combination of no saved target in prog->aux, and no target + * specified on load is illegal, and we reject that here. + */ + if (!prog->aux->dst_trampoline && !tgt_prog) { + err = -ENOENT; + goto out_unlock; + } + + if (!prog->aux->dst_trampoline || + (key && key != prog->aux->dst_trampoline->key)) { + /* If there is no saved target, or the specified target is + * different from the destination specified at load time, we + * need a new trampoline and a check for compatibility + */ + struct bpf_attach_target_info tgt_info = {}; + + err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id, + &tgt_info); + if (err) + goto out_unlock; + + tr = bpf_trampoline_get(key, &tgt_info); + if (!tr) { + err = -ENOMEM; + goto out_unlock; + } + } else { + /* The caller didn't specify a target, or the target was the + * same as the destination supplied during program load. This + * means we can reuse the trampoline and reference from program + * load time, and there is no need to allocate a new one. This + * can only happen once for any program, as the saved values in + * prog->aux are cleared below. + */ + tr = prog->aux->dst_trampoline; + tgt_prog = prog->aux->dst_prog; } - err = bpf_trampoline_link_prog(prog); + err = bpf_link_prime(&link->link, &link_primer); + if (err) + goto out_unlock; + + err = bpf_trampoline_link_prog(prog, tr); if (err) { bpf_link_cleanup(&link_primer); - goto out_put_prog; + link = NULL; + goto out_unlock; } + link->tgt_prog = tgt_prog; + link->trampoline = tr; + + /* Always clear the trampoline and target prog from prog->aux to make + * sure the original attach destination is not kept alive after a + * program is (re-)attached to another target. + */ + if (prog->aux->dst_prog && + (tgt_prog_fd || tr != prog->aux->dst_trampoline)) + /* got extra prog ref from syscall, or attaching to different prog */ + bpf_prog_put(prog->aux->dst_prog); + if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline) + /* we allocated a new trampoline, so free the old one */ + bpf_trampoline_put(prog->aux->dst_trampoline); + + prog->aux->dst_prog = NULL; + prog->aux->dst_trampoline = NULL; + mutex_unlock(&prog->aux->dst_mutex); + return bpf_link_settle(&link_primer); +out_unlock: + if (tr && tr != prog->aux->dst_trampoline) + bpf_trampoline_put(tr); + mutex_unlock(&prog->aux->dst_mutex); + kfree(link); out_put_prog: + if (tgt_prog_fd && tgt_prog) + bpf_prog_put(tgt_prog); bpf_prog_put(prog); return err; } @@ -2634,7 +2764,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, u32 ulen = info->raw_tracepoint.tp_name_len; size_t tp_len = strlen(tp_name); - if (ulen && !ubuf) + if (!ulen ^ !ubuf) return -EINVAL; info->raw_tracepoint.tp_name_len = tp_len + 1; @@ -2700,7 +2830,7 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) tp_name = prog->aux->attach_func_name; break; } - return bpf_tracing_prog_attach(prog); + return bpf_tracing_prog_attach(prog, 0, 0); case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: if (strncpy_from_user(buf, @@ -2783,7 +2913,6 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: return BPF_PROG_TYPE_CGROUP_SKB; - break; case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: @@ -2969,7 +3098,7 @@ static int bpf_prog_query(const union bpf_attr *attr, } } -#define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out +#define BPF_PROG_TEST_RUN_LAST_FIELD test.cpu static int bpf_prog_test_run(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -3152,21 +3281,25 @@ static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, const struct bpf_map *map; int i; + mutex_lock(&prog->aux->used_maps_mutex); for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { map = prog->aux->used_maps[i]; if (map == (void *)addr) { *type = BPF_PSEUDO_MAP_FD; - return map; + goto out; } if (!map->ops->map_direct_value_meta) continue; if (!map->ops->map_direct_value_meta(map, addr, off)) { *type = BPF_PSEUDO_MAP_VALUE; - return map; + goto out; } } + map = NULL; - return NULL; +out: + mutex_unlock(&prog->aux->used_maps_mutex); + return map; } static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, @@ -3284,6 +3417,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, memcpy(info.tag, prog->tag, sizeof(prog->tag)); memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); + mutex_lock(&prog->aux->used_maps_mutex); ulen = info.nr_map_ids; info.nr_map_ids = prog->aux->used_map_cnt; ulen = min_t(u32, info.nr_map_ids, ulen); @@ -3293,9 +3427,12 @@ static int bpf_prog_get_info_by_fd(struct file *file, for (i = 0; i < ulen; i++) if (put_user(prog->aux->used_maps[i]->id, - &user_map_ids[i])) + &user_map_ids[i])) { + mutex_unlock(&prog->aux->used_maps_mutex); return -EFAULT; + } } + mutex_unlock(&prog->aux->used_maps_mutex); err = set_info_rec_size(&info); if (err) @@ -3876,10 +4013,15 @@ err_put: static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { - if (attr->link_create.attach_type == BPF_TRACE_ITER && - prog->expected_attach_type == BPF_TRACE_ITER) - return bpf_iter_link_attach(attr, prog); + if (attr->link_create.attach_type != prog->expected_attach_type) + return -EINVAL; + if (prog->expected_attach_type == BPF_TRACE_ITER) + return bpf_iter_link_attach(attr, prog); + else if (prog->type == BPF_PROG_TYPE_EXT) + return bpf_tracing_prog_attach(prog, + attr->link_create.target_fd, + attr->link_create.target_btf_id); return -EINVAL; } @@ -3893,18 +4035,25 @@ static int link_create(union bpf_attr *attr) if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; - ptype = attach_type_to_prog_type(attr->link_create.attach_type); - if (ptype == BPF_PROG_TYPE_UNSPEC) - return -EINVAL; - - prog = bpf_prog_get_type(attr->link_create.prog_fd, ptype); + prog = bpf_prog_get(attr->link_create.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); ret = bpf_prog_attach_check_attach_type(prog, attr->link_create.attach_type); if (ret) - goto err_out; + goto out; + + if (prog->type == BPF_PROG_TYPE_EXT) { + ret = tracing_bpf_link_attach(attr, prog); + goto out; + } + + ptype = attach_type_to_prog_type(attr->link_create.attach_type); + if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { + ret = -EINVAL; + goto out; + } switch (ptype) { case BPF_PROG_TYPE_CGROUP_SKB: @@ -3932,7 +4081,7 @@ static int link_create(union bpf_attr *attr) ret = -EINVAL; } -err_out: +out: if (ret < 0) bpf_prog_put(prog); return ret; @@ -4014,40 +4163,50 @@ static int link_detach(union bpf_attr *attr) return ret; } -static int bpf_link_inc_not_zero(struct bpf_link *link) +static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) { - return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT; + return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT); } -#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id - -static int bpf_link_get_fd_by_id(const union bpf_attr *attr) +struct bpf_link *bpf_link_by_id(u32 id) { struct bpf_link *link; - u32 id = attr->link_id; - int fd, err; - - if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID)) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + if (!id) + return ERR_PTR(-ENOENT); spin_lock_bh(&link_idr_lock); - link = idr_find(&link_idr, id); /* before link is "settled", ID is 0, pretend it doesn't exist yet */ + link = idr_find(&link_idr, id); if (link) { if (link->id) - err = bpf_link_inc_not_zero(link); + link = bpf_link_inc_not_zero(link); else - err = -EAGAIN; + link = ERR_PTR(-EAGAIN); } else { - err = -ENOENT; + link = ERR_PTR(-ENOENT); } spin_unlock_bh(&link_idr_lock); + return link; +} - if (err) - return err; +#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id + +static int bpf_link_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_link *link; + u32 id = attr->link_id; + int fd; + + if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + link = bpf_link_by_id(id); + if (IS_ERR(link)) + return PTR_ERR(link); fd = bpf_link_new_fd(link); if (fd < 0) @@ -4133,6 +4292,68 @@ static int bpf_iter_create(union bpf_attr *attr) return err; } +#define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags + +static int bpf_prog_bind_map(union bpf_attr *attr) +{ + struct bpf_prog *prog; + struct bpf_map *map; + struct bpf_map **used_maps_old, **used_maps_new; + int i, ret = 0; + + if (CHECK_ATTR(BPF_PROG_BIND_MAP)) + return -EINVAL; + + if (attr->prog_bind_map.flags) + return -EINVAL; + + prog = bpf_prog_get(attr->prog_bind_map.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + map = bpf_map_get(attr->prog_bind_map.map_fd); + if (IS_ERR(map)) { + ret = PTR_ERR(map); + goto out_prog_put; + } + + mutex_lock(&prog->aux->used_maps_mutex); + + used_maps_old = prog->aux->used_maps; + + for (i = 0; i < prog->aux->used_map_cnt; i++) + if (used_maps_old[i] == map) { + bpf_map_put(map); + goto out_unlock; + } + + used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1, + sizeof(used_maps_new[0]), + GFP_KERNEL); + if (!used_maps_new) { + ret = -ENOMEM; + goto out_unlock; + } + + memcpy(used_maps_new, used_maps_old, + sizeof(used_maps_old[0]) * prog->aux->used_map_cnt); + used_maps_new[prog->aux->used_map_cnt] = map; + + prog->aux->used_map_cnt++; + prog->aux->used_maps = used_maps_new; + + kfree(used_maps_old); + +out_unlock: + mutex_unlock(&prog->aux->used_maps_mutex); + + if (ret) + bpf_map_put(map); +out_prog_put: + bpf_prog_put(prog); + return ret; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -4266,6 +4487,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_LINK_DETACH: err = link_detach(&attr); break; + case BPF_PROG_BIND_MAP: + err = bpf_prog_bind_map(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 3b495773de5a..11b3380887fa 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -30,15 +30,15 @@ static struct kobject *btf_kobj; static int __init btf_vmlinux_init(void) { - if (!__start_BTF) + bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF; + + if (!__start_BTF || bin_attr_btf_vmlinux.size == 0) return 0; btf_kobj = kobject_create_and_add("btf", kernel_kobj); if (!btf_kobj) return -ENOMEM; - bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF; - return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_vmlinux); } diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 232df29793e9..5b6af30bfbcd 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -22,19 +22,27 @@ struct bpf_iter_seq_task_info { }; static struct task_struct *task_seq_get_next(struct pid_namespace *ns, - u32 *tid) + u32 *tid, + bool skip_if_dup_files) { struct task_struct *task = NULL; struct pid *pid; rcu_read_lock(); retry: - pid = idr_get_next(&ns->idr, tid); + pid = find_ge_pid(*tid, ns); if (pid) { + *tid = pid_nr_ns(pid, ns); task = get_pid_task(pid, PIDTYPE_PID); if (!task) { ++*tid; goto retry; + } else if (skip_if_dup_files && task->tgid != task->pid && + task->files == task->group_leader->files) { + put_task_struct(task); + task = NULL; + ++*tid; + goto retry; } } rcu_read_unlock(); @@ -47,7 +55,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos) struct bpf_iter_seq_task_info *info = seq->private; struct task_struct *task; - task = task_seq_get_next(info->common.ns, &info->tid); + task = task_seq_get_next(info->common.ns, &info->tid, false); if (!task) return NULL; @@ -64,7 +72,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; ++info->tid; put_task_struct((struct task_struct *)v); - task = task_seq_get_next(info->common.ns, &info->tid); + task = task_seq_get_next(info->common.ns, &info->tid, false); if (!task) return NULL; @@ -147,7 +155,7 @@ again: curr_files = *fstruct; curr_fd = info->fd; } else { - curr_task = task_seq_get_next(ns, &curr_tid); + curr_task = task_seq_get_next(ns, &curr_tid, true); if (!curr_task) return NULL; @@ -178,10 +186,11 @@ again: f = fcheck_files(curr_files, curr_fd); if (!f) continue; + if (!get_file_rcu(f)) + continue; /* set info->fd */ info->fd = curr_fd; - get_file(f); rcu_read_unlock(); return f; } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 9be85aa4ec5f..35c5887d82ff 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -7,6 +7,8 @@ #include <linux/rbtree_latch.h> #include <linux/perf_event.h> #include <linux/btf.h> +#include <linux/rcupdate_trace.h> +#include <linux/rcupdate_wait.h> /* dummy _ops. The verifier will operate on target program's ops. */ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -63,7 +65,7 @@ static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr) bpf_image_ksym_add(tr->image, ksym); } -struct bpf_trampoline *bpf_trampoline_lookup(u64 key) +static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { struct bpf_trampoline *tr; struct hlist_head *head; @@ -210,9 +212,12 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) * updates to trampoline would change the code from underneath the * preempted task. Hence wait for tasks to voluntarily schedule or go * to userspace. + * The same trampoline can hold both sleepable and non-sleepable progs. + * synchronize_rcu_tasks_trace() is needed to make sure all sleepable + * programs finish executing. + * Wait for these two grace periods together. */ - - synchronize_rcu_tasks(); + synchronize_rcu_mult(call_rcu_tasks, call_rcu_tasks_trace); err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, &tr->func.model, flags, tprogs, @@ -256,14 +261,12 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) } } -int bpf_trampoline_link_prog(struct bpf_prog *prog) +int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; - struct bpf_trampoline *tr; int err = 0; int cnt; - tr = prog->aux->trampoline; kind = bpf_attach_type_to_tramp(prog); mutex_lock(&tr->mutex); if (tr->extension_prog) { @@ -296,7 +299,7 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog) } hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]); tr->progs_cnt[kind]++; - err = bpf_trampoline_update(prog->aux->trampoline); + err = bpf_trampoline_update(tr); if (err) { hlist_del(&prog->aux->tramp_hlist); tr->progs_cnt[kind]--; @@ -307,13 +310,11 @@ out: } /* bpf_trampoline_unlink_prog() should never fail. */ -int bpf_trampoline_unlink_prog(struct bpf_prog *prog) +int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; - struct bpf_trampoline *tr; int err; - tr = prog->aux->trampoline; kind = bpf_attach_type_to_tramp(prog); mutex_lock(&tr->mutex); if (kind == BPF_TRAMP_REPLACE) { @@ -325,12 +326,32 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog) } hlist_del(&prog->aux->tramp_hlist); tr->progs_cnt[kind]--; - err = bpf_trampoline_update(prog->aux->trampoline); + err = bpf_trampoline_update(tr); out: mutex_unlock(&tr->mutex); return err; } +struct bpf_trampoline *bpf_trampoline_get(u64 key, + struct bpf_attach_target_info *tgt_info) +{ + struct bpf_trampoline *tr; + + tr = bpf_trampoline_lookup(key); + if (!tr) + return NULL; + + mutex_lock(&tr->mutex); + if (tr->func.addr) + goto out; + + memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel)); + tr->func.addr = (void *)tgt_info->tgt_addr; +out: + mutex_unlock(&tr->mutex); + return tr; +} + void bpf_trampoline_put(struct bpf_trampoline *tr) { if (!tr) @@ -344,7 +365,14 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) goto out; bpf_image_ksym_del(&tr->ksym); - /* wait for tasks to get out of trampoline before freeing it */ + /* This code will be executed when all bpf progs (both sleepable and + * non-sleepable) went through + * bpf_prog_put()->call_rcu[_tasks_trace]()->bpf_prog_free_deferred(). + * Hence no need for another synchronize_rcu_tasks_trace() here, + * but synchronize_rcu_tasks() is still needed, since trampoline + * may not have had any sleepable programs and we need to wait + * for tasks to get out of trampoline code before freeing it. + */ synchronize_rcu_tasks(); bpf_jit_free_exec(tr->image); hlist_del(&tr->hlist); @@ -394,6 +422,17 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) rcu_read_unlock(); } +void notrace __bpf_prog_enter_sleepable(void) +{ + rcu_read_lock_trace(); + might_fault(); +} + +void notrace __bpf_prog_exit_sleepable(void) +{ + rcu_read_unlock_trace(); +} + int __weak arch_prepare_bpf_trampoline(void *image, void *image_end, const struct btf_func_model *m, u32 flags, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ef938f17b944..6200519582a6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21,6 +21,7 @@ #include <linux/ctype.h> #include <linux/error-injection.h> #include <linux/bpf_lsm.h> +#include <linux/btf_ids.h> #include "disasm.h" @@ -238,6 +239,7 @@ struct bpf_call_arg_meta { int ref_obj_id; int func_id; u32 btf_id; + u32 ret_btf_id; }; struct btf *btf_vmlinux; @@ -435,6 +437,15 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) return type == ARG_PTR_TO_SOCK_COMMON; } +static bool arg_type_may_be_null(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_MAP_VALUE_OR_NULL || + type == ARG_PTR_TO_MEM_OR_NULL || + type == ARG_PTR_TO_CTX_OR_NULL || + type == ARG_PTR_TO_SOCKET_OR_NULL || + type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; +} + /* Determine whether the function releases some resources allocated by another * function call. The first reference type argument will be assumed to be * released by release_reference(). @@ -477,7 +488,12 @@ static bool is_acquire_function(enum bpf_func_id func_id, static bool is_ptr_cast_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_tcp_sock || - func_id == BPF_FUNC_sk_fullsock; + func_id == BPF_FUNC_sk_fullsock || + func_id == BPF_FUNC_skc_to_tcp_sock || + func_id == BPF_FUNC_skc_to_tcp6_sock || + func_id == BPF_FUNC_skc_to_udp6_sock || + func_id == BPF_FUNC_skc_to_tcp_timewait_sock || + func_id == BPF_FUNC_skc_to_tcp_request_sock; } /* string representation of 'enum bpf_reg_type' */ @@ -503,6 +519,7 @@ static const char * const reg_type_str[] = { [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", + [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", [PTR_TO_MEM] = "mem", [PTR_TO_MEM_OR_NULL] = "mem_or_null", [PTR_TO_RDONLY_BUF] = "rdonly_buf", @@ -569,7 +586,9 @@ static void print_verifier_state(struct bpf_verifier_env *env, /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); } else { - if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL) + if (t == PTR_TO_BTF_ID || + t == PTR_TO_BTF_ID_OR_NULL || + t == PTR_TO_PERCPU_BTF_ID) verbose(env, "%s", kernel_type_name(reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) @@ -991,14 +1010,9 @@ static const int caller_saved[CALLER_SAVED_REGS] = { static void __mark_reg_not_init(const struct bpf_verifier_env *env, struct bpf_reg_state *reg); -/* Mark the unknown part of a register (variable offset or scalar value) as - * known to have the value @imm. - */ -static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) +/* This helper doesn't clear reg->id */ +static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm) { - /* Clear id, off, and union(map_ptr, range) */ - memset(((u8 *)reg) + sizeof(reg->type), 0, - offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type)); reg->var_off = tnum_const(imm); reg->smin_value = (s64)imm; reg->smax_value = (s64)imm; @@ -1011,6 +1025,17 @@ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) reg->u32_max_value = (u32)imm; } +/* Mark the unknown part of a register (variable offset or scalar value) as + * known to have the value @imm. + */ +static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) +{ + /* Clear id, off, and union(map_ptr, range) */ + memset(((u8 *)reg) + sizeof(reg->type), 0, + offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type)); + ___mark_reg_known(reg, imm); +} + static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm) { reg->var_off = tnum_const_subreg(reg->var_off, imm); @@ -1489,6 +1514,13 @@ static int check_subprogs(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++) { u8 code = insn[i].code; + if (code == (BPF_JMP | BPF_CALL) && + insn[i].imm == BPF_FUNC_tail_call && + insn[i].src_reg != BPF_PSEUDO_CALL) + subprog[cur_subprog].has_tail_call = true; + if (BPF_CLASS(code) == BPF_LD && + (BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND)) + subprog[cur_subprog].has_ld_abs = true; if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) @@ -2183,6 +2215,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_RDONLY_BUF_OR_NULL: case PTR_TO_RDWR_BUF: case PTR_TO_RDWR_BUF_OR_NULL: + case PTR_TO_PERCPU_BTF_ID: return true; default: return false; @@ -2200,6 +2233,20 @@ static bool register_is_const(struct bpf_reg_state *reg) return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off); } +static bool __is_scalar_unbounded(struct bpf_reg_state *reg) +{ + return tnum_is_unknown(reg->var_off) && + reg->smin_value == S64_MIN && reg->smax_value == S64_MAX && + reg->umin_value == 0 && reg->umax_value == U64_MAX && + reg->s32_min_value == S32_MIN && reg->s32_max_value == S32_MAX && + reg->u32_min_value == 0 && reg->u32_max_value == U32_MAX; +} + +static bool register_is_bounded(struct bpf_reg_state *reg) +{ + return reg->type == SCALAR_VALUE && !__is_scalar_unbounded(reg); +} + static bool __is_pointer_value(bool allow_ptr_leaks, const struct bpf_reg_state *reg) { @@ -2251,7 +2298,7 @@ static int check_stack_write(struct bpf_verifier_env *env, if (value_regno >= 0) reg = &cur->regs[value_regno]; - if (reg && size == BPF_REG_SIZE && register_is_const(reg) && + if (reg && size == BPF_REG_SIZE && register_is_bounded(reg) && !register_is_null(reg) && env->bpf_capable) { if (dst_reg != BPF_REG_FP) { /* The backtracking logic can only recognize explicit @@ -2625,11 +2672,18 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, #define MAX_PACKET_OFF 0xffff +static enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog) +{ + return prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type; +} + static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, const struct bpf_call_arg_meta *meta, enum bpf_access_type t) { - switch (env->prog->type) { + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); + + switch (prog_type) { /* Program types only with direct read access go here! */ case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: @@ -2639,7 +2693,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_CGROUP_SKB: if (t == BPF_WRITE) return false; - /* fallthrough */ + fallthrough; /* Program types with direct read + write access go here! */ case BPF_PROG_TYPE_SCHED_CLS: @@ -2970,10 +3024,37 @@ static int check_max_stack_depth(struct bpf_verifier_env *env) int depth = 0, frame = 0, idx = 0, i = 0, subprog_end; struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; + bool tail_call_reachable = false; int ret_insn[MAX_CALL_FRAMES]; int ret_prog[MAX_CALL_FRAMES]; + int j; process_func: + /* protect against potential stack overflow that might happen when + * bpf2bpf calls get combined with tailcalls. Limit the caller's stack + * depth for such case down to 256 so that the worst case scenario + * would result in 8k stack size (32 which is tailcall limit * 256 = + * 8k). + * + * To get the idea what might happen, see an example: + * func1 -> sub rsp, 128 + * subfunc1 -> sub rsp, 256 + * tailcall1 -> add rsp, 256 + * func2 -> sub rsp, 192 (total stack size = 128 + 192 = 320) + * subfunc2 -> sub rsp, 64 + * subfunc22 -> sub rsp, 128 + * tailcall2 -> add rsp, 128 + * func3 -> sub rsp, 32 (total stack size 128 + 192 + 64 + 32 = 416) + * + * tailcall will unwind the current stack frame but it will not get rid + * of caller's stack as shown on the example above. + */ + if (idx && subprog[idx].has_tail_call && depth >= 256) { + verbose(env, + "tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n", + depth); + return -EACCES; + } /* round up to 32-bytes, since this is granularity * of interpreter stack size */ @@ -3002,6 +3083,10 @@ continue_func: i); return -EFAULT; } + + if (subprog[idx].has_tail_call) + tail_call_reachable = true; + frame++; if (frame >= MAX_CALL_FRAMES) { verbose(env, "the call stack of %d frames is too deep !\n", @@ -3010,6 +3095,15 @@ continue_func: } goto process_func; } + /* if tail call got detected across bpf2bpf calls then mark each of the + * currently present subprog frames as tail call reachable subprogs; + * this info will be utilized by JIT so that we will be preserving the + * tail call counter throughout bpf2bpf calls combined with tailcalls + */ + if (tail_call_reachable) + for (j = 0; j < frame; j++) + subprog[ret_prog[j]].tail_call_reachable = true; + /* end of for() loop means the last insn of the 'subprog' * was reached. Doesn't matter whether it was JA or EXIT */ @@ -3585,18 +3679,6 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, struct bpf_func_state *state = func(env, reg); int err, min_off, max_off, i, j, slot, spi; - if (reg->type != PTR_TO_STACK) { - /* Allow zero-byte read from NULL, regardless of pointer type */ - if (zero_size_allowed && access_size == 0 && - register_is_null(reg)) - return 0; - - verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[reg->type], - reg_type_str[PTR_TO_STACK]); - return -EACCES; - } - if (tnum_is_const(reg->var_off)) { min_off = max_off = reg->var_off.value + reg->off; err = __check_stack_boundary(env, regno, min_off, access_size, @@ -3741,9 +3823,19 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, access_size, zero_size_allowed, "rdwr", &env->prog->aux->max_rdwr_access); - default: /* scalar_value|ptr_to_stack or invalid ptr */ + case PTR_TO_STACK: return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); + default: /* scalar_value or invalid ptr */ + /* Allow zero-byte read from NULL, regardless of pointer type */ + if (zero_size_allowed && access_size == 0 && + register_is_null(reg)) + return 0; + + verbose(env, "R%d type=%s expected=%s\n", regno, + reg_type_str[reg->type], + reg_type_str[PTR_TO_STACK]); + return -EACCES; } } @@ -3775,10 +3867,6 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, struct bpf_map *map = reg->map_ptr; u64 val = reg->var_off.value; - if (reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "R%d is not a pointer to map_value\n", regno); - return -EINVAL; - } if (!is_const) { verbose(env, "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n", @@ -3845,12 +3933,6 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } -static bool arg_type_is_alloc_mem_ptr(enum bpf_arg_type type) -{ - return type == ARG_PTR_TO_ALLOC_MEM || - type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; -} - static bool arg_type_is_alloc_size(enum bpf_arg_type type) { return type == ARG_CONST_ALLOC_SIZE_OR_ZERO; @@ -3872,14 +3954,194 @@ static int int_ptr_type_to_size(enum bpf_arg_type type) return -EINVAL; } +static int resolve_map_arg_type(struct bpf_verifier_env *env, + const struct bpf_call_arg_meta *meta, + enum bpf_arg_type *arg_type) +{ + if (!meta->map_ptr) { + /* kernel subsystem misconfigured verifier */ + verbose(env, "invalid map_ptr to access map->type\n"); + return -EACCES; + } + + switch (meta->map_ptr->map_type) { + case BPF_MAP_TYPE_SOCKMAP: + case BPF_MAP_TYPE_SOCKHASH: + if (*arg_type == ARG_PTR_TO_MAP_VALUE) { + *arg_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON; + } else { + verbose(env, "invalid arg_type for sockmap/sockhash\n"); + return -EINVAL; + } + break; + + default: + break; + } + return 0; +} + +struct bpf_reg_types { + const enum bpf_reg_type types[10]; + u32 *btf_id; +}; + +static const struct bpf_reg_types map_key_value_types = { + .types = { + PTR_TO_STACK, + PTR_TO_PACKET, + PTR_TO_PACKET_META, + PTR_TO_MAP_VALUE, + }, +}; + +static const struct bpf_reg_types sock_types = { + .types = { + PTR_TO_SOCK_COMMON, + PTR_TO_SOCKET, + PTR_TO_TCP_SOCK, + PTR_TO_XDP_SOCK, + }, +}; + +#ifdef CONFIG_NET +static const struct bpf_reg_types btf_id_sock_common_types = { + .types = { + PTR_TO_SOCK_COMMON, + PTR_TO_SOCKET, + PTR_TO_TCP_SOCK, + PTR_TO_XDP_SOCK, + PTR_TO_BTF_ID, + }, + .btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], +}; +#endif + +static const struct bpf_reg_types mem_types = { + .types = { + PTR_TO_STACK, + PTR_TO_PACKET, + PTR_TO_PACKET_META, + PTR_TO_MAP_VALUE, + PTR_TO_MEM, + PTR_TO_RDONLY_BUF, + PTR_TO_RDWR_BUF, + }, +}; + +static const struct bpf_reg_types int_ptr_types = { + .types = { + PTR_TO_STACK, + PTR_TO_PACKET, + PTR_TO_PACKET_META, + PTR_TO_MAP_VALUE, + }, +}; + +static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } }; +static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } }; +static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } }; +static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } }; +static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; +static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; +static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; + +static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { + [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, + [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types, + [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types, + [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types, + [ARG_CONST_SIZE] = &scalar_types, + [ARG_CONST_SIZE_OR_ZERO] = &scalar_types, + [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types, + [ARG_CONST_MAP_PTR] = &const_map_ptr_types, + [ARG_PTR_TO_CTX] = &context_types, + [ARG_PTR_TO_CTX_OR_NULL] = &context_types, + [ARG_PTR_TO_SOCK_COMMON] = &sock_types, +#ifdef CONFIG_NET + [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types, +#endif + [ARG_PTR_TO_SOCKET] = &fullsock_types, + [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types, + [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, + [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, + [ARG_PTR_TO_MEM] = &mem_types, + [ARG_PTR_TO_MEM_OR_NULL] = &mem_types, + [ARG_PTR_TO_UNINIT_MEM] = &mem_types, + [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types, + [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types, + [ARG_PTR_TO_INT] = &int_ptr_types, + [ARG_PTR_TO_LONG] = &int_ptr_types, + [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, +}; + +static int check_reg_type(struct bpf_verifier_env *env, u32 regno, + enum bpf_arg_type arg_type, + const u32 *arg_btf_id) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + enum bpf_reg_type expected, type = reg->type; + const struct bpf_reg_types *compatible; + int i, j; + + compatible = compatible_reg_types[arg_type]; + if (!compatible) { + verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type); + return -EFAULT; + } + + for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { + expected = compatible->types[i]; + if (expected == NOT_INIT) + break; + + if (type == expected) + goto found; + } + + verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]); + for (j = 0; j + 1 < i; j++) + verbose(env, "%s, ", reg_type_str[compatible->types[j]]); + verbose(env, "%s\n", reg_type_str[compatible->types[j]]); + return -EACCES; + +found: + if (type == PTR_TO_BTF_ID) { + if (!arg_btf_id) { + if (!compatible->btf_id) { + verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); + return -EFAULT; + } + arg_btf_id = compatible->btf_id; + } + + if (!btf_struct_ids_match(&env->log, reg->off, reg->btf_id, + *arg_btf_id)) { + verbose(env, "R%d is of type %s but %s is expected\n", + regno, kernel_type_name(reg->btf_id), + kernel_type_name(*arg_btf_id)); + return -EACCES; + } + + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", + regno); + return -EACCES; + } + } + + return 0; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, const struct bpf_func_proto *fn) { u32 regno = BPF_REG_1 + arg; struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - enum bpf_reg_type expected_type, type = reg->type; enum bpf_arg_type arg_type = fn->arg_type[arg]; + enum bpf_reg_type type = reg->type; int err = 0; if (arg_type == ARG_DONTCARE) @@ -3904,120 +4166,32 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return -EACCES; } - if (arg_type == ARG_PTR_TO_MAP_KEY || - arg_type == ARG_PTR_TO_MAP_VALUE || + if (arg_type == ARG_PTR_TO_MAP_VALUE || arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { - expected_type = PTR_TO_STACK; - if (register_is_null(reg) && - arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) - /* final test in check_stack_boundary() */; - else if (!type_is_pkt_pointer(type) && - type != PTR_TO_MAP_VALUE && - type != expected_type) - goto err_type; - } else if (arg_type == ARG_CONST_SIZE || - arg_type == ARG_CONST_SIZE_OR_ZERO || - arg_type == ARG_CONST_ALLOC_SIZE_OR_ZERO) { - expected_type = SCALAR_VALUE; - if (type != expected_type) - goto err_type; - } else if (arg_type == ARG_CONST_MAP_PTR) { - expected_type = CONST_PTR_TO_MAP; - if (type != expected_type) - goto err_type; - } else if (arg_type == ARG_PTR_TO_CTX || - arg_type == ARG_PTR_TO_CTX_OR_NULL) { - expected_type = PTR_TO_CTX; - if (!(register_is_null(reg) && - arg_type == ARG_PTR_TO_CTX_OR_NULL)) { - if (type != expected_type) - goto err_type; - err = check_ctx_reg(env, reg, regno); - if (err < 0) - return err; - } - } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { - expected_type = PTR_TO_SOCK_COMMON; - /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ - if (!type_is_sk_pointer(type)) - goto err_type; - if (reg->ref_obj_id) { - if (meta->ref_obj_id) { - verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", - regno, reg->ref_obj_id, - meta->ref_obj_id); - return -EFAULT; - } - meta->ref_obj_id = reg->ref_obj_id; - } - } else if (arg_type == ARG_PTR_TO_SOCKET || - arg_type == ARG_PTR_TO_SOCKET_OR_NULL) { - expected_type = PTR_TO_SOCKET; - if (!(register_is_null(reg) && - arg_type == ARG_PTR_TO_SOCKET_OR_NULL)) { - if (type != expected_type) - goto err_type; - } - } else if (arg_type == ARG_PTR_TO_BTF_ID) { - expected_type = PTR_TO_BTF_ID; - if (type != expected_type) - goto err_type; - if (!fn->check_btf_id) { - if (reg->btf_id != meta->btf_id) { - verbose(env, "Helper has type %s got %s in R%d\n", - kernel_type_name(meta->btf_id), - kernel_type_name(reg->btf_id), regno); - - return -EACCES; - } - } else if (!fn->check_btf_id(reg->btf_id, arg)) { - verbose(env, "Helper does not support %s in R%d\n", - kernel_type_name(reg->btf_id), regno); + err = resolve_map_arg_type(env, meta, &arg_type); + if (err) + return err; + } - return -EACCES; - } - if (!tnum_is_const(reg->var_off) || reg->var_off.value || reg->off) { - verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", - regno); - return -EACCES; - } - } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { - if (meta->func_id == BPF_FUNC_spin_lock) { - if (process_spin_lock(env, regno, true)) - return -EACCES; - } else if (meta->func_id == BPF_FUNC_spin_unlock) { - if (process_spin_lock(env, regno, false)) - return -EACCES; - } else { - verbose(env, "verifier internal error\n"); - return -EFAULT; - } - } else if (arg_type_is_mem_ptr(arg_type)) { - expected_type = PTR_TO_STACK; - /* One exception here. In case function allows for NULL to be - * passed in as argument, it's a SCALAR_VALUE type. Final test - * happens during stack boundary checking. + if (register_is_null(reg) && arg_type_may_be_null(arg_type)) + /* A NULL register has a SCALAR_VALUE type, so skip + * type checking. */ - if (register_is_null(reg) && - (arg_type == ARG_PTR_TO_MEM_OR_NULL || - arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL)) - /* final test in check_stack_boundary() */; - else if (!type_is_pkt_pointer(type) && - type != PTR_TO_MAP_VALUE && - type != PTR_TO_MEM && - type != PTR_TO_RDONLY_BUF && - type != PTR_TO_RDWR_BUF && - type != expected_type) - goto err_type; - meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; - } else if (arg_type_is_alloc_mem_ptr(arg_type)) { - expected_type = PTR_TO_MEM; - if (register_is_null(reg) && - arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL) - /* final test in check_stack_boundary() */; - else if (type != expected_type) - goto err_type; + goto skip_type_check; + + err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg]); + if (err) + return err; + + if (type == PTR_TO_CTX) { + err = check_ctx_reg(env, reg, regno); + if (err < 0) + return err; + } + +skip_type_check: + if (reg->ref_obj_id) { if (meta->ref_obj_id) { verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", regno, reg->ref_obj_id, @@ -4025,15 +4199,6 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return -EFAULT; } meta->ref_obj_id = reg->ref_obj_id; - } else if (arg_type_is_int_ptr(arg_type)) { - expected_type = PTR_TO_STACK; - if (!type_is_pkt_pointer(type) && - type != PTR_TO_MAP_VALUE && - type != expected_type) - goto err_type; - } else { - verbose(env, "unsupported arg_type %d\n", arg_type); - return -EFAULT; } if (arg_type == ARG_CONST_MAP_PTR) { @@ -4072,6 +4237,28 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, false, meta); + } else if (arg_type == ARG_PTR_TO_PERCPU_BTF_ID) { + if (!reg->btf_id) { + verbose(env, "Helper has invalid btf_id in R%d\n", regno); + return -EACCES; + } + meta->ret_btf_id = reg->btf_id; + } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { + if (meta->func_id == BPF_FUNC_spin_lock) { + if (process_spin_lock(env, regno, true)) + return -EACCES; + } else if (meta->func_id == BPF_FUNC_spin_unlock) { + if (process_spin_lock(env, regno, false)) + return -EACCES; + } else { + verbose(env, "verifier internal error\n"); + return -EFAULT; + } + } else if (arg_type_is_mem_ptr(arg_type)) { + /* The access to this pointer is only checked when we hit the + * next is_mem_size argument below. + */ + meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MEM); } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); @@ -4137,10 +4324,43 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, } return err; -err_type: - verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[type], reg_type_str[expected_type]); - return -EACCES; +} + +static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id) +{ + enum bpf_attach_type eatype = env->prog->expected_attach_type; + enum bpf_prog_type type = resolve_prog_type(env->prog); + + if (func_id != BPF_FUNC_map_update_elem) + return false; + + /* It's not possible to get access to a locked struct sock in these + * contexts, so updating is safe. + */ + switch (type) { + case BPF_PROG_TYPE_TRACING: + if (eatype == BPF_TRACE_ITER) + return true; + break; + case BPF_PROG_TYPE_SOCKET_FILTER: + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_SK_REUSEPORT: + case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_SK_LOOKUP: + return true; + default: + break; + } + + verbose(env, "cannot update sockmap in this context\n"); + return false; +} + +static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env) +{ + return env->prog->jit_requested && IS_ENABLED(CONFIG_X86_64); } static int check_map_func_compatibility(struct bpf_verifier_env *env, @@ -4214,7 +4434,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_map && func_id != BPF_FUNC_sk_select_reuseport && - func_id != BPF_FUNC_map_lookup_elem) + func_id != BPF_FUNC_map_lookup_elem && + !may_update_sockmap(env, func_id)) goto error; break; case BPF_MAP_TYPE_SOCKHASH: @@ -4223,7 +4444,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_hash && func_id != BPF_FUNC_sk_select_reuseport && - func_id != BPF_FUNC_map_lookup_elem) + func_id != BPF_FUNC_map_lookup_elem && + !may_update_sockmap(env, func_id)) goto error; break; case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: @@ -4242,6 +4464,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_sk_storage_delete) goto error; break; + case BPF_MAP_TYPE_INODE_STORAGE: + if (func_id != BPF_FUNC_inode_storage_get && + func_id != BPF_FUNC_inode_storage_delete) + goto error; + break; default: break; } @@ -4251,8 +4478,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_tail_call: if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; - if (env->subprog_cnt > 1) { - verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n"); + if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) { + verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n"); return -EINVAL; } break; @@ -4315,6 +4542,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) goto error; break; + case BPF_FUNC_inode_storage_get: + case BPF_FUNC_inode_storage_delete: + if (map->map_type != BPF_MAP_TYPE_INODE_STORAGE) + goto error; + break; default: break; } @@ -4402,10 +4634,26 @@ static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id) return count <= 1; } +static bool check_btf_id_ok(const struct bpf_func_proto *fn) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { + if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) + return false; + + if (fn->arg_type[i] != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) + return false; + } + + return true; +} + static int check_func_proto(const struct bpf_func_proto *fn, int func_id) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && + check_btf_id_ok(fn) && check_refcount_ok(fn, func_id) ? 0 : -EINVAL; } @@ -4775,6 +5023,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } + if (fn->allowed && !fn->allowed(env->prog)) { + verbose(env, "helper call is not allowed in probe\n"); + return -EINVAL; + } + /* With LD_ABS/IND some JITs save/restore skb from r1. */ changes_data = bpf_helper_changes_pkt_data(fn->func); if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { @@ -4796,11 +5049,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ for (i = 0; i < 5; i++) { - if (!fn->check_btf_id) { - err = btf_resolve_helper_id(&env->log, fn, i); - if (err > 0) - meta.btf_id = err; - } err = check_func_arg(env, i, &meta, fn); if (err) return err; @@ -4885,25 +5133,49 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].id = ++env->id_gen; } else { regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; } } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; regs[BPF_REG_0].mem_size = meta.mem_size; + } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL || + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) { + const struct btf_type *t; + + mark_reg_known_zero(env, regs, BPF_REG_0); + t = btf_type_skip_modifiers(btf_vmlinux, meta.ret_btf_id, NULL); + if (!btf_type_is_struct(t)) { + u32 tsize; + const struct btf_type *ret; + const char *tname; + + /* resolve the type size of ksym. */ + ret = btf_resolve_size(btf_vmlinux, t, &tsize); + if (IS_ERR(ret)) { + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + verbose(env, "unable to resolve the size of type '%s': %ld\n", + tname, PTR_ERR(ret)); + return -EINVAL; + } + regs[BPF_REG_0].type = + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? + PTR_TO_MEM : PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].mem_size = tsize; + } else { + regs[BPF_REG_0].type = + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? + PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].btf_id = meta.ret_btf_id; + } } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { int ret_btf_id; @@ -4922,6 +5194,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } + if (reg_type_may_be_null(regs[BPF_REG_0].type)) + regs[BPF_REG_0].id = ++env->id_gen; + if (is_ptr_cast_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; @@ -5219,6 +5494,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst, reg_type_str[ptr_reg->type]); return -EACCES; case CONST_PTR_TO_MAP: + /* smin_val represents the known value */ + if (known && smin_val == 0 && opcode == BPF_ADD) + break; + fallthrough; case PTR_TO_PACKET_END: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: @@ -5236,7 +5515,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, off_reg == dst_reg ? dst : src); return -EACCES; } - /* fall-through */ + fallthrough; default: break; } @@ -5634,8 +5913,7 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, u64 umax_val = src_reg->umax_value; if (src_known && dst_known) { - __mark_reg_known(dst_reg, dst_reg->var_off.value & - src_reg->var_off.value); + __mark_reg_known(dst_reg, dst_reg->var_off.value); return; } @@ -5667,8 +5945,8 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - s32 smin_val = src_reg->smin_value; - u32 umin_val = src_reg->umin_value; + s32 smin_val = src_reg->s32_min_value; + u32 umin_val = src_reg->u32_min_value; /* Assuming scalar64_min_max_or will be called so it is safe * to skip updating register for known case. @@ -5691,8 +5969,8 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, /* ORing two positives gives a positive, so safe to * cast result into s64. */ - dst_reg->s32_min_value = dst_reg->umin_value; - dst_reg->s32_max_value = dst_reg->umax_value; + dst_reg->s32_min_value = dst_reg->u32_min_value; + dst_reg->s32_max_value = dst_reg->u32_max_value; } } @@ -5705,8 +5983,7 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, u64 umin_val = src_reg->umin_value; if (src_known && dst_known) { - __mark_reg_known(dst_reg, dst_reg->var_off.value | - src_reg->var_off.value); + __mark_reg_known(dst_reg, dst_reg->var_off.value); return; } @@ -5732,6 +6009,67 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, __update_reg_bounds(dst_reg); } +static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + bool src_known = tnum_subreg_is_const(src_reg->var_off); + bool dst_known = tnum_subreg_is_const(dst_reg->var_off); + struct tnum var32_off = tnum_subreg(dst_reg->var_off); + s32 smin_val = src_reg->s32_min_value; + + /* Assuming scalar64_min_max_xor will be called so it is safe + * to skip updating register for known case. + */ + if (src_known && dst_known) + return; + + /* We get both minimum and maximum from the var32_off. */ + dst_reg->u32_min_value = var32_off.value; + dst_reg->u32_max_value = var32_off.value | var32_off.mask; + + if (dst_reg->s32_min_value >= 0 && smin_val >= 0) { + /* XORing two positive sign numbers gives a positive, + * so safe to cast u32 result into s32. + */ + dst_reg->s32_min_value = dst_reg->u32_min_value; + dst_reg->s32_max_value = dst_reg->u32_max_value; + } else { + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + } +} + +static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + bool src_known = tnum_is_const(src_reg->var_off); + bool dst_known = tnum_is_const(dst_reg->var_off); + s64 smin_val = src_reg->smin_value; + + if (src_known && dst_known) { + /* dst_reg->var_off.value has been updated earlier */ + __mark_reg_known(dst_reg, dst_reg->var_off.value); + return; + } + + /* We get both minimum and maximum from the var_off. */ + dst_reg->umin_value = dst_reg->var_off.value; + dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; + + if (dst_reg->smin_value >= 0 && smin_val >= 0) { + /* XORing two positive sign numbers gives a positive, + * so safe to cast u64 result into s64. + */ + dst_reg->smin_value = dst_reg->umin_value; + dst_reg->smax_value = dst_reg->umax_value; + } else { + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } + + __update_reg_bounds(dst_reg); +} + static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, u64 umin_val, u64 umax_val) { @@ -6040,6 +6378,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, scalar32_min_max_or(dst_reg, &src_reg); scalar_min_max_or(dst_reg, &src_reg); break; + case BPF_XOR: + dst_reg->var_off = tnum_xor(dst_reg->var_off, src_reg.var_off); + scalar32_min_max_xor(dst_reg, &src_reg); + scalar_min_max_xor(dst_reg, &src_reg); + break; case BPF_LSH: if (umax_val >= insn_bitness) { /* Shifts greater than 31 or 63 are undefined. @@ -6111,6 +6454,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, src_reg = NULL; if (dst_reg->type != SCALAR_VALUE) ptr_reg = dst_reg; + else + /* Make sure ID is cleared otherwise dst_reg min/max could be + * incorrectly propagated into other registers by find_equal_scalars() + */ + dst_reg->id = 0; if (BPF_SRC(insn->code) == BPF_X) { src_reg = ®s[insn->src_reg]; if (src_reg->type != SCALAR_VALUE) { @@ -6244,6 +6592,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) /* case: R1 = R2 * copy register state to dest reg */ + if (src_reg->type == SCALAR_VALUE && !src_reg->id) + /* Assign src and dst registers the same ID + * that will be used by find_equal_scalars() + * to propagate min/max range. + */ + src_reg->id = ++env->id_gen; *dst_reg = *src_reg; dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = DEF_NOT_SUBREG; @@ -6256,6 +6610,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EACCES; } else if (src_reg->type == SCALAR_VALUE) { *dst_reg = *src_reg; + /* Make sure ID is cleared otherwise + * dst_reg min/max could be incorrectly + * propagated into src_reg by find_equal_scalars() + */ + dst_reg->id = 0; dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = env->insn_idx + 1; } else { @@ -6646,14 +7005,18 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, struct bpf_reg_state *reg = opcode == BPF_JEQ ? true_reg : false_reg; - /* For BPF_JEQ, if this is false we know nothing Jon Snow, but - * if it is true we know the value for sure. Likewise for - * BPF_JNE. + /* JEQ/JNE comparison doesn't change the register equivalence. + * r1 = r2; + * if (r1 == 42) goto label; + * ... + * label: // here both r1 and r2 are known to be 42. + * + * Hence when marking register as known preserve it's ID. */ if (is_jmp32) __mark_reg32_known(reg, val32); else - __mark_reg_known(reg, val); + ___mark_reg_known(reg, val); break; } case BPF_JSET: @@ -6847,7 +7210,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, struct bpf_reg_state *reg, u32 id, bool is_null) { - if (reg_type_may_be_null(reg->type) && reg->id == id) { + if (reg_type_may_be_null(reg->type) && reg->id == id && + !WARN_ON_ONCE(!reg->id)) { /* Old offset (both fixed and variable parts) should * have been known-zero, because we don't allow pointer * arithmetic on pointers that might be NULL. @@ -7044,6 +7408,30 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, return true; } +static void find_equal_scalars(struct bpf_verifier_state *vstate, + struct bpf_reg_state *known_reg) +{ + struct bpf_func_state *state; + struct bpf_reg_state *reg; + int i, j; + + for (i = 0; i <= vstate->curframe; i++) { + state = vstate->frame[i]; + for (j = 0; j < MAX_BPF_REG; j++) { + reg = &state->regs[j]; + if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) + *reg = *known_reg; + } + + bpf_for_each_spilled_reg(j, state, reg) { + if (!reg) + continue; + if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) + *reg = *known_reg; + } + } +} + static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { @@ -7172,6 +7560,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, reg_combine_min_max(&other_branch_regs[insn->src_reg], &other_branch_regs[insn->dst_reg], src_reg, dst_reg, opcode); + if (src_reg->id && + !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { + find_equal_scalars(this_branch, src_reg); + find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]); + } + } } else if (dst_reg->type == SCALAR_VALUE) { reg_set_min_max(&other_branch_regs[insn->dst_reg], @@ -7179,6 +7573,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, opcode, is_jmp32); } + if (dst_reg->type == SCALAR_VALUE && dst_reg->id && + !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) { + find_equal_scalars(this_branch, dst_reg); + find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]); + } + /* detect if R == 0 where R is returned from bpf_map_lookup_elem(). * NOTE: these optimizations below are related with pointer comparison * which will never be JMP32. @@ -7210,6 +7610,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_insn_aux_data *aux = cur_aux(env); struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *dst_reg; struct bpf_map *map; int err; @@ -7226,25 +7627,45 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; + dst_reg = ®s[insn->dst_reg]; if (insn->src_reg == 0) { u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - regs[insn->dst_reg].type = SCALAR_VALUE; + dst_reg->type = SCALAR_VALUE; __mark_reg_known(®s[insn->dst_reg], imm); return 0; } + if (insn->src_reg == BPF_PSEUDO_BTF_ID) { + mark_reg_known_zero(env, regs, insn->dst_reg); + + dst_reg->type = aux->btf_var.reg_type; + switch (dst_reg->type) { + case PTR_TO_MEM: + dst_reg->mem_size = aux->btf_var.mem_size; + break; + case PTR_TO_BTF_ID: + case PTR_TO_PERCPU_BTF_ID: + dst_reg->btf_id = aux->btf_var.btf_id; + break; + default: + verbose(env, "bpf verifier is misconfigured\n"); + return -EFAULT; + } + return 0; + } + map = env->used_maps[aux->map_index]; mark_reg_known_zero(env, regs, insn->dst_reg); - regs[insn->dst_reg].map_ptr = map; + dst_reg->map_ptr = map; if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) { - regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; - regs[insn->dst_reg].off = aux->map_off; + dst_reg->type = PTR_TO_MAP_VALUE; + dst_reg->off = aux->map_off; if (map_value_has_spin_lock(map)) - regs[insn->dst_reg].id = ++env->id_gen; + dst_reg->id = ++env->id_gen; } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) { - regs[insn->dst_reg].type = CONST_PTR_TO_MAP; + dst_reg->type = CONST_PTR_TO_MAP; } else { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; @@ -7287,7 +7708,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) u8 mode = BPF_MODE(insn->code); int i, err; - if (!may_access_skb(env->prog->type)) { + if (!may_access_skb(resolve_prog_type(env->prog))) { verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); return -EINVAL; } @@ -7297,18 +7718,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } - if (env->subprog_cnt > 1) { - /* when program has LD_ABS insn JITs and interpreter assume - * that r1 == ctx == skb which is not the case for callees - * that can have arbitrary arguments. It's problematic - * for main prog as well since JITs would need to analyze - * all functions in order to make proper register save/restore - * decisions in the main prog. Hence disallow LD_ABS with calls - */ - verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n"); - return -EINVAL; - } - if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { @@ -7375,11 +7784,12 @@ static int check_return_code(struct bpf_verifier_env *env) const struct bpf_prog *prog = env->prog; struct bpf_reg_state *reg; struct tnum range = tnum_range(0, 1); + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); int err; /* LSM and struct_ops func-ptr's return type could be "void" */ - if ((env->prog->type == BPF_PROG_TYPE_STRUCT_OPS || - env->prog->type == BPF_PROG_TYPE_LSM) && + if ((prog_type == BPF_PROG_TYPE_STRUCT_OPS || + prog_type == BPF_PROG_TYPE_LSM) && !prog->aux->attach_func_proto->type) return 0; @@ -7398,7 +7808,7 @@ static int check_return_code(struct bpf_verifier_env *env) return -EACCES; } - switch (env->prog->type) { + switch (prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG || @@ -7718,6 +8128,23 @@ err_free: return ret; } +static int check_abnormal_return(struct bpf_verifier_env *env) +{ + int i; + + for (i = 1; i < env->subprog_cnt; i++) { + if (env->subprog_info[i].has_ld_abs) { + verbose(env, "LD_ABS is not allowed in subprogs without BTF\n"); + return -EINVAL; + } + if (env->subprog_info[i].has_tail_call) { + verbose(env, "tail_call is not allowed in subprogs without BTF\n"); + return -EINVAL; + } + } + return 0; +} + /* The minimum supported BTF func info size */ #define MIN_BPF_FUNCINFO_SIZE 8 #define MAX_FUNCINFO_REC_SIZE 252 @@ -7726,20 +8153,24 @@ static int check_btf_func(struct bpf_verifier_env *env, const union bpf_attr *attr, union bpf_attr __user *uattr) { + const struct btf_type *type, *func_proto, *ret_type; u32 i, nfuncs, urec_size, min_size; u32 krec_size = sizeof(struct bpf_func_info); struct bpf_func_info *krecord; struct bpf_func_info_aux *info_aux = NULL; - const struct btf_type *type; struct bpf_prog *prog; const struct btf *btf; void __user *urecord; u32 prev_offset = 0; + bool scalar_return; int ret = -ENOMEM; nfuncs = attr->func_info_cnt; - if (!nfuncs) + if (!nfuncs) { + if (check_abnormal_return(env)) + return -EINVAL; return 0; + } if (nfuncs != env->subprog_cnt) { verbose(env, "number of funcs in func_info doesn't match number of subprogs\n"); @@ -7787,25 +8218,23 @@ static int check_btf_func(struct bpf_verifier_env *env, } /* check insn_off */ + ret = -EINVAL; if (i == 0) { if (krecord[i].insn_off) { verbose(env, "nonzero insn_off %u for the first func info record", krecord[i].insn_off); - ret = -EINVAL; goto err_free; } } else if (krecord[i].insn_off <= prev_offset) { verbose(env, "same or smaller insn offset (%u) than previous func info record (%u)", krecord[i].insn_off, prev_offset); - ret = -EINVAL; goto err_free; } if (env->subprog_info[i].start != krecord[i].insn_off) { verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); - ret = -EINVAL; goto err_free; } @@ -7814,10 +8243,26 @@ static int check_btf_func(struct bpf_verifier_env *env, if (!type || !btf_type_is_func(type)) { verbose(env, "invalid type id %d in func info", krecord[i].type_id); - ret = -EINVAL; goto err_free; } info_aux[i].linkage = BTF_INFO_VLEN(type->info); + + func_proto = btf_type_by_id(btf, type->type); + if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto))) + /* btf_func_check() already verified it during BTF load */ + goto err_free; + ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL); + scalar_return = + btf_type_is_small_int(ret_type) || btf_type_is_enum(ret_type); + if (i && !scalar_return && env->subprog_info[i].has_ld_abs) { + verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n"); + goto err_free; + } + if (i && !scalar_return && env->subprog_info[i].has_tail_call) { + verbose(env, "tail_call is only allowed in functions that return 'int'.\n"); + goto err_free; + } + prev_offset = krecord[i].insn_off; urecord += urec_size; } @@ -7978,8 +8423,11 @@ static int check_btf_info(struct bpf_verifier_env *env, struct btf *btf; int err; - if (!attr->func_info_cnt && !attr->line_info_cnt) + if (!attr->func_info_cnt && !attr->line_info_cnt) { + if (check_abnormal_return(env)) + return -EINVAL; return 0; + } btf = btf_get_by_fd(attr->prog_btf_fd); if (IS_ERR(btf)) @@ -9119,6 +9567,92 @@ process_bpf_exit: return 0; } +/* replace pseudo btf_id with kernel symbol address */ +static int check_pseudo_btf_id(struct bpf_verifier_env *env, + struct bpf_insn *insn, + struct bpf_insn_aux_data *aux) +{ + u32 datasec_id, type, id = insn->imm; + const struct btf_var_secinfo *vsi; + const struct btf_type *datasec; + const struct btf_type *t; + const char *sym_name; + bool percpu = false; + u64 addr; + int i; + + if (!btf_vmlinux) { + verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); + return -EINVAL; + } + + if (insn[1].imm != 0) { + verbose(env, "reserved field (insn[1].imm) is used in pseudo_btf_id ldimm64 insn.\n"); + return -EINVAL; + } + + t = btf_type_by_id(btf_vmlinux, id); + if (!t) { + verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id); + return -ENOENT; + } + + if (!btf_type_is_var(t)) { + verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", + id); + return -EINVAL; + } + + sym_name = btf_name_by_offset(btf_vmlinux, t->name_off); + addr = kallsyms_lookup_name(sym_name); + if (!addr) { + verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n", + sym_name); + return -ENOENT; + } + + datasec_id = btf_find_by_name_kind(btf_vmlinux, ".data..percpu", + BTF_KIND_DATASEC); + if (datasec_id > 0) { + datasec = btf_type_by_id(btf_vmlinux, datasec_id); + for_each_vsi(i, datasec, vsi) { + if (vsi->type == id) { + percpu = true; + break; + } + } + } + + insn[0].imm = (u32)addr; + insn[1].imm = addr >> 32; + + type = t->type; + t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); + if (percpu) { + aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; + aux->btf_var.btf_id = type; + } else if (!btf_type_is_struct(t)) { + const struct btf_type *ret; + const char *tname; + u32 tsize; + + /* resolve the type size of ksym. */ + ret = btf_resolve_size(btf_vmlinux, t, &tsize); + if (IS_ERR(ret)) { + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n", + tname, PTR_ERR(ret)); + return -EINVAL; + } + aux->btf_var.reg_type = PTR_TO_MEM; + aux->btf_var.mem_size = tsize; + } else { + aux->btf_var.reg_type = PTR_TO_BTF_ID; + aux->btf_var.btf_id = type; + } + return 0; +} + static int check_map_prealloc(struct bpf_map *map) { return (map->map_type != BPF_MAP_TYPE_HASH && @@ -9154,6 +9688,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, struct bpf_prog *prog) { + enum bpf_prog_type prog_type = resolve_prog_type(prog); /* * Validate that trace type programs use preallocated hash maps. * @@ -9171,8 +9706,8 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, * now, but warnings are emitted so developers are made aware of * the unsafety and can fix their programs before this is enforced. */ - if (is_tracing_prog_type(prog->type) && !is_preallocated_map(map)) { - if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { + if (is_tracing_prog_type(prog_type) && !is_preallocated_map(map)) { + if (prog_type == BPF_PROG_TYPE_PERF_EVENT) { verbose(env, "perf_event programs can only use preallocated hash map\n"); return -EINVAL; } @@ -9184,8 +9719,8 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n"); } - if ((is_tracing_prog_type(prog->type) || - prog->type == BPF_PROG_TYPE_SOCKET_FILTER) && + if ((is_tracing_prog_type(prog_type) || + prog_type == BPF_PROG_TYPE_SOCKET_FILTER) && map_value_has_spin_lock(map)) { verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); return -EINVAL; @@ -9202,6 +9737,23 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return -EINVAL; } + if (prog->aux->sleepable) + switch (map->map_type) { + case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_LRU_HASH: + case BPF_MAP_TYPE_ARRAY: + if (!is_preallocated_map(map)) { + verbose(env, + "Sleepable programs can only use preallocated hash maps\n"); + return -EINVAL; + } + break; + default: + verbose(env, + "Sleepable programs can only use array and hash maps\n"); + return -EINVAL; + } + return 0; } @@ -9211,10 +9763,14 @@ static bool bpf_map_is_cgroup_storage(struct bpf_map *map) map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); } -/* look for pseudo eBPF instructions that access map FDs and - * replace them with actual map pointers +/* find and rewrite pseudo imm in ld_imm64 instructions: + * + * 1. if it accesses map FD, replace it with actual map pointer. + * 2. if it accesses btf_id of a VAR, replace it with pointer to the var. + * + * NOTE: btf_vmlinux is required for converting pseudo btf_id. */ -static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) +static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -9255,6 +9811,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) /* valid generic load 64-bit imm */ goto next_insn; + if (insn[0].src_reg == BPF_PSEUDO_BTF_ID) { + aux = &env->insn_aux_data[i]; + err = check_pseudo_btf_id(env, insn, aux); + if (err) + return err; + goto next_insn; + } + /* In final convert_pseudo_ld_imm64() step, this is * converted into regular 64-bit imm load insn. */ @@ -9436,6 +10000,18 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len } } +static void adjust_poke_descs(struct bpf_prog *prog, u32 len) +{ + struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; + int i, sz = prog->aux->size_poke_tab; + struct bpf_jit_poke_descriptor *desc; + + for (i = 0; i < sz; i++) { + desc = &tab[i]; + desc->insn_idx += len - 1; + } +} + static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, const struct bpf_insn *patch, u32 len) { @@ -9452,6 +10028,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of if (adjust_insn_aux_data(env, new_prog, off, len)) return NULL; adjust_subprog_starts(env, off, len); + adjust_poke_descs(new_prog, len); return new_prog; } @@ -9897,7 +10474,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); env->prog->aux->num_exentries++; - } else if (env->prog->type != BPF_PROG_TYPE_STRUCT_OPS) { + } else if (resolve_prog_type(env->prog) != BPF_PROG_TYPE_STRUCT_OPS) { verbose(env, "Writes through BTF pointers are not allowed\n"); return -EINVAL; } @@ -9982,6 +10559,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog, **func, *tmp; int i, j, subprog_start, subprog_end = 0, len, subprog; + struct bpf_map *map_ptr; struct bpf_insn *insn; void *old_bpf_func; int err, num_exentries; @@ -10049,6 +10627,31 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->btf = prog->aux->btf; func[i]->aux->func_info = prog->aux->func_info; + for (j = 0; j < prog->aux->size_poke_tab; j++) { + u32 insn_idx = prog->aux->poke_tab[j].insn_idx; + int ret; + + if (!(insn_idx >= subprog_start && + insn_idx <= subprog_end)) + continue; + + ret = bpf_jit_add_poke_descriptor(func[i], + &prog->aux->poke_tab[j]); + if (ret < 0) { + verbose(env, "adding tail call poke descriptor failed\n"); + goto out_free; + } + + func[i]->insnsi[insn_idx - subprog_start].imm = ret + 1; + + map_ptr = func[i]->aux->poke_tab[ret].tail_call.map; + ret = map_ptr->ops->map_poke_track(map_ptr, func[i]->aux); + if (ret < 0) { + verbose(env, "tracking tail call prog failed\n"); + goto out_free; + } + } + /* Use bpf_prog_F_tag to indicate functions in stack traces. * Long term would need debug info to populate names */ @@ -10067,6 +10670,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) num_exentries++; } func[i]->aux->num_exentries = num_exentries; + func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { err = -ENOTSUPP; @@ -10074,6 +10678,19 @@ static int jit_subprogs(struct bpf_verifier_env *env) } cond_resched(); } + + /* Untrack main program's aux structs so that during map_poke_run() + * we will not stumble upon the unfilled poke descriptors; each + * of the main program's poke descs got distributed across subprogs + * and got tracked onto map, so we are sure that none of them will + * be missed after the operation below + */ + for (i = 0; i < prog->aux->size_poke_tab; i++) { + map_ptr = prog->aux->poke_tab[i].tail_call.map; + + map_ptr->ops->map_poke_untrack(map_ptr, prog->aux); + } + /* at this point all bpf functions were successfully JITed * now populate all bpf_calls with correct addresses and * run last pass of JIT @@ -10142,9 +10759,16 @@ static int jit_subprogs(struct bpf_verifier_env *env) bpf_prog_free_unused_jited_linfo(prog); return 0; out_free: - for (i = 0; i < env->subprog_cnt; i++) - if (func[i]) - bpf_jit_free(func[i]); + for (i = 0; i < env->subprog_cnt; i++) { + if (!func[i]) + continue; + + for (j = 0; j < func[i]->aux->size_poke_tab; j++) { + map_ptr = func[i]->aux->poke_tab[j].tail_call.map; + map_ptr->ops->map_poke_untrack(map_ptr, func[i]->aux); + } + bpf_jit_free(func[i]); + } kfree(func); out_undo_insn: /* cleanup main prog to be interpreted */ @@ -10178,6 +10802,13 @@ static int fixup_call_args(struct bpf_verifier_env *env) return err; } #ifndef CONFIG_BPF_JIT_ALWAYS_ON + if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) { + /* When JIT fails the progs with bpf2bpf calls and tail_calls + * have to be rejected, since interpreter doesn't support them yet. + */ + verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n"); + return -EINVAL; + } for (i = 0; i < prog->len; i++, insn++) { if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) @@ -10341,8 +10972,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) * the program array. */ prog->cb_access = 1; - env->prog->aux->stack_depth = MAX_BPF_STACK; - env->prog->aux->max_pkt_offset = MAX_PACKET_OFF; + if (!allow_tail_call_in_subprogs(env)) + prog->aux->stack_depth = MAX_BPF_STACK; + prog->aux->max_pkt_offset = MAX_PACKET_OFF; /* mark bpf_tail_call as different opcode to avoid * conditional branch in the interpeter for every normal @@ -10362,6 +10994,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) .reason = BPF_POKE_REASON_TAIL_CALL, .tail_call.map = BPF_MAP_PTR(aux->map_ptr_state), .tail_call.key = bpf_map_key_immediate(aux), + .insn_idx = i + delta, }; ret = bpf_jit_add_poke_descriptor(prog, &desc); @@ -10427,7 +11060,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) if (insn->imm == BPF_FUNC_map_lookup_elem && ops->map_gen_lookup) { cnt = ops->map_gen_lookup(map_ptr, insn_buf); - if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + if (cnt == -EOPNOTSUPP) + goto patch_map_ops_generic; + if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -10457,7 +11092,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) (int (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_peek_elem, (int (*)(struct bpf_map *map, void *value))NULL)); - +patch_map_ops_generic: switch (insn->imm) { case BPF_FUNC_map_lookup_elem: insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) - @@ -10810,59 +11445,79 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) } #define SECURITY_PREFIX "security_" -static int check_attach_modify_return(struct bpf_prog *prog, unsigned long addr) +static int check_attach_modify_return(unsigned long addr, const char *func_name) { if (within_error_injection_list(addr) || - !strncmp(SECURITY_PREFIX, prog->aux->attach_func_name, - sizeof(SECURITY_PREFIX) - 1)) + !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1)) return 0; return -EINVAL; } -static int check_attach_btf_id(struct bpf_verifier_env *env) +/* non exhaustive list of sleepable bpf_lsm_*() functions */ +BTF_SET_START(btf_sleepable_lsm_hooks) +#ifdef CONFIG_BPF_LSM +BTF_ID(func, bpf_lsm_bprm_committed_creds) +#else +BTF_ID_UNUSED +#endif +BTF_SET_END(btf_sleepable_lsm_hooks) + +static int check_sleepable_lsm_hook(u32 btf_id) +{ + return btf_id_set_contains(&btf_sleepable_lsm_hooks, btf_id); +} + +/* list of non-sleepable functions that are otherwise on + * ALLOW_ERROR_INJECTION list + */ +BTF_SET_START(btf_non_sleepable_error_inject) +/* Three functions below can be called from sleepable and non-sleepable context. + * Assume non-sleepable from bpf safety point of view. + */ +BTF_ID(func, __add_to_page_cache_locked) +BTF_ID(func, should_fail_alloc_page) +BTF_ID(func, should_failslab) +BTF_SET_END(btf_non_sleepable_error_inject) + +static int check_non_sleepable_error_inject(u32 btf_id) +{ + return btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id); +} + +int bpf_check_attach_target(struct bpf_verifier_log *log, + const struct bpf_prog *prog, + const struct bpf_prog *tgt_prog, + u32 btf_id, + struct bpf_attach_target_info *tgt_info) { - struct bpf_prog *prog = env->prog; bool prog_extension = prog->type == BPF_PROG_TYPE_EXT; - struct bpf_prog *tgt_prog = prog->aux->linked_prog; - u32 btf_id = prog->aux->attach_btf_id; const char prefix[] = "btf_trace_"; - struct btf_func_model fmodel; int ret = 0, subprog = -1, i; - struct bpf_trampoline *tr; const struct btf_type *t; bool conservative = true; const char *tname; struct btf *btf; - long addr; - u64 key; - - if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) - return check_struct_ops_btf_id(env); - - if (prog->type != BPF_PROG_TYPE_TRACING && - prog->type != BPF_PROG_TYPE_LSM && - !prog_extension) - return 0; + long addr = 0; if (!btf_id) { - verbose(env, "Tracing programs must provide btf_id\n"); + bpf_log(log, "Tracing programs must provide btf_id\n"); return -EINVAL; } - btf = bpf_prog_get_target_btf(prog); + btf = tgt_prog ? tgt_prog->aux->btf : btf_vmlinux; if (!btf) { - verbose(env, + bpf_log(log, "FENTRY/FEXIT program can only be attached to another program annotated with BTF\n"); return -EINVAL; } t = btf_type_by_id(btf, btf_id); if (!t) { - verbose(env, "attach_btf_id %u is invalid\n", btf_id); + bpf_log(log, "attach_btf_id %u is invalid\n", btf_id); return -EINVAL; } tname = btf_name_by_offset(btf, t->name_off); if (!tname) { - verbose(env, "attach_btf_id %u doesn't have a name\n", btf_id); + bpf_log(log, "attach_btf_id %u doesn't have a name\n", btf_id); return -EINVAL; } if (tgt_prog) { @@ -10874,26 +11529,24 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) break; } if (subprog == -1) { - verbose(env, "Subprog %s doesn't exist\n", tname); + bpf_log(log, "Subprog %s doesn't exist\n", tname); return -EINVAL; } conservative = aux->func_info_aux[subprog].unreliable; if (prog_extension) { if (conservative) { - verbose(env, + bpf_log(log, "Cannot replace static functions\n"); return -EINVAL; } if (!prog->jit_requested) { - verbose(env, + bpf_log(log, "Extension programs should be JITed\n"); return -EINVAL; } - env->ops = bpf_verifier_ops[tgt_prog->type]; - prog->expected_attach_type = tgt_prog->expected_attach_type; } if (!tgt_prog->jited) { - verbose(env, "Can attach to only JITed progs\n"); + bpf_log(log, "Can attach to only JITed progs\n"); return -EINVAL; } if (tgt_prog->type == prog->type) { @@ -10901,7 +11554,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) * Cannot attach program extension to another extension. * It's ok to attach fentry/fexit to extension program. */ - verbose(env, "Cannot recursively attach\n"); + bpf_log(log, "Cannot recursively attach\n"); return -EINVAL; } if (tgt_prog->type == BPF_PROG_TYPE_TRACING && @@ -10923,32 +11576,30 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) * reasonable stack size. Hence extending fentry is not * allowed. */ - verbose(env, "Cannot extend fentry/fexit\n"); + bpf_log(log, "Cannot extend fentry/fexit\n"); return -EINVAL; } - key = ((u64)aux->id) << 32 | btf_id; } else { if (prog_extension) { - verbose(env, "Cannot replace kernel functions\n"); + bpf_log(log, "Cannot replace kernel functions\n"); return -EINVAL; } - key = btf_id; } switch (prog->expected_attach_type) { case BPF_TRACE_RAW_TP: if (tgt_prog) { - verbose(env, + bpf_log(log, "Only FENTRY/FEXIT progs are attachable to another BPF prog\n"); return -EINVAL; } if (!btf_type_is_typedef(t)) { - verbose(env, "attach_btf_id %u is not a typedef\n", + bpf_log(log, "attach_btf_id %u is not a typedef\n", btf_id); return -EINVAL; } if (strncmp(prefix, tname, sizeof(prefix) - 1)) { - verbose(env, "attach_btf_id %u points to wrong type name %s\n", + bpf_log(log, "attach_btf_id %u points to wrong type name %s\n", btf_id, tname); return -EINVAL; } @@ -10962,73 +11613,52 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) /* should never happen in valid vmlinux build */ return -EINVAL; - /* remember two read only pointers that are valid for - * the life time of the kernel - */ - prog->aux->attach_func_name = tname; - prog->aux->attach_func_proto = t; - prog->aux->attach_btf_trace = true; - return 0; + break; case BPF_TRACE_ITER: if (!btf_type_is_func(t)) { - verbose(env, "attach_btf_id %u is not a function\n", + bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); return -EINVAL; } t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) return -EINVAL; - prog->aux->attach_func_name = tname; - prog->aux->attach_func_proto = t; - if (!bpf_iter_prog_supported(prog)) - return -EINVAL; - ret = btf_distill_func_proto(&env->log, btf, t, - tname, &fmodel); - return ret; + ret = btf_distill_func_proto(log, btf, t, tname, &tgt_info->fmodel); + if (ret) + return ret; + break; default: if (!prog_extension) return -EINVAL; - /* fallthrough */ + fallthrough; case BPF_MODIFY_RETURN: case BPF_LSM_MAC: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: - prog->aux->attach_func_name = tname; - if (prog->type == BPF_PROG_TYPE_LSM) { - ret = bpf_lsm_verify_prog(&env->log, prog); - if (ret < 0) - return ret; - } - if (!btf_type_is_func(t)) { - verbose(env, "attach_btf_id %u is not a function\n", + bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); return -EINVAL; } if (prog_extension && - btf_check_type_match(env, prog, btf, t)) + btf_check_type_match(log, prog, btf, t)) return -EINVAL; t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) return -EINVAL; - tr = bpf_trampoline_lookup(key); - if (!tr) - return -ENOMEM; - /* t is either vmlinux type or another program's type */ - prog->aux->attach_func_proto = t; - mutex_lock(&tr->mutex); - if (tr->func.addr) { - prog->aux->trampoline = tr; - goto out; - } - if (tgt_prog && conservative) { - prog->aux->attach_func_proto = NULL; + + if ((prog->aux->saved_dst_prog_type || prog->aux->saved_dst_attach_type) && + (!tgt_prog || prog->aux->saved_dst_prog_type != tgt_prog->type || + prog->aux->saved_dst_attach_type != tgt_prog->expected_attach_type)) + return -EINVAL; + + if (tgt_prog && conservative) t = NULL; - } - ret = btf_distill_func_proto(&env->log, btf, t, - tname, &tr->func.model); + + ret = btf_distill_func_proto(log, btf, t, tname, &tgt_info->fmodel); if (ret < 0) - goto out; + return ret; + if (tgt_prog) { if (subprog == 0) addr = (long) tgt_prog->bpf_func; @@ -11037,31 +11667,137 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } else { addr = kallsyms_lookup_name(tname); if (!addr) { - verbose(env, + bpf_log(log, "The address of function %s cannot be found\n", tname); - ret = -ENOENT; - goto out; + return -ENOENT; } } - if (prog->expected_attach_type == BPF_MODIFY_RETURN) { - ret = check_attach_modify_return(prog, addr); - if (ret) - verbose(env, "%s() is not modifiable\n", - prog->aux->attach_func_name); + if (prog->aux->sleepable) { + ret = -EINVAL; + switch (prog->type) { + case BPF_PROG_TYPE_TRACING: + /* fentry/fexit/fmod_ret progs can be sleepable only if they are + * attached to ALLOW_ERROR_INJECTION and are not in denylist. + */ + if (!check_non_sleepable_error_inject(btf_id) && + within_error_injection_list(addr)) + ret = 0; + break; + case BPF_PROG_TYPE_LSM: + /* LSM progs check that they are attached to bpf_lsm_*() funcs. + * Only some of them are sleepable. + */ + if (check_sleepable_lsm_hook(btf_id)) + ret = 0; + break; + default: + break; + } + if (ret) { + bpf_log(log, "%s is not sleepable\n", tname); + return ret; + } + } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) { + if (tgt_prog) { + bpf_log(log, "can't modify return codes of BPF programs\n"); + return -EINVAL; + } + ret = check_attach_modify_return(addr, tname); + if (ret) { + bpf_log(log, "%s() is not modifiable\n", tname); + return ret; + } } - if (ret) - goto out; - tr->func.addr = (void *)addr; - prog->aux->trampoline = tr; -out: - mutex_unlock(&tr->mutex); - if (ret) - bpf_trampoline_put(tr); + break; + } + tgt_info->tgt_addr = addr; + tgt_info->tgt_name = tname; + tgt_info->tgt_type = t; + return 0; +} + +static int check_attach_btf_id(struct bpf_verifier_env *env) +{ + struct bpf_prog *prog = env->prog; + struct bpf_prog *tgt_prog = prog->aux->dst_prog; + struct bpf_attach_target_info tgt_info = {}; + u32 btf_id = prog->aux->attach_btf_id; + struct bpf_trampoline *tr; + int ret; + u64 key; + + if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING && + prog->type != BPF_PROG_TYPE_LSM) { + verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n"); + return -EINVAL; + } + + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) + return check_struct_ops_btf_id(env); + + if (prog->type != BPF_PROG_TYPE_TRACING && + prog->type != BPF_PROG_TYPE_LSM && + prog->type != BPF_PROG_TYPE_EXT) + return 0; + + ret = bpf_check_attach_target(&env->log, prog, tgt_prog, btf_id, &tgt_info); + if (ret) return ret; + + if (tgt_prog && prog->type == BPF_PROG_TYPE_EXT) { + /* to make freplace equivalent to their targets, they need to + * inherit env->ops and expected_attach_type for the rest of the + * verification + */ + env->ops = bpf_verifier_ops[tgt_prog->type]; + prog->expected_attach_type = tgt_prog->expected_attach_type; + } + + /* store info about the attachment target that will be used later */ + prog->aux->attach_func_proto = tgt_info.tgt_type; + prog->aux->attach_func_name = tgt_info.tgt_name; + + if (tgt_prog) { + prog->aux->saved_dst_prog_type = tgt_prog->type; + prog->aux->saved_dst_attach_type = tgt_prog->expected_attach_type; } + + if (prog->expected_attach_type == BPF_TRACE_RAW_TP) { + prog->aux->attach_btf_trace = true; + return 0; + } else if (prog->expected_attach_type == BPF_TRACE_ITER) { + if (!bpf_iter_prog_supported(prog)) + return -EINVAL; + return 0; + } + + if (prog->type == BPF_PROG_TYPE_LSM) { + ret = bpf_lsm_verify_prog(&env->log, prog); + if (ret < 0) + return ret; + } + + key = bpf_trampoline_compute_key(tgt_prog, btf_id); + tr = bpf_trampoline_get(key, &tgt_info); + if (!tr) + return -ENOMEM; + + prog->aux->dst_trampoline = tr; + return 0; +} + +struct btf *bpf_get_btf_vmlinux(void) +{ + if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { + mutex_lock(&bpf_verifier_lock); + if (!btf_vmlinux) + btf_vmlinux = btf_parse_vmlinux(); + mutex_unlock(&bpf_verifier_lock); + } + return btf_vmlinux; } int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, @@ -11097,12 +11833,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->ops = bpf_verifier_ops[env->prog->type]; is_priv = bpf_capable(); - if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { - mutex_lock(&bpf_verifier_lock); - if (!btf_vmlinux) - btf_vmlinux = btf_parse_vmlinux(); - mutex_unlock(&bpf_verifier_lock); - } + bpf_get_btf_vmlinux(); /* grab the mutex to protect few globals used by verifier */ if (!is_priv) @@ -11145,10 +11876,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; - ret = replace_map_fd_with_map_ptr(env); - if (ret < 0) - goto skip_full_check; - if (bpf_prog_is_dev_bound(env->prog->aux)) { ret = bpf_prog_offload_verifier_prep(env->prog); if (ret) @@ -11174,6 +11901,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret) goto skip_full_check; + ret = resolve_pseudo_ldimm64(env); + if (ret < 0) + goto skip_full_check; + ret = check_cfg(env); if (ret < 0) goto skip_full_check; diff --git a/kernel/capability.c b/kernel/capability.c index 1444f3954d75..de7eac903a2a 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -93,7 +93,7 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) break; case _LINUX_CAPABILITY_VERSION_2: warn_deprecated_v2(); - /* fall through - v3 is otherwise equivalent to v2. */ + fallthrough; /* v3 is otherwise equivalent to v2 */ case _LINUX_CAPABILITY_VERSION_3: *tocopy = _LINUX_CAPABILITY_U32S_3; break; @@ -418,7 +418,7 @@ EXPORT_SYMBOL(ns_capable_noaudit); /** * ns_capable_setid - Determine if the current task has a superior capability * in effect, while signalling that this check is being done from within a - * setid syscall. + * setid or setgroups syscall. * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index dd247747ec14..e41c21819ba0 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2006,7 +2006,6 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); - kernfs_activate(root_cgrp->kn); ret = 0; goto out; @@ -3682,6 +3681,9 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, struct cgroup_subsys_state *css; int ret; + if (!nbytes) + return 0; + /* * If namespaces are delegation boundaries, disallow writes to * files in an non-init namespace root from inside the namespace diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 642415b8c3c9..57b5b5d0a5fd 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -390,7 +390,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) * The top cpuset doesn't have any online cpu as a * consequence of a race between cpuset_hotplug_work * and cpu hotplug notifier. But we know the top - * cpuset's effective_cpus is on its way to to be + * cpuset's effective_cpus is on its way to be * identical to cpu_online_mask. */ cpumask_copy(pmask, cpu_online_mask); diff --git a/kernel/compat.c b/kernel/compat.c index b8d2800bb4b7..05adfd6fa8bf 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -255,11 +255,11 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) return -EFAULT; switch (_NSIG_WORDS) { case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 ); - /* fall through */ + fallthrough; case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 ); - /* fall through */ + fallthrough; case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 ); - /* fall through */ + fallthrough; case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 ); } #else diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 44a259338e33..f7e1d0eccdbc 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -15,18 +15,28 @@ static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain); -static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls) +static int cpu_pm_notify(enum cpu_pm_event event) { int ret; /* - * __atomic_notifier_call_chain has a RCU read critical section, which + * atomic_notifier_call_chain has a RCU read critical section, which * could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let * RCU know this. */ rcu_irq_enter_irqson(); - ret = __atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL, - nr_to_call, nr_calls); + ret = atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL); + rcu_irq_exit_irqson(); + + return notifier_to_errno(ret); +} + +static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event event_down) +{ + int ret; + + rcu_irq_enter_irqson(); + ret = atomic_notifier_call_chain_robust(&cpu_pm_notifier_chain, event_up, event_down, NULL); rcu_irq_exit_irqson(); return notifier_to_errno(ret); @@ -80,18 +90,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); */ int cpu_pm_enter(void) { - int nr_calls = 0; - int ret = 0; - - ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls); - if (ret) - /* - * Inform listeners (nr_calls - 1) about failure of CPU PM - * PM entry who are notified earlier to prepare for it. - */ - cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL); - - return ret; + return cpu_pm_notify_robust(CPU_PM_ENTER, CPU_PM_ENTER_FAILED); } EXPORT_SYMBOL_GPL(cpu_pm_enter); @@ -109,7 +108,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_enter); */ int cpu_pm_exit(void) { - return cpu_pm_notify(CPU_PM_EXIT, -1, NULL); + return cpu_pm_notify(CPU_PM_EXIT); } EXPORT_SYMBOL_GPL(cpu_pm_exit); @@ -131,18 +130,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit); */ int cpu_cluster_pm_enter(void) { - int nr_calls = 0; - int ret = 0; - - ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls); - if (ret) - /* - * Inform listeners (nr_calls - 1) about failure of CPU cluster - * PM entry who are notified earlier to prepare for it. - */ - cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL); - - return ret; + return cpu_pm_notify_robust(CPU_CLUSTER_PM_ENTER, CPU_CLUSTER_PM_ENTER_FAILED); } EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); @@ -163,7 +151,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); */ int cpu_cluster_pm_exit(void) { - return cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL); + return cpu_pm_notify(CPU_CLUSTER_PM_EXIT); } EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit); diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index b16dbc1bf056..1e75a8923a8d 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -80,7 +80,7 @@ static int exception_level; struct kgdb_io *dbg_io_ops; static DEFINE_SPINLOCK(kgdb_registration_lock); -/* Action for the reboot notifiter, a global allow kdb to change it */ +/* Action for the reboot notifier, a global allow kdb to change it */ static int kgdbreboot; /* kgdb console driver is loaded */ static int kgdb_con_registered; @@ -94,14 +94,6 @@ int dbg_switch_cpu; /* Use kdb or gdbserver mode */ int dbg_kdb_mode = 1; -static int __init opt_kgdb_con(char *str) -{ - kgdb_use_con = 1; - return 0; -} - -early_param("kgdbcon", opt_kgdb_con); - module_param(kgdb_use_con, int, 0644); module_param(kgdbreboot, int, 0644); @@ -163,7 +155,7 @@ early_param("nokgdbroundup", opt_nokgdbroundup); /* * Weak aliases for breakpoint management, - * can be overriden by architectures when needed: + * can be overridden by architectures when needed: */ int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) { @@ -177,17 +169,23 @@ int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); return err; } +NOKPROBE_SYMBOL(kgdb_arch_set_breakpoint); int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) { return copy_to_kernel_nofault((char *)bpt->bpt_addr, (char *)bpt->saved_instr, BREAK_INSTR_SIZE); } +NOKPROBE_SYMBOL(kgdb_arch_remove_breakpoint); int __weak kgdb_validate_break_address(unsigned long addr) { struct kgdb_bkpt tmp; int err; + + if (kgdb_within_blocklist(addr)) + return -EINVAL; + /* Validate setting the breakpoint and then removing it. If the * remove fails, the kernel needs to emit a bad message because we * are deep trouble not being able to put things back the way we @@ -208,6 +206,7 @@ unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) { return instruction_pointer(regs); } +NOKPROBE_SYMBOL(kgdb_arch_pc); int __weak kgdb_arch_init(void) { @@ -218,6 +217,7 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs) { return 0; } +NOKPROBE_SYMBOL(kgdb_skipexception); #ifdef CONFIG_SMP @@ -239,6 +239,7 @@ void __weak kgdb_call_nmi_hook(void *ignored) */ kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs()); } +NOKPROBE_SYMBOL(kgdb_call_nmi_hook); void __weak kgdb_roundup_cpus(void) { @@ -272,6 +273,7 @@ void __weak kgdb_roundup_cpus(void) kgdb_info[cpu].rounding_up = false; } } +NOKPROBE_SYMBOL(kgdb_roundup_cpus); #endif @@ -298,6 +300,7 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) /* Force flush instruction cache if it was outside the mm */ flush_icache_range(addr, addr + BREAK_INSTR_SIZE); } +NOKPROBE_SYMBOL(kgdb_flush_swbreak_addr); /* * SW breakpoint management: @@ -325,6 +328,7 @@ int dbg_activate_sw_breakpoints(void) } return ret; } +NOKPROBE_SYMBOL(dbg_activate_sw_breakpoints); int dbg_set_sw_break(unsigned long addr) { @@ -388,6 +392,7 @@ int dbg_deactivate_sw_breakpoints(void) } return ret; } +NOKPROBE_SYMBOL(dbg_deactivate_sw_breakpoints); int dbg_remove_sw_break(unsigned long addr) { @@ -509,6 +514,7 @@ static int kgdb_io_ready(int print_wait) } return 1; } +NOKPROBE_SYMBOL(kgdb_io_ready); static int kgdb_reenter_check(struct kgdb_state *ks) { @@ -556,6 +562,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) return 1; } +NOKPROBE_SYMBOL(kgdb_reenter_check); static void dbg_touch_watchdogs(void) { @@ -563,6 +570,7 @@ static void dbg_touch_watchdogs(void) clocksource_touch_watchdog(); rcu_cpu_stall_reset(); } +NOKPROBE_SYMBOL(dbg_touch_watchdogs); static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, int exception_state) @@ -752,6 +760,8 @@ cpu_master_loop: } } + dbg_activate_sw_breakpoints(); + /* Call the I/O driver's post_exception routine */ if (dbg_io_ops->post_exception) dbg_io_ops->post_exception(); @@ -794,6 +804,7 @@ kgdb_restore: return kgdb_info[cpu].ret_state; } +NOKPROBE_SYMBOL(kgdb_cpu_enter); /* * kgdb_handle_exception() - main entry point from a kernel exception @@ -838,6 +849,7 @@ out: arch_kgdb_ops.enable_nmi(1); return ret; } +NOKPROBE_SYMBOL(kgdb_handle_exception); /* * GDB places a breakpoint at this function to know dynamically loaded objects. @@ -872,6 +884,7 @@ int kgdb_nmicallback(int cpu, void *regs) #endif return 1; } +NOKPROBE_SYMBOL(kgdb_nmicallback); int kgdb_nmicallin(int cpu, int trapnr, void *regs, int err_code, atomic_t *send_ready) @@ -897,6 +910,7 @@ int kgdb_nmicallin(int cpu, int trapnr, void *regs, int err_code, #endif return 1; } +NOKPROBE_SYMBOL(kgdb_nmicallin); static void kgdb_console_write(struct console *co, const char *s, unsigned count) @@ -920,6 +934,20 @@ static struct console kgdbcons = { .index = -1, }; +static int __init opt_kgdb_con(char *str) +{ + kgdb_use_con = 1; + + if (kgdb_io_module_registered && !kgdb_con_registered) { + register_console(&kgdbcons); + kgdb_con_registered = 1; + } + + return 0; +} + +early_param("kgdbcon", opt_kgdb_con); + #ifdef CONFIG_MAGIC_SYSRQ static void sysrq_handle_dbg(int key) { diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index a790026e42d0..a77df59d9ca5 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -725,7 +725,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) } } - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (i >= ks->thr_query && !finished) { int_to_threadref(thref, p->pid); ptr = pack_threadid(ptr, thref); @@ -735,7 +735,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) finished = 1; } i++; - } while_each_thread(g, p); + } *(--ptr) = '\0'; break; @@ -1046,14 +1046,14 @@ int gdb_serial_stub(struct kgdb_state *ks) return DBG_PASS_EVENT; } #endif - /* Fall through */ + fallthrough; case 'C': /* Exception passing */ tmp = gdb_cmd_exception_pass(ks); if (tmp > 0) goto default_handle; if (tmp == 0) break; - /* Fall through - on tmp < 0 */ + fallthrough; /* on tmp < 0 */ case 'c': /* Continue packet */ case 's': /* Single step packet */ if (kgdb_contthread && kgdb_contthread != current) { @@ -1061,8 +1061,7 @@ int gdb_serial_stub(struct kgdb_state *ks) error_packet(remcom_out_buffer, -EINVAL); break; } - dbg_activate_sw_breakpoints(); - /* Fall through - to default processing */ + fallthrough; /* to default processing */ default: default_handle: error = kgdb_arch_handle_exception(ks->ex_vector, diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index d7ebb2c79cb8..ec4940146612 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -307,6 +307,15 @@ static int kdb_bp(int argc, const char **argv) return KDB_BADINT; /* + * This check is redundant (since the breakpoint machinery should + * be doing the same check during kdb_bp_install) but gives the + * user immediate feedback. + */ + diag = kgdb_validate_break_address(template.bp_addr); + if (diag) + return diag; + + /* * Find an empty bp structure to allocate */ for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 18e03aba2cfc..1f9f0e47aeda 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -149,14 +149,14 @@ kdb_bt(int argc, const char **argv) return 0; } /* Now the inactive tasks */ - kdb_do_each_thread(g, p) { + for_each_process_thread(g, p) { if (KDB_FLAG(CMD_INTERRUPT)) return 0; if (task_curr(p)) continue; if (kdb_bt1(p, mask, btaprompt)) return 0; - } kdb_while_each_thread(g, p); + } } else if (strcmp(argv[0], "btp") == 0) { struct task_struct *p; unsigned long pid; diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 53a0df6e4d92..0220afda3200 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -147,7 +147,6 @@ int kdb_stub(struct kgdb_state *ks) return DBG_PASS_EVENT; } kdb_bp_install(ks->linux_regs); - dbg_activate_sw_breakpoints(); /* Set the exit state to a single step or a continue */ if (KDB_STATE(DOING_SS)) gdbstub_state(ks, "s"); @@ -167,7 +166,6 @@ int kdb_stub(struct kgdb_state *ks) * differently vs the gdbstub */ kgdb_single_step = 0; - dbg_deactivate_sw_breakpoints(); return DBG_SWITCH_CPU_EVENT; } return kgdb_info[ks->cpu].ret_state; diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 9d847ab851db..6735ac36b718 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -545,18 +545,18 @@ static int kdb_search_string(char *searched, char *searchfor) static void kdb_msg_write(const char *msg, int msg_len) { struct console *c; + const char *cp; + int len; if (msg_len == 0) return; - if (dbg_io_ops) { - const char *cp = msg; - int len = msg_len; + cp = msg; + len = msg_len; - while (len--) { - dbg_io_ops->write_char(*cp); - cp++; - } + while (len--) { + dbg_io_ops->write_char(*cp); + cp++; } for_each_console(c) { @@ -706,12 +706,16 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) size_avail = sizeof(kdb_buffer) - len; goto kdb_print_out; } - if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH) + if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH) { /* * This was a interactive search (using '/' at more - * prompt) and it has completed. Clear the flag. + * prompt) and it has completed. Replace the \0 with + * its original value to ensure multi-line strings + * are handled properly, and return to normal mode. */ + *cphold = replaced_byte; kdb_grepping_flag = 0; + } /* * at this point the string is a full line and * should be printed, up to the null. diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c index 750497b0003a..f877a0a0d7cf 100644 --- a/kernel/debug/kdb/kdb_keyboard.c +++ b/kernel/debug/kdb/kdb_keyboard.c @@ -173,11 +173,11 @@ int kdb_get_kbd_char(void) case KT_LATIN: if (isprint(keychar)) break; /* printable characters */ - /* fall through */ + fallthrough; case KT_SPEC: if (keychar == K_ENTER) break; - /* fall through */ + fallthrough; default: return -1; /* ignore unprintables */ } diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 5c7949061671..930ac1b25ec7 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2299,10 +2299,10 @@ void kdb_ps_suppressed(void) if (kdb_task_state(p, mask_I)) ++idle; } - kdb_do_each_thread(g, p) { + for_each_process_thread(g, p) { if (kdb_task_state(p, mask_M)) ++daemon; - } kdb_while_each_thread(g, p); + } if (idle || daemon) { if (idle) kdb_printf("%d idle process%s (state I)%s\n", @@ -2370,12 +2370,12 @@ static int kdb_ps(int argc, const char **argv) } kdb_printf("\n"); /* Now the real tasks */ - kdb_do_each_thread(g, p) { + for_each_process_thread(g, p) { if (KDB_FLAG(CMD_INTERRUPT)) return 0; if (kdb_task_state(p, mask)) kdb_ps1(p); - } kdb_while_each_thread(g, p); + } return 0; } diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 2e296e4a234c..a4281fb99299 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -230,10 +230,6 @@ extern struct task_struct *kdb_curr_task(int); #define kdb_task_has_cpu(p) (task_curr(p)) -/* Simplify coexistence with NPTL */ -#define kdb_do_each_thread(g, p) do_each_thread(g, p) -#define kdb_while_each_thread(g, p) while_each_thread(g, p) - #define GFP_KDB (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL) extern void *debug_kmalloc(size_t size, gfp_t flags); diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 004c5b6c87f8..6226502ce049 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -432,7 +432,7 @@ int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size) *word = w8; break; } - /* fall through */ + fallthrough; default: diag = KDB_BADWIDTH; kdb_printf("kdb_getphysword: bad width %ld\n", (long) size); @@ -481,7 +481,7 @@ int kdb_getword(unsigned long *word, unsigned long addr, size_t size) *word = w8; break; } - /* fall through */ + fallthrough; default: diag = KDB_BADWIDTH; kdb_printf("kdb_getword: bad width %ld\n", (long) size); @@ -525,7 +525,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size) diag = kdb_putarea(addr, w8); break; } - /* fall through */ + fallthrough; default: diag = KDB_BADWIDTH; kdb_printf("kdb_putword: bad width %ld\n", (long) size); diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 847a9d1fa634..c99de4a21458 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -9,6 +9,7 @@ config HAS_DMA default y config DMA_OPS + depends on HAS_DMA bool # @@ -43,6 +44,12 @@ config ARCH_HAS_DMA_SET_MASK config ARCH_HAS_DMA_WRITE_COMBINE bool +# +# Select if the architectures provides the arch_dma_mark_clean hook +# +config ARCH_HAS_DMA_MARK_CLEAN + bool + config DMA_DECLARE_COHERENT bool @@ -68,9 +75,6 @@ config ARCH_HAS_DMA_PREP_COHERENT config ARCH_HAS_FORCE_DMA_UNENCRYPTED bool -config DMA_NONCOHERENT_CACHE_SYNC - bool - config DMA_VIRT_OPS bool depends on HAS_DMA @@ -114,10 +118,21 @@ config DMA_CMA You can disable CMA by specifying "cma=0" on the kernel's command line. - For more information see <include/linux/dma-contiguous.h>. + For more information see <kernel/dma/contiguous.c>. If unsure, say "n". if DMA_CMA + +config DMA_PERNUMA_CMA + bool "Enable separate DMA Contiguous Memory Area for each NUMA Node" + default NUMA && ARM64 + help + Enable this option to get pernuma CMA areas so that devices like + ARM64 SMMU can get local memory by DMA coherent APIs. + + You can set the size of pernuma CMA by specifying "cma_pernuma=size" + on the kernel's command line. + comment "Default contiguous memory area size:" config CMA_SIZE_MBYTES @@ -162,7 +177,7 @@ endchoice config CMA_ALIGNMENT int "Maximum PAGE_SIZE order of alignment for contiguous buffers" - range 4 12 + range 2 12 default 8 help DMA mapping framework by default aligns all buffers to the smallest diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index 32c7c1942bbd..dc755ab68aab 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_HAS_DMA) += mapping.o direct.o +obj-$(CONFIG_DMA_OPS) += ops_helpers.o obj-$(CONFIG_DMA_OPS) += dummy.o obj-$(CONFIG_DMA_CMA) += contiguous.o obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 2a0c4985f38e..5b5b6c7ec7f2 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -7,7 +7,8 @@ #include <linux/slab.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/dma-mapping.h> +#include <linux/dma-direct.h> +#include <linux/dma-map-ops.h> struct dma_coherent_mem { void *virt_base; @@ -32,9 +33,8 @@ static inline dma_addr_t dma_get_device_base(struct device *dev, struct dma_coherent_mem * mem) { if (mem->use_dev_dma_pfn_offset) - return (mem->pfn_base - dev->dma_pfn_offset) << PAGE_SHIFT; - else - return mem->device_base; + return phys_to_dma(dev, PFN_PHYS(mem->pfn_base)); + return mem->device_base; } static int dma_init_coherent_memory(phys_addr_t phys_addr, @@ -107,6 +107,23 @@ static int dma_assign_coherent_memory(struct device *dev, return 0; } +/* + * Declare a region of memory to be handed out by dma_alloc_coherent() when it + * is asked for coherent memory for this device. This shall only be used + * from platform code, usually based on the device tree description. + * + * phys_addr is the CPU physical address to which the memory is currently + * assigned (this will be ioremapped so the CPU can access the region). + * + * device_addr is the DMA address the device needs to be programmed with to + * actually address this memory (this will be handed out as the dma_addr_t in + * dma_alloc_coherent()). + * + * size is the size of the area (must be a multiple of PAGE_SIZE). + * + * As a simplification for the platforms, only *one* such region of memory may + * be declared per device. + */ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size) { diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index cff7e60968b9..16b95ff12e4d 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -5,6 +5,34 @@ * Written by: * Marek Szyprowski <m.szyprowski@samsung.com> * Michal Nazarewicz <mina86@mina86.com> + * + * Contiguous Memory Allocator + * + * The Contiguous Memory Allocator (CMA) makes it possible to + * allocate big contiguous chunks of memory after the system has + * booted. + * + * Why is it needed? + * + * Various devices on embedded systems have no scatter-getter and/or + * IO map support and require contiguous blocks of memory to + * operate. They include devices such as cameras, hardware video + * coders, etc. + * + * Such devices often require big memory buffers (a full HD frame + * is, for instance, more then 2 mega pixels large, i.e. more than 6 + * MB of memory), which makes mechanisms such as kmalloc() or + * alloc_page() ineffective. + * + * At the same time, a solution where a big memory region is + * reserved for a device is suboptimal since often more memory is + * reserved then strictly required and, moreover, the memory is + * inaccessible to page system even if device drivers don't use it. + * + * CMA tries to solve this issue by operating on memory regions + * where only movable pages can be allocated from. This way, kernel + * can use the memory for pagecache and when device driver requests + * it, allocated pages can be migrated. */ #define pr_fmt(fmt) "cma: " fmt @@ -16,12 +44,11 @@ #endif #include <asm/page.h> -#include <asm/dma-contiguous.h> #include <linux/memblock.h> #include <linux/err.h> #include <linux/sizes.h> -#include <linux/dma-contiguous.h> +#include <linux/dma-map-ops.h> #include <linux/cma.h> #ifdef CONFIG_CMA_SIZE_MBYTES @@ -69,20 +96,24 @@ static int __init early_cma(char *p) } early_param("cma", early_cma); +#ifdef CONFIG_DMA_PERNUMA_CMA + +static struct cma *dma_contiguous_pernuma_area[MAX_NUMNODES]; +static phys_addr_t pernuma_size_bytes __initdata; + +static int __init early_cma_pernuma(char *p) +{ + pernuma_size_bytes = memparse(p, &p); + return 0; +} +early_param("cma_pernuma", early_cma_pernuma); +#endif + #ifdef CONFIG_CMA_SIZE_PERCENTAGE static phys_addr_t __init __maybe_unused cma_early_percent_memory(void) { - struct memblock_region *reg; - unsigned long total_pages = 0; - - /* - * We cannot use memblock_phys_mem_size() here, because - * memblock_analyze() has not been called yet. - */ - for_each_memblock(memory, reg) - total_pages += memblock_region_memory_end_pfn(reg) - - memblock_region_memory_base_pfn(reg); + unsigned long total_pages = PHYS_PFN(memblock_phys_mem_size()); return (total_pages * CONFIG_CMA_SIZE_PERCENTAGE / 100) << PAGE_SHIFT; } @@ -96,6 +127,34 @@ static inline __maybe_unused phys_addr_t cma_early_percent_memory(void) #endif +#ifdef CONFIG_DMA_PERNUMA_CMA +void __init dma_pernuma_cma_reserve(void) +{ + int nid; + + if (!pernuma_size_bytes) + return; + + for_each_online_node(nid) { + int ret; + char name[CMA_MAX_NAME]; + struct cma **cma = &dma_contiguous_pernuma_area[nid]; + + snprintf(name, sizeof(name), "pernuma%d", nid); + ret = cma_declare_contiguous_nid(0, pernuma_size_bytes, 0, 0, + 0, false, name, cma, nid); + if (ret) { + pr_warn("%s: reservation failed: err %d, node %d", __func__, + ret, nid); + continue; + } + + pr_debug("%s: reserved %llu MiB on node %d\n", __func__, + (unsigned long long)pernuma_size_bytes / SZ_1M, nid); + } +} +#endif + /** * dma_contiguous_reserve() - reserve area(s) for contiguous memory handling * @limit: End address of the reserved memory (optional, 0 for any). @@ -143,6 +202,11 @@ void __init dma_contiguous_reserve(phys_addr_t limit) } } +void __weak +dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) +{ +} + /** * dma_contiguous_reserve_area() - reserve custom contiguous area * @size: Size of the reserved area (in bytes), @@ -228,23 +292,44 @@ static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp) * @size: Requested allocation size. * @gfp: Allocation flags. * - * This function allocates contiguous memory buffer for specified device. It - * tries to use device specific contiguous memory area if available, or the - * default global one. + * tries to use device specific contiguous memory area if available, or it + * tries to use per-numa cma, if the allocation fails, it will fallback to + * try default global one. * - * Note that it byapss one-page size of allocations from the global area as - * the addresses within one page are always contiguous, so there is no need - * to waste CMA pages for that kind; it also helps reduce fragmentations. + * Note that it bypass one-page size of allocations from the per-numa and + * global area as the addresses within one page are always contiguous, so + * there is no need to waste CMA pages for that kind; it also helps reduce + * fragmentations. */ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) { +#ifdef CONFIG_DMA_PERNUMA_CMA + int nid = dev_to_node(dev); +#endif + /* CMA can be used only in the context which permits sleeping */ if (!gfpflags_allow_blocking(gfp)) return NULL; if (dev->cma_area) return cma_alloc_aligned(dev->cma_area, size, gfp); - if (size <= PAGE_SIZE || !dma_contiguous_default_area) + if (size <= PAGE_SIZE) + return NULL; + +#ifdef CONFIG_DMA_PERNUMA_CMA + if (nid != NUMA_NO_NODE && !(gfp & (GFP_DMA | GFP_DMA32))) { + struct cma *cma = dma_contiguous_pernuma_area[nid]; + struct page *page; + + if (cma) { + page = cma_alloc_aligned(cma, size, gfp); + if (page) + return page; + } + } +#endif + if (!dma_contiguous_default_area) return NULL; + return cma_alloc_aligned(dma_contiguous_default_area, size, gfp); } @@ -261,9 +346,27 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) */ void dma_free_contiguous(struct device *dev, struct page *page, size_t size) { - if (!cma_release(dev_get_cma_area(dev), page, - PAGE_ALIGN(size) >> PAGE_SHIFT)) - __free_pages(page, get_order(size)); + unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + /* if dev has its own cma, free page from there */ + if (dev->cma_area) { + if (cma_release(dev->cma_area, page, count)) + return; + } else { + /* + * otherwise, page is from either per-numa cma or default cma + */ +#ifdef CONFIG_DMA_PERNUMA_CMA + if (cma_release(dma_contiguous_pernuma_area[page_to_nid(page)], + page, count)) + return; +#endif + if (cma_release(dma_contiguous_default_area, page, count)) + return; + } + + /* not in any cma, free from buddy */ + __free_pages(page, get_order(size)); } /* @@ -279,14 +382,14 @@ void dma_free_contiguous(struct device *dev, struct page *page, size_t size) static int rmem_cma_device_init(struct reserved_mem *rmem, struct device *dev) { - dev_set_cma_area(dev, rmem->priv); + dev->cma_area = rmem->priv; return 0; } static void rmem_cma_device_release(struct reserved_mem *rmem, struct device *dev) { - dev_set_cma_area(dev, NULL); + dev->cma_area = NULL; } static const struct reserved_mem_ops rmem_cma_ops = { @@ -327,7 +430,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) dma_contiguous_early_fixup(rmem->base, rmem->size); if (default_cma) - dma_contiguous_set_default(cma); + dma_contiguous_default_area = cma; rmem->ops = &rmem_cma_ops; rmem->priv = cma; diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 8e9f7b301c6d..14de1271463f 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -9,10 +9,9 @@ #include <linux/sched/task_stack.h> #include <linux/scatterlist.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/sched/task.h> #include <linux/stacktrace.h> -#include <linux/dma-debug.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> #include <linux/debugfs.h> @@ -24,8 +23,8 @@ #include <linux/ctype.h> #include <linux/list.h> #include <linux/slab.h> - #include <asm/sections.h> +#include "debug.h" #define HASH_SIZE 16384ULL #define HASH_FN_SHIFT 13 @@ -1219,7 +1218,7 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, entry->dev = dev; entry->type = dma_debug_single; entry->pfn = page_to_pfn(page); - entry->offset = offset, + entry->offset = offset; entry->dev_addr = dma_addr; entry->size = size; entry->direction = direction; @@ -1235,7 +1234,6 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, add_dma_entry(entry); } -EXPORT_SYMBOL(debug_dma_map_page); void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { @@ -1290,7 +1288,6 @@ void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, return; check_unmap(&ref); } -EXPORT_SYMBOL(debug_dma_unmap_page); void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, int mapped_ents, int direction) @@ -1310,7 +1307,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, entry->type = dma_debug_sg; entry->dev = dev; entry->pfn = page_to_pfn(sg_page(s)); - entry->offset = s->offset, + entry->offset = s->offset; entry->size = sg_dma_len(s); entry->dev_addr = sg_dma_address(s); entry->direction = direction; @@ -1328,7 +1325,6 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, add_dma_entry(entry); } } -EXPORT_SYMBOL(debug_dma_map_sg); static int get_nr_mapped_entries(struct device *dev, struct dma_debug_entry *ref) @@ -1380,7 +1376,6 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, check_unmap(&ref); } } -EXPORT_SYMBOL(debug_dma_unmap_sg); void debug_dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t dma_addr, void *virt) @@ -1466,7 +1461,6 @@ void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, add_dma_entry(entry); } -EXPORT_SYMBOL(debug_dma_map_resource); void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, size_t size, int direction) @@ -1484,7 +1478,6 @@ void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, check_unmap(&ref); } -EXPORT_SYMBOL(debug_dma_unmap_resource); void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) @@ -1503,7 +1496,6 @@ void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, check_sync(dev, &ref, true); } -EXPORT_SYMBOL(debug_dma_sync_single_for_cpu); void debug_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, @@ -1523,7 +1515,6 @@ void debug_dma_sync_single_for_device(struct device *dev, check_sync(dev, &ref, false); } -EXPORT_SYMBOL(debug_dma_sync_single_for_device); void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, int direction) @@ -1556,7 +1547,6 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, check_sync(dev, &ref, true); } } -EXPORT_SYMBOL(debug_dma_sync_sg_for_cpu); void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, int direction) @@ -1588,7 +1578,6 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, check_sync(dev, &ref, false); } } -EXPORT_SYMBOL(debug_dma_sync_sg_for_device); static int __init dma_debug_driver_setup(char *str) { diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h new file mode 100644 index 000000000000..83643b3010b2 --- /dev/null +++ b/kernel/dma/debug.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2008 Advanced Micro Devices, Inc. + * + * Author: Joerg Roedel <joerg.roedel@amd.com> + */ + +#ifndef _KERNEL_DMA_DEBUG_H +#define _KERNEL_DMA_DEBUG_H + +#ifdef CONFIG_DMA_API_DEBUG +extern void debug_dma_map_page(struct device *dev, struct page *page, + size_t offset, size_t size, + int direction, dma_addr_t dma_addr); + +extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, int direction); + +extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, + int nents, int mapped_ents, int direction); + +extern void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, + int nelems, int dir); + +extern void debug_dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t dma_addr, void *virt); + +extern void debug_dma_free_coherent(struct device *dev, size_t size, + void *virt, dma_addr_t addr); + +extern void debug_dma_map_resource(struct device *dev, phys_addr_t addr, + size_t size, int direction, + dma_addr_t dma_addr); + +extern void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, + size_t size, int direction); + +extern void debug_dma_sync_single_for_cpu(struct device *dev, + dma_addr_t dma_handle, size_t size, + int direction); + +extern void debug_dma_sync_single_for_device(struct device *dev, + dma_addr_t dma_handle, + size_t size, int direction); + +extern void debug_dma_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sg, + int nelems, int direction); + +extern void debug_dma_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, + int nelems, int direction); +#else /* CONFIG_DMA_API_DEBUG */ +static inline void debug_dma_map_page(struct device *dev, struct page *page, + size_t offset, size_t size, + int direction, dma_addr_t dma_addr) +{ +} + +static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, int direction) +{ +} + +static inline void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, + int nents, int mapped_ents, int direction) +{ +} + +static inline void debug_dma_unmap_sg(struct device *dev, + struct scatterlist *sglist, + int nelems, int dir) +{ +} + +static inline void debug_dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t dma_addr, void *virt) +{ +} + +static inline void debug_dma_free_coherent(struct device *dev, size_t size, + void *virt, dma_addr_t addr) +{ +} + +static inline void debug_dma_map_resource(struct device *dev, phys_addr_t addr, + size_t size, int direction, + dma_addr_t dma_addr) +{ +} + +static inline void debug_dma_unmap_resource(struct device *dev, + dma_addr_t dma_addr, size_t size, + int direction) +{ +} + +static inline void debug_dma_sync_single_for_cpu(struct device *dev, + dma_addr_t dma_handle, + size_t size, int direction) +{ +} + +static inline void debug_dma_sync_single_for_device(struct device *dev, + dma_addr_t dma_handle, + size_t size, int direction) +{ +} + +static inline void debug_dma_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sg, + int nelems, int direction) +{ +} + +static inline void debug_dma_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, + int nelems, int direction) +{ +} +#endif /* CONFIG_DMA_API_DEBUG */ +#endif /* _KERNEL_DMA_DEBUG_H */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index bb0041e99659..06c111544f61 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -1,21 +1,22 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (C) 2018 Christoph Hellwig. + * Copyright (C) 2018-2020 Christoph Hellwig. * * DMA operations that map physical memory directly without using an IOMMU. */ #include <linux/memblock.h> /* for max_pfn */ #include <linux/export.h> #include <linux/mm.h> -#include <linux/dma-direct.h> +#include <linux/dma-map-ops.h> #include <linux/scatterlist.h> -#include <linux/dma-contiguous.h> #include <linux/pfn.h> #include <linux/vmalloc.h> #include <linux/set_memory.h> +#include <linux/slab.h> +#include "direct.h" /* - * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it + * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use * it for entirely different regions. In that case the arch code needs to * override the variable below for dma-direct to work properly. */ @@ -25,7 +26,7 @@ static inline dma_addr_t phys_to_dma_direct(struct device *dev, phys_addr_t phys) { if (force_dma_unencrypted(dev)) - return __phys_to_dma(dev, phys); + return phys_to_dma_unencrypted(dev, phys); return phys_to_dma(dev, phys); } @@ -43,16 +44,11 @@ u64 dma_direct_get_required_mask(struct device *dev) return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } -gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, +static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, u64 *phys_limit) { u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit); - if (force_dma_unencrypted(dev)) - *phys_limit = __dma_to_phys(dev, dma_limit); - else - *phys_limit = dma_to_phys(dev, dma_limit); - /* * Optimistically try the zone that the physical address mask falls * into first. If that returns memory that isn't actually addressable @@ -61,6 +57,7 @@ gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, * Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding * zones. */ + *phys_limit = dma_to_phys(dev, dma_limit); if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits)) return GFP_DMA; if (*phys_limit <= DMA_BIT_MASK(32)) @@ -68,47 +65,18 @@ gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, return 0; } -bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) -{ - return phys_to_dma_direct(dev, phys) + size - 1 <= - min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); -} - -/* - * Decrypting memory is allowed to block, so if this device requires - * unencrypted memory it must come from atomic pools. - */ -static inline bool dma_should_alloc_from_pool(struct device *dev, gfp_t gfp, - unsigned long attrs) +static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { - if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) - return false; - if (gfpflags_allow_blocking(gfp)) - return false; - if (force_dma_unencrypted(dev)) - return true; - if (!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP)) - return false; - if (dma_alloc_need_uncached(dev, attrs)) - return true; - return false; -} + dma_addr_t dma_addr = phys_to_dma_direct(dev, phys); -static inline bool dma_should_free_from_pool(struct device *dev, - unsigned long attrs) -{ - if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) - return true; - if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && - !force_dma_unencrypted(dev)) + if (dma_addr == DMA_MAPPING_ERROR) return false; - if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP)) - return true; - return false; + return dma_addr + size - 1 <= + min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, - gfp_t gfp, unsigned long attrs) + gfp_t gfp) { int node = dev_to_node(dev); struct page *page = NULL; @@ -116,11 +84,6 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, WARN_ON_ONCE(!PAGE_ALIGNED(size)); - if (attrs & DMA_ATTR_NO_WARN) - gfp |= __GFP_NOWARN; - - /* we always manually zero the memory once we are done: */ - gfp &= ~__GFP_ZERO; gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_limit); page = dma_alloc_contiguous(dev, size, gfp); @@ -151,7 +114,23 @@ again: return page; } -void *dma_direct_alloc_pages(struct device *dev, size_t size, +static void *dma_direct_alloc_from_pool(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) +{ + struct page *page; + u64 phys_mask; + void *ret; + + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_mask); + page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok); + if (!page) + return NULL; + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); + return ret; +} + +void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { struct page *page; @@ -159,30 +138,44 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, int err; size = PAGE_ALIGN(size); - - if (dma_should_alloc_from_pool(dev, gfp, attrs)) { - ret = dma_alloc_from_pool(dev, size, &page, gfp); - if (!ret) - return NULL; - goto done; - } - - page = __dma_direct_alloc_pages(dev, size, gfp, attrs); - if (!page) - return NULL; + if (attrs & DMA_ATTR_NO_WARN) + gfp |= __GFP_NOWARN; if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev)) { + page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); + if (!page) + return NULL; /* remove any dirty cache lines on the kernel alias */ if (!PageHighMem(page)) arch_dma_prep_coherent(page, size); + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); /* return the page pointer as the opaque cookie */ - ret = page; - goto done; + return page; } + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + !dev_is_dma_coherent(dev)) + return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); + + /* + * Remapping or decrypting memory may block. If either is required and + * we can't block, allocate the memory from the atomic pools. + */ + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && + !gfpflags_allow_blocking(gfp) && + (force_dma_unencrypted(dev) || + (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev)))) + return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); + + /* we always manually zero the memory once we are done */ + page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); + if (!page) + return NULL; + if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_alloc_need_uncached(dev, attrs)) || + !dev_is_dma_coherent(dev)) || (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) { /* remove any dirty cache lines on the kernel alias */ arch_dma_prep_coherent(page, size); @@ -225,17 +218,14 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, memset(ret, 0, size); if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - dma_alloc_need_uncached(dev, attrs)) { + !dev_is_dma_coherent(dev)) { arch_dma_prep_coherent(page, size); ret = arch_dma_set_uncached(ret, size); if (IS_ERR(ret)) goto out_encrypt_pages; } done: - if (force_dma_unencrypted(dev)) - *dma_handle = __phys_to_dma(dev, page_to_phys(page)); - else - *dma_handle = phys_to_dma(dev, page_to_phys(page)); + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return ret; out_encrypt_pages: @@ -251,16 +241,11 @@ out_free_pages: return NULL; } -void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_addr, unsigned long attrs) +void dma_direct_free(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { unsigned int page_order = get_order(size); - /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ - if (dma_should_free_from_pool(dev, attrs) && - dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) - return; - if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev)) { /* cpu_addr is a struct page cookie, not a kernel address */ @@ -268,6 +253,18 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, return; } + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + !dev_is_dma_coherent(dev)) { + arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); + return; + } + + /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && + dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) + return; + if (force_dma_unencrypted(dev)) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); @@ -279,25 +276,60 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size); } -void *dma_direct_alloc(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) +struct page *dma_direct_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) { - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_alloc_need_uncached(dev, attrs)) - return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); - return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); + struct page *page; + void *ret; + + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && + force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp)) + return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); + + page = __dma_direct_alloc_pages(dev, size, gfp); + if (!page) + return NULL; + if (PageHighMem(page)) { + /* + * Depending on the cma= arguments and per-arch setup + * dma_alloc_contiguous could return highmem pages. + * Without remapping there is no way to return them here, + * so log an error and fail. + */ + dev_info(dev, "Rejecting highmem page from CMA.\n"); + goto out_free_pages; + } + + ret = page_address(page); + if (force_dma_unencrypted(dev)) { + if (set_memory_decrypted((unsigned long)ret, + 1 << get_order(size))) + goto out_free_pages; + } + memset(ret, 0, size); + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); + return page; +out_free_pages: + dma_free_contiguous(dev, page, size); + return NULL; } -void dma_direct_free(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) +void dma_direct_free_pages(struct device *dev, size_t size, + struct page *page, dma_addr_t dma_addr, + enum dma_data_direction dir) { - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_alloc_need_uncached(dev, attrs)) - arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); - else - dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); + unsigned int page_order = get_order(size); + void *vaddr = page_address(page); + + /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && + dma_free_from_pool(dev, vaddr, size)) + return; + + if (force_dma_unencrypted(dev)) + set_memory_encrypted((unsigned long)vaddr, 1 << page_order); + + dma_free_contiguous(dev, page, size); } #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ @@ -340,6 +372,9 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, if (unlikely(is_swiotlb_buffer(paddr))) swiotlb_tbl_sync_single(dev, paddr, sg->length, dir, SYNC_FOR_CPU); + + if (dir == DMA_FROM_DEVICE) + arch_dma_mark_clean(paddr, sg->length); } if (!dev_is_dma_coherent(dev)) @@ -448,13 +483,13 @@ int dma_direct_supported(struct device *dev, u64 mask) return 1; /* - * This check needs to be against the actual bit mask value, so - * use __phys_to_dma() here so that the SME encryption mask isn't + * This check needs to be against the actual bit mask value, so use + * phys_to_dma_unencrypted() here so that the SME encryption mask isn't * part of the check. */ if (IS_ENABLED(CONFIG_ZONE_DMA)) min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits)); - return mask >= __phys_to_dma(dev, min_mask); + return mask >= phys_to_dma_unencrypted(dev, min_mask); } size_t dma_direct_max_mapping_size(struct device *dev) @@ -471,3 +506,45 @@ bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr) return !dev_is_dma_coherent(dev) || is_swiotlb_buffer(dma_to_phys(dev, dma_addr)); } + +/** + * dma_direct_set_offset - Assign scalar offset for a single DMA range. + * @dev: device pointer; needed to "own" the alloced memory. + * @cpu_start: beginning of memory region covered by this offset. + * @dma_start: beginning of DMA/PCI region covered by this offset. + * @size: size of the region. + * + * This is for the simple case of a uniform offset which cannot + * be discovered by "dma-ranges". + * + * It returns -ENOMEM if out of memory, -EINVAL if a map + * already exists, 0 otherwise. + * + * Note: any call to this from a driver is a bug. The mapping needs + * to be described by the device tree or other firmware interfaces. + */ +int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start, + dma_addr_t dma_start, u64 size) +{ + struct bus_dma_region *map; + u64 offset = (u64)cpu_start - (u64)dma_start; + + if (dev->dma_range_map) { + dev_err(dev, "attempt to add DMA range to existing map\n"); + return -EINVAL; + } + + if (!offset) + return 0; + + map = kcalloc(2, sizeof(*map), GFP_KERNEL); + if (!map) + return -ENOMEM; + map[0].cpu_start = cpu_start; + map[0].dma_start = dma_start; + map[0].offset = offset; + map[0].size = size; + dev->dma_range_map = map; + return 0; +} +EXPORT_SYMBOL_GPL(dma_direct_set_offset); diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h new file mode 100644 index 000000000000..b98615578737 --- /dev/null +++ b/kernel/dma/direct.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018 Christoph Hellwig. + * + * DMA operations that map physical memory directly without using an IOMMU. + */ +#ifndef _KERNEL_DMA_DIRECT_H +#define _KERNEL_DMA_DIRECT_H + +#include <linux/dma-direct.h> + +int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs); +bool dma_direct_can_mmap(struct device *dev); +int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs); +bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr); +int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, + enum dma_data_direction dir, unsigned long attrs); +size_t dma_direct_max_mapping_size(struct device *dev); + +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ + defined(CONFIG_SWIOTLB) +void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir); +#else +static inline void dma_direct_sync_sg_for_device(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ +} +#endif + +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ + defined(CONFIG_SWIOTLB) +void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs); +void dma_direct_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir); +#else +static inline void dma_direct_unmap_sg(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir, + unsigned long attrs) +{ +} +static inline void dma_direct_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ +} +#endif + +static inline void dma_direct_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ + phys_addr_t paddr = dma_to_phys(dev, addr); + + if (unlikely(is_swiotlb_buffer(paddr))) + swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); + + if (!dev_is_dma_coherent(dev)) + arch_sync_dma_for_device(paddr, size, dir); +} + +static inline void dma_direct_sync_single_for_cpu(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ + phys_addr_t paddr = dma_to_phys(dev, addr); + + if (!dev_is_dma_coherent(dev)) { + arch_sync_dma_for_cpu(paddr, size, dir); + arch_sync_dma_for_cpu_all(); + } + + if (unlikely(is_swiotlb_buffer(paddr))) + swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); + + if (dir == DMA_FROM_DEVICE) + arch_dma_mark_clean(paddr, size); +} + +static inline dma_addr_t dma_direct_map_page(struct device *dev, + struct page *page, unsigned long offset, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + phys_addr_t phys = page_to_phys(page) + offset; + dma_addr_t dma_addr = phys_to_dma(dev, phys); + + if (unlikely(swiotlb_force == SWIOTLB_FORCE)) + return swiotlb_map(dev, phys, size, dir, attrs); + + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + if (swiotlb_force != SWIOTLB_NO_FORCE) + return swiotlb_map(dev, phys, size, dir, attrs); + + dev_WARN_ONCE(dev, 1, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + return DMA_MAPPING_ERROR; + } + + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_device(phys, size, dir); + return dma_addr; +} + +static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + phys_addr_t phys = dma_to_phys(dev, addr); + + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + dma_direct_sync_single_for_cpu(dev, addr, size, dir); + + if (unlikely(is_swiotlb_buffer(phys))) + swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs); +} +#endif /* _KERNEL_DMA_DIRECT_H */ diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c index 05607642c888..eacd4c5b10bf 100644 --- a/kernel/dma/dummy.c +++ b/kernel/dma/dummy.c @@ -2,7 +2,7 @@ /* * Dummy DMA ops that always fail. */ -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> static int dma_dummy_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, @@ -36,4 +36,3 @@ const struct dma_map_ops dma_dummy_ops = { .map_sg = dma_dummy_map_sg, .dma_supported = dma_dummy_supported, }; -EXPORT_SYMBOL(dma_dummy_ops); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 0d129421e75f..51bb8fa8eb89 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -7,13 +7,14 @@ */ #include <linux/memblock.h> /* for max_pfn */ #include <linux/acpi.h> -#include <linux/dma-direct.h> -#include <linux/dma-noncoherent.h> +#include <linux/dma-map-ops.h> #include <linux/export.h> #include <linux/gfp.h> #include <linux/of_device.h> #include <linux/slab.h> #include <linux/vmalloc.h> +#include "debug.h" +#include "direct.h" /* * Managed DMA API @@ -144,6 +145,10 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); + + if (WARN_ON_ONCE(!dev->dma_mask)) + return DMA_MAPPING_ERROR; + if (dma_map_direct(dev, ops)) addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); else @@ -179,6 +184,10 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, int ents; BUG_ON(!valid_dma_direction(dir)); + + if (WARN_ON_ONCE(!dev->dma_mask)) + return 0; + if (dma_map_direct(dev, ops)) ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); else @@ -213,6 +222,9 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, BUG_ON(!valid_dma_direction(dir)); + if (WARN_ON_ONCE(!dev->dma_mask)) + return DMA_MAPPING_ERROR; + /* Don't allow RAM to be mapped */ if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) return DMA_MAPPING_ERROR; @@ -296,22 +308,6 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, EXPORT_SYMBOL(dma_sync_sg_for_device); /* - * Create scatter-list for the already allocated DMA buffer. - */ -int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t dma_addr, size_t size, - unsigned long attrs) -{ - struct page *page = virt_to_page(cpu_addr); - int ret; - - ret = sg_alloc_table(sgt, 1, GFP_KERNEL); - if (!ret) - sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); - return ret; -} - -/* * The whole dma_get_sgtable() idea is fundamentally unsafe - it seems * that the intention is to allow exporting memory allocated via the * coherent DMA APIs through the dma_buf API, which only accepts a @@ -346,9 +342,7 @@ pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) { if (force_dma_unencrypted(dev)) prot = pgprot_decrypted(prot); - if (dev_is_dma_coherent(dev) || - (IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) && - (attrs & DMA_ATTR_NON_CONSISTENT))) + if (dev_is_dma_coherent(dev)) return prot; #ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE if (attrs & DMA_ATTR_WRITE_COMBINE) @@ -358,35 +352,6 @@ pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) } #endif /* CONFIG_MMU */ -/* - * Create userspace mapping for the DMA-coherent memory. - */ -int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size, - unsigned long attrs) -{ -#ifdef CONFIG_MMU - unsigned long user_count = vma_pages(vma); - unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; - unsigned long off = vma->vm_pgoff; - int ret = -ENXIO; - - vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); - - if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) - return ret; - - if (off >= count || user_count > count - off) - return -ENXIO; - - return remap_pfn_range(vma, vma->vm_start, - page_to_pfn(virt_to_page(cpu_addr)) + vma->vm_pgoff, - user_count << PAGE_SHIFT, vma->vm_page_prot); -#else - return -ENXIO; -#endif /* CONFIG_MMU */ -} - /** * dma_can_mmap - check if a given device supports dma_mmap_* * @dev: device to check @@ -506,6 +471,86 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, } EXPORT_SYMBOL(dma_free_attrs); +struct page *dma_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + struct page *page; + + if (WARN_ON_ONCE(!dev->coherent_dma_mask)) + return NULL; + if (WARN_ON_ONCE(gfp & (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM))) + return NULL; + + size = PAGE_ALIGN(size); + if (dma_alloc_direct(dev, ops)) + page = dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp); + else if (ops->alloc_pages) + page = ops->alloc_pages(dev, size, dma_handle, dir, gfp); + else + return NULL; + + debug_dma_map_page(dev, page, 0, size, dir, *dma_handle); + + return page; +} +EXPORT_SYMBOL_GPL(dma_alloc_pages); + +void dma_free_pages(struct device *dev, size_t size, struct page *page, + dma_addr_t dma_handle, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + size = PAGE_ALIGN(size); + debug_dma_unmap_page(dev, dma_handle, size, dir); + + if (dma_alloc_direct(dev, ops)) + dma_direct_free_pages(dev, size, page, dma_handle, dir); + else if (ops->free_pages) + ops->free_pages(dev, size, page, dma_handle, dir); +} +EXPORT_SYMBOL_GPL(dma_free_pages); + +void *dma_alloc_noncoherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + void *vaddr; + + if (!ops || !ops->alloc_noncoherent) { + struct page *page; + + page = dma_alloc_pages(dev, size, dma_handle, dir, gfp); + if (!page) + return NULL; + return page_address(page); + } + + size = PAGE_ALIGN(size); + vaddr = ops->alloc_noncoherent(dev, size, dma_handle, dir, gfp); + if (vaddr) + debug_dma_map_page(dev, virt_to_page(vaddr), 0, size, dir, + *dma_handle); + return vaddr; +} +EXPORT_SYMBOL_GPL(dma_alloc_noncoherent); + +void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (!ops || !ops->free_noncoherent) { + dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir); + return; + } + + size = PAGE_ALIGN(size); + debug_dma_unmap_page(dev, dma_handle, size, dir); + ops->free_noncoherent(dev, size, vaddr, dma_handle, dir); +} +EXPORT_SYMBOL_GPL(dma_free_noncoherent); + int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -563,20 +608,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask) EXPORT_SYMBOL(dma_set_coherent_mask); #endif -void dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - - if (dma_alloc_direct(dev, ops)) - arch_dma_cache_sync(dev, vaddr, size, dir); - else if (ops->cache_sync) - ops->cache_sync(dev, vaddr, size, dir); -} -EXPORT_SYMBOL(dma_cache_sync); - size_t dma_max_mapping_size(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c new file mode 100644 index 000000000000..910ae69cae77 --- /dev/null +++ b/kernel/dma/ops_helpers.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Helpers for DMA ops implementations. These generally rely on the fact that + * the allocated memory contains normal pages in the direct kernel mapping. + */ +#include <linux/dma-map-ops.h> + +/* + * Create scatter-list for the already allocated DMA buffer. + */ +int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + struct page *page = virt_to_page(cpu_addr); + int ret; + + ret = sg_alloc_table(sgt, 1, GFP_KERNEL); + if (!ret) + sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); + return ret; +} + +/* + * Create userspace mapping for the DMA-coherent memory. + */ +int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ +#ifdef CONFIG_MMU + unsigned long user_count = vma_pages(vma); + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long off = vma->vm_pgoff; + int ret = -ENXIO; + + vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); + + if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) + return ret; + + if (off >= count || user_count > count - off) + return -ENXIO; + + return remap_pfn_range(vma, vma->vm_start, + page_to_pfn(virt_to_page(cpu_addr)) + vma->vm_pgoff, + user_count << PAGE_SHIFT, vma->vm_page_prot); +#else + return -ENXIO; +#endif /* CONFIG_MMU */ +} + +struct page *dma_common_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + struct page *page; + + page = dma_alloc_contiguous(dev, size, gfp); + if (!page) + page = alloc_pages_node(dev_to_node(dev), gfp, get_order(size)); + if (!page) + return NULL; + + *dma_handle = ops->map_page(dev, page, 0, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + if (*dma_handle == DMA_MAPPING_ERROR) { + dma_free_contiguous(dev, page, size); + return NULL; + } + + memset(page_address(page), 0, size); + return page; +} + +void dma_common_free_pages(struct device *dev, size_t size, struct page *page, + dma_addr_t dma_handle, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (ops->unmap_page) + ops->unmap_page(dev, dma_handle, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + dma_free_contiguous(dev, page, size); +} diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 6bc74a2d5127..d4637f72239b 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -3,9 +3,10 @@ * Copyright (C) 2012 ARM Ltd. * Copyright (C) 2020 Google LLC */ +#include <linux/cma.h> #include <linux/debugfs.h> +#include <linux/dma-map-ops.h> #include <linux/dma-direct.h> -#include <linux/dma-noncoherent.h> #include <linux/init.h> #include <linux/genalloc.h> #include <linux/set_memory.h> @@ -55,11 +56,34 @@ static void dma_atomic_pool_size_add(gfp_t gfp, size_t size) pool_size_kernel += size; } +static bool cma_in_zone(gfp_t gfp) +{ + unsigned long size; + phys_addr_t end; + struct cma *cma; + + cma = dev_get_cma_area(NULL); + if (!cma) + return false; + + size = cma_get_size(cma); + if (!size) + return false; + + /* CMA can't cross zone boundaries, see cma_activate_area() */ + end = cma_get_base(cma) + size - 1; + if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) + return end <= DMA_BIT_MASK(zone_dma_bits); + if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) + return end <= DMA_BIT_MASK(32); + return true; +} + static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, gfp_t gfp) { unsigned int order; - struct page *page; + struct page *page = NULL; void *addr; int ret = -ENOMEM; @@ -68,7 +92,11 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, do { pool_size = 1 << (PAGE_SHIFT + order); - page = alloc_pages(gfp, order); + if (cma_in_zone(gfp)) + page = dma_alloc_from_contiguous(NULL, 1 << order, + order, false); + if (!page) + page = alloc_pages(gfp, order); } while (!page && order-- > 0); if (!page) goto out; @@ -86,7 +114,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, #endif /* * Memory in the atomic DMA pools must be unencrypted, the pools do not - * shrink so no re-encryption occurs in dma_direct_free_pages(). + * shrink so no re-encryption occurs in dma_direct_free(). */ ret = set_memory_decrypted((unsigned long)page_to_virt(page), 1 << order); @@ -196,93 +224,75 @@ static int __init dma_atomic_pool_init(void) } postcore_initcall(dma_atomic_pool_init); -static inline struct gen_pool *dma_guess_pool_from_device(struct device *dev) +static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp) { - u64 phys_mask; - gfp_t gfp; - - gfp = dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, - &phys_mask); - if (IS_ENABLED(CONFIG_ZONE_DMA) && gfp == GFP_DMA) + if (prev == NULL) { + if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) + return atomic_pool_dma32; + if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) + return atomic_pool_dma; + return atomic_pool_kernel; + } + if (prev == atomic_pool_kernel) + return atomic_pool_dma32 ? atomic_pool_dma32 : atomic_pool_dma; + if (prev == atomic_pool_dma32) return atomic_pool_dma; - if (IS_ENABLED(CONFIG_ZONE_DMA32) && gfp == GFP_DMA32) - return atomic_pool_dma32; - return atomic_pool_kernel; + return NULL; } -static inline struct gen_pool *dma_get_safer_pool(struct gen_pool *bad_pool) +static struct page *__dma_alloc_from_pool(struct device *dev, size_t size, + struct gen_pool *pool, void **cpu_addr, + bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t)) { - if (bad_pool == atomic_pool_kernel) - return atomic_pool_dma32 ? : atomic_pool_dma; + unsigned long addr; + phys_addr_t phys; - if (bad_pool == atomic_pool_dma32) - return atomic_pool_dma; + addr = gen_pool_alloc(pool, size); + if (!addr) + return NULL; - return NULL; -} + phys = gen_pool_virt_to_phys(pool, addr); + if (phys_addr_ok && !phys_addr_ok(dev, phys, size)) { + gen_pool_free(pool, addr, size); + return NULL; + } -static inline struct gen_pool *dma_guess_pool(struct device *dev, - struct gen_pool *bad_pool) -{ - if (bad_pool) - return dma_get_safer_pool(bad_pool); + if (gen_pool_avail(pool) < atomic_pool_size) + schedule_work(&atomic_pool_work); - return dma_guess_pool_from_device(dev); + *cpu_addr = (void *)addr; + memset(*cpu_addr, 0, size); + return pfn_to_page(__phys_to_pfn(phys)); } -void *dma_alloc_from_pool(struct device *dev, size_t size, - struct page **ret_page, gfp_t flags) +struct page *dma_alloc_from_pool(struct device *dev, size_t size, + void **cpu_addr, gfp_t gfp, + bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t)) { struct gen_pool *pool = NULL; - unsigned long val = 0; - void *ptr = NULL; - phys_addr_t phys; - - while (1) { - pool = dma_guess_pool(dev, pool); - if (!pool) { - WARN(1, "Failed to get suitable pool for %s\n", - dev_name(dev)); - break; - } - - val = gen_pool_alloc(pool, size); - if (!val) - continue; - - phys = gen_pool_virt_to_phys(pool, val); - if (dma_coherent_ok(dev, phys, size)) - break; - - gen_pool_free(pool, val, size); - val = 0; - } - - - if (val) { - *ret_page = pfn_to_page(__phys_to_pfn(phys)); - ptr = (void *)val; - memset(ptr, 0, size); + struct page *page; - if (gen_pool_avail(pool) < atomic_pool_size) - schedule_work(&atomic_pool_work); + while ((pool = dma_guess_pool(pool, gfp))) { + page = __dma_alloc_from_pool(dev, size, pool, cpu_addr, + phys_addr_ok); + if (page) + return page; } - return ptr; + WARN(1, "Failed to get suitable pool for %s\n", dev_name(dev)); + return NULL; } bool dma_free_from_pool(struct device *dev, void *start, size_t size) { struct gen_pool *pool = NULL; - while (1) { - pool = dma_guess_pool(dev, pool); - if (!pool) - return false; - - if (gen_pool_has_addr(pool, (unsigned long)start, size)) { - gen_pool_free(pool, (unsigned long)start, size); - return true; - } + while ((pool = dma_guess_pool(pool, 0))) { + if (!gen_pool_has_addr(pool, (unsigned long)start, size)) + continue; + gen_pool_free(pool, (unsigned long)start, size); + return true; } + + return false; } diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 78b23f089cf1..905c3fa005f1 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -2,7 +2,7 @@ /* * Copyright (c) 2014 The Linux Foundation */ -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/slab.h> #include <linux/vmalloc.h> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index c19379fabd20..b4eea0abc3f0 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -22,7 +22,7 @@ #include <linux/cache.h> #include <linux/dma-direct.h> -#include <linux/dma-noncoherent.h> +#include <linux/dma-map-ops.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/spinlock.h> @@ -93,7 +93,7 @@ static unsigned int io_tlb_index; * Max segment that we can provide which (if pages are contingous) will * not be bounced (unless SWIOTLB_FORCE is set). */ -unsigned int max_segment; +static unsigned int max_segment; /* * We need to save away the original address corresponding to a mapped entry @@ -172,9 +172,7 @@ void swiotlb_print_info(void) return; } - pr_info("mapped [mem %#010llx-%#010llx] (%luMB)\n", - (unsigned long long)io_tlb_start, - (unsigned long long)io_tlb_end, + pr_info("mapped [mem %pa-%pa] (%luMB)\n", &io_tlb_start, &io_tlb_end, bytes >> 20); } @@ -670,13 +668,13 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, swiotlb_force); swiotlb_addr = swiotlb_tbl_map_single(dev, - __phys_to_dma(dev, io_tlb_start), + phys_to_dma_unencrypted(dev, io_tlb_start), paddr, size, size, dir, attrs); if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; /* Ensure that the address returned is DMA'ble */ - dma_addr = __phys_to_dma(dev, swiotlb_addr); + dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr); if (unlikely(!dma_capable(dev, dma_addr, size, true))) { swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); diff --git a/kernel/dma/virt.c b/kernel/dma/virt.c index ebe128833af7..59d32317dd57 100644 --- a/kernel/dma/virt.c +++ b/kernel/dma/virt.c @@ -4,7 +4,7 @@ */ #include <linux/export.h> #include <linux/mm.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/scatterlist.h> static void *dma_virt_alloc(struct device *dev, size_t size, @@ -55,5 +55,7 @@ const struct dma_map_ops dma_virt_ops = { .free = dma_virt_free, .map_page = dma_virt_map_page, .map_sg = dma_virt_map_sg, + .alloc_pages = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, }; EXPORT_SYMBOL(dma_virt_ops); diff --git a/kernel/entry/common.c b/kernel/entry/common.c index f7ed415a6768..3a1dfecc533e 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -60,6 +60,9 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, return ret; } + /* Either of the above might have changed the syscall number */ + syscall = syscall_get_nr(current, regs); + if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT)) trace_sys_enter(regs, syscall); @@ -68,22 +71,45 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, return ret ? : syscall; } -noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) +static __always_inline long +__syscall_enter_from_user_work(struct pt_regs *regs, long syscall) { unsigned long ti_work; - enter_from_user_mode(regs); - instrumentation_begin(); - - local_irq_enable(); ti_work = READ_ONCE(current_thread_info()->flags); if (ti_work & SYSCALL_ENTER_WORK) syscall = syscall_trace_enter(regs, syscall, ti_work); - instrumentation_end(); return syscall; } +long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall) +{ + return __syscall_enter_from_user_work(regs, syscall); +} + +noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) +{ + long ret; + + enter_from_user_mode(regs); + + instrumentation_begin(); + local_irq_enable(); + ret = __syscall_enter_from_user_work(regs, syscall); + instrumentation_end(); + + return ret; +} + +noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) +{ + enter_from_user_mode(regs); + instrumentation_begin(); + local_irq_enable(); + instrumentation_end(); +} + /** * exit_to_user_mode - Fixup state when exiting to user mode * @@ -143,7 +169,6 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, handle_signal_work(regs, ti_work); if (ti_work & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); rseq_handle_notify_resume(NULL, regs); } @@ -190,7 +215,7 @@ static inline bool report_single_step(unsigned long ti_work) /* * If TIF_SYSCALL_EMU is set, then the only reason to report is when * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall - * instruction has been already reported in syscall_enter_from_usermode(). + * instruction has been already reported in syscall_enter_from_user_mode(). */ #define SYSEMU_STEP (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU) @@ -286,7 +311,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * terminate a grace period, if and only if the timer interrupt is * not nested into another interrupt. * - * Checking for __rcu_is_watching() here would prevent the nesting + * Checking for rcu_is_watching() here would prevent the nesting * interrupt to invoke rcu_irq_enter(). If that nested interrupt is * the tick then rcu_flavor_sched_clock_irq() would wrongfully * assume that it is the first interupt and eventually claim @@ -320,10 +345,10 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * already contains a warning when RCU is not watching, so no point * in having another one here. */ + lockdep_hardirqs_off(CALLER_ADDR0); instrumentation_begin(); rcu_irq_enter_check_tick(); - /* Use the combo lockdep/tracing function */ - trace_hardirqs_off(); + trace_hardirqs_off_finish(); instrumentation_end(); return ret; diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index b828a3ddebf1..49972ee99aff 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -19,10 +19,8 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) if (ti_work & _TIF_NEED_RESCHED) schedule(); - if (ti_work & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); + if (ti_work & _TIF_NOTIFY_RESUME) tracehook_notify_resume(NULL); - } ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work); if (ret) diff --git a/kernel/events/core.c b/kernel/events/core.c index 5bfe8e3c6e44..da467e1dd49a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -99,7 +99,7 @@ static void remote_function(void *data) * retry due to any failures in smp_call_function_single(), such as if the * task_cpu() goes offline concurrently. * - * returns @func return value or -ESRCH when the process isn't running + * returns @func return value or -ESRCH or -ENXIO when the process isn't running */ static int task_function_call(struct task_struct *p, remote_function_f func, void *info) @@ -115,7 +115,8 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) for (;;) { ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); - ret = !ret ? data.ret : -EAGAIN; + if (!ret) + ret = data.ret; if (ret != -EAGAIN) break; @@ -382,7 +383,6 @@ static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); -static DEFINE_PER_CPU(int, perf_sched_cb_usages); static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; @@ -2133,8 +2133,24 @@ static inline struct list_head *get_event_list(struct perf_event *event) return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active; } +/* + * Events that have PERF_EV_CAP_SIBLING require being part of a group and + * cannot exist on their own, schedule them out and move them into the ERROR + * state. Also see _perf_event_enable(), it will not be able to recover + * this ERROR state. + */ +static inline void perf_remove_sibling_event(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + event_sched_out(event, cpuctx, ctx); + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); +} + static void perf_group_detach(struct perf_event *event) { + struct perf_event *leader = event->group_leader; struct perf_event *sibling, *tmp; struct perf_event_context *ctx = event->ctx; @@ -2153,7 +2169,7 @@ static void perf_group_detach(struct perf_event *event) /* * If this is a sibling, remove it from its group. */ - if (event->group_leader != event) { + if (leader != event) { list_del_init(&event->sibling_list); event->group_leader->nr_siblings--; goto out; @@ -2166,6 +2182,9 @@ static void perf_group_detach(struct perf_event *event) */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { + if (sibling->event_caps & PERF_EV_CAP_SIBLING) + perf_remove_sibling_event(sibling); + sibling->group_leader = sibling; list_del_init(&sibling->sibling_list); @@ -2183,10 +2202,10 @@ static void perf_group_detach(struct perf_event *event) } out: - perf_event__header_size(event->group_leader); - - for_each_sibling_event(tmp, event->group_leader) + for_each_sibling_event(tmp, leader) perf_event__header_size(tmp); + + perf_event__header_size(leader); } static bool is_orphaned_event(struct perf_event *event) @@ -2979,6 +2998,7 @@ static void _perf_event_enable(struct perf_event *event) raw_spin_lock_irq(&ctx->lock); if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state < PERF_EVENT_STATE_ERROR) { +out: raw_spin_unlock_irq(&ctx->lock); return; } @@ -2990,8 +3010,16 @@ static void _perf_event_enable(struct perf_event *event) * has gone back into error state, as distinct from the task having * been scheduled away before the cross-call arrived. */ - if (event->state == PERF_EVENT_STATE_ERROR) + if (event->state == PERF_EVENT_STATE_ERROR) { + /* + * Detached SIBLING events cannot leave ERROR state. + */ + if (event->event_caps & PERF_EV_CAP_SIBLING && + event->group_leader == event) + goto out; + event->state = PERF_EVENT_STATE_OFF; + } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_event_enable, NULL); @@ -3356,10 +3384,12 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, struct perf_event_context *parent, *next_parent; struct perf_cpu_context *cpuctx; int do_switch = 1; + struct pmu *pmu; if (likely(!ctx)) return; + pmu = ctx->pmu; cpuctx = __get_cpu_context(ctx); if (!cpuctx->task_ctx) return; @@ -3389,11 +3419,15 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - struct pmu *pmu = ctx->pmu; WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); + perf_pmu_disable(pmu); + + if (cpuctx->sched_cb_usage && pmu->sched_task) + pmu->sched_task(ctx, false); + /* * PMU specific parts of task perf context can require * additional synchronization. As an example of such @@ -3405,6 +3439,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, else swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + perf_pmu_enable(pmu); + /* * RCU_INIT_POINTER here is safe because we've not * modified the ctx and the above modification of @@ -3427,21 +3463,22 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); + perf_pmu_disable(pmu); + + if (cpuctx->sched_cb_usage && pmu->sched_task) + pmu->sched_task(ctx, false); task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); + + perf_pmu_enable(pmu); raw_spin_unlock(&ctx->lock); } } -static DEFINE_PER_CPU(struct list_head, sched_cb_list); - void perf_sched_cb_dec(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - this_cpu_dec(perf_sched_cb_usages); - - if (!--cpuctx->sched_cb_usage) - list_del(&cpuctx->sched_cb_entry); + --cpuctx->sched_cb_usage; } @@ -3449,10 +3486,7 @@ void perf_sched_cb_inc(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - if (!cpuctx->sched_cb_usage++) - list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); - - this_cpu_inc(perf_sched_cb_usages); + cpuctx->sched_cb_usage++; } /* @@ -3463,30 +3497,22 @@ void perf_sched_cb_inc(struct pmu *pmu) * PEBS requires this to provide PID/TID information. This requires we flush * all queued PEBS records before we context switch to a new task. */ -static void perf_pmu_sched_task(struct task_struct *prev, - struct task_struct *next, - bool sched_in) +static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in) { - struct perf_cpu_context *cpuctx; struct pmu *pmu; - if (prev == next) - return; - - list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { - pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ + pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ - if (WARN_ON_ONCE(!pmu->sched_task)) - continue; + if (WARN_ON_ONCE(!pmu->sched_task)) + return; - perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_pmu_disable(pmu); + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_pmu_disable(pmu); - pmu->sched_task(cpuctx->task_ctx, sched_in); + pmu->sched_task(cpuctx->task_ctx, sched_in); - perf_pmu_enable(pmu); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - } + perf_pmu_enable(pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static void perf_event_switch(struct task_struct *task, @@ -3511,9 +3537,6 @@ void __perf_event_task_sched_out(struct task_struct *task, { int ctxn; - if (__this_cpu_read(perf_sched_cb_usages)) - perf_pmu_sched_task(task, next, false); - if (atomic_read(&nr_switch_events)) perf_event_switch(task, next, false); @@ -3745,10 +3768,14 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, struct task_struct *task) { struct perf_cpu_context *cpuctx; + struct pmu *pmu = ctx->pmu; cpuctx = __get_cpu_context(ctx); - if (cpuctx->task_ctx == ctx) + if (cpuctx->task_ctx == ctx) { + if (cpuctx->sched_cb_usage) + __perf_pmu_sched_task(cpuctx, true); return; + } perf_ctx_lock(cpuctx, ctx); /* @@ -3758,7 +3785,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, if (!ctx->nr_events) goto unlock; - perf_pmu_disable(ctx->pmu); + perf_pmu_disable(pmu); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -3770,7 +3797,11 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, ctx, task); - perf_pmu_enable(ctx->pmu); + + if (cpuctx->sched_cb_usage && pmu->sched_task) + pmu->sched_task(cpuctx->task_ctx, true); + + perf_pmu_enable(pmu); unlock: perf_ctx_unlock(cpuctx, ctx); @@ -3813,9 +3844,6 @@ void __perf_event_task_sched_in(struct task_struct *prev, if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); - - if (__this_cpu_read(perf_sched_cb_usages)) - perf_pmu_sched_task(prev, task, true); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -5868,11 +5896,11 @@ static void perf_pmu_output_stop(struct perf_event *event); static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); + bool detach_rest = false; if (event->pmu->event_unmapped) event->pmu->event_unmapped(event, vma->vm_mm); @@ -5903,7 +5931,8 @@ static void perf_mmap_close(struct vm_area_struct *vma) mutex_unlock(&event->mmap_mutex); } - atomic_dec(&rb->mmap_count); + if (atomic_dec_and_test(&rb->mmap_count)) + detach_rest = true; if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) goto out_put; @@ -5912,7 +5941,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) mutex_unlock(&event->mmap_mutex); /* If there's still other mmap()s of this buffer, we're done. */ - if (atomic_read(&rb->mmap_count)) + if (!detach_rest) goto out_put; /* @@ -10034,7 +10063,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, case IF_SRC_KERNELADDR: case IF_SRC_KERNEL: kernel = 1; - /* fall through */ + fallthrough; case IF_SRC_FILEADDR: case IF_SRC_FILE: @@ -12828,7 +12857,6 @@ static void __init perf_event_init_all_cpus(void) #ifdef CONFIG_CGROUP_PERF INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu)); #endif - INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); } } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index edd0c985a939..bf9edd8d75be 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -205,7 +205,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, try_to_free_swap(old_page); page_vma_mapped_walk_done(&pvmw); - if (vma->vm_flags & VM_LOCKED) + if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page)) munlock_vma_page(old_page); put_page(old_page); @@ -1823,7 +1823,7 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags) t->utask->dup_xol_addr = area->vaddr; init_task_work(&t->utask->dup_xol_work, dup_xol_work); - task_work_add(t, &t->utask->dup_xol_work, true); + task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME); } /* diff --git a/kernel/exit.c b/kernel/exit.c index 733e80f334e7..87a2d515de0d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1474,23 +1474,6 @@ end: return retval; } -static struct pid *pidfd_get_pid(unsigned int fd) -{ - struct fd f; - struct pid *pid; - - f = fdget(fd); - if (!f.file) - return ERR_PTR(-EBADF); - - pid = pidfd_pid(f.file); - if (!IS_ERR(pid)) - get_pid(pid); - - fdput(f); - return pid; -} - static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { @@ -1498,6 +1481,7 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, struct pid *pid = NULL; enum pid_type type; long ret; + unsigned int f_flags = 0; if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) @@ -1531,9 +1515,10 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, if (upid < 0) return -EINVAL; - pid = pidfd_get_pid(upid); + pid = pidfd_get_pid(upid, &f_flags); if (IS_ERR(pid)) return PTR_ERR(pid); + break; default: return -EINVAL; @@ -1544,7 +1529,12 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, wo.wo_flags = options; wo.wo_info = infop; wo.wo_rusage = ru; + if (f_flags & O_NONBLOCK) + wo.wo_flags |= WNOHANG; + ret = do_wait(&wo); + if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK)) + ret = -EAGAIN; put_pid(pid); return ret; diff --git a/kernel/fork.c b/kernel/fork.c index 4d32190861bd..32083db7a2a2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -95,6 +95,7 @@ #include <linux/stackleak.h> #include <linux/kasan.h> #include <linux/scs.h> +#include <linux/io_uring.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -555,10 +556,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, get_file(file); if (tmp->vm_flags & VM_DENYWRITE) - atomic_dec(&inode->i_writecount); + put_write_access(inode); i_mmap_lock_write(mapping); if (tmp->vm_flags & VM_SHARED) - atomic_inc(&mapping->i_mmap_writable); + mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_interval_tree_insert_after(tmp, mpnt, @@ -589,7 +590,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(mm, oldmm, mpnt); + retval = copy_page_range(tmp, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); @@ -728,6 +729,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); + io_uring_free(tsk); cgroup_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); @@ -1011,6 +1013,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; + atomic_set(&mm->has_pinned, 0); atomic64_set(&mm->pinned_vm, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); @@ -1809,6 +1812,25 @@ static __always_inline void delayed_free_task(struct task_struct *tsk) free_task(tsk); } +static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) +{ + /* Skip if kernel thread */ + if (!tsk->mm) + return; + + /* Skip if spawning a thread or using vfork */ + if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM) + return; + + /* We need to synchronize with __set_oom_adj */ + mutex_lock(&oom_adj_mutex); + set_bit(MMF_MULTIPROCESS, &tsk->mm->flags); + /* Update the values in case they were changed after copy_signal */ + tsk->signal->oom_score_adj = current->signal->oom_score_adj; + tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min; + mutex_unlock(&oom_adj_mutex); +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -1982,6 +2004,10 @@ static __latent_entropy struct task_struct *copy_process( p->vtime.state = VTIME_INACTIVE; #endif +#ifdef CONFIG_IO_URING + p->io_uring = NULL; +#endif + #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); #endif @@ -2163,7 +2189,7 @@ static __latent_entropy struct task_struct *copy_process( /* * Ensure that the cgroup subsystem policies allow the new process to be - * forked. It should be noted the the new process's css_set can be changed + * forked. It should be noted that the new process's css_set can be changed * between here and cgroup_post_fork() if an organisation operation is in * progress. */ @@ -2281,6 +2307,8 @@ static __latent_entropy struct task_struct *copy_process( trace_task_newtask(p, clone_flags); uprobe_copy_process(p, clone_flags); + copy_oom_score_adj(clone_flags, p); + return p; bad_fork_cancel_cgroup: @@ -2384,14 +2412,14 @@ struct mm_struct *copy_init_mm(void) * * args->exit_signal is expected to be checked for sanity by the caller. */ -long _do_fork(struct kernel_clone_args *args) +pid_t kernel_clone(struct kernel_clone_args *args) { u64 clone_flags = args->flags; struct completion vfork; struct pid *pid; struct task_struct *p; int trace = 0; - long nr; + pid_t nr; /* * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument @@ -2477,7 +2505,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) .stack_size = (unsigned long)arg, }; - return _do_fork(&args); + return kernel_clone(&args); } #ifdef __ARCH_WANT_SYS_FORK @@ -2488,7 +2516,7 @@ SYSCALL_DEFINE0(fork) .exit_signal = SIGCHLD, }; - return _do_fork(&args); + return kernel_clone(&args); #else /* can not support in nommu mode */ return -EINVAL; @@ -2504,7 +2532,7 @@ SYSCALL_DEFINE0(vfork) .exit_signal = SIGCHLD, }; - return _do_fork(&args); + return kernel_clone(&args); } #endif @@ -2542,7 +2570,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, .tls = tls, }; - return _do_fork(&args); + return kernel_clone(&args); } #endif @@ -2700,7 +2728,7 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) if (!clone3_args_valid(&kargs)) return -EINVAL; - return _do_fork(&kargs); + return kernel_clone(&kargs); } #endif @@ -2863,7 +2891,7 @@ int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* - * functions used by _do_fork() cannot be used here directly + * functions used by kernel_clone() cannot be used here directly * because they modify an inactive task_struct that is being * constructed. Here we are modifying the current, active, * task_struct. @@ -3014,7 +3042,7 @@ int unshare_files(struct files_struct **displaced) } int sysctl_max_threads(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int ret; diff --git a/kernel/futex.c b/kernel/futex.c index a5876694a60e..f8614ef4ff31 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -39,6 +39,7 @@ #include <linux/freezer.h> #include <linux/memblock.h> #include <linux/fault-inject.h> +#include <linux/time_namespace.h> #include <asm/futex.h> @@ -916,7 +917,7 @@ static inline void exit_pi_state_list(struct task_struct *curr) { } * [10] Found | Found | task | !=taskTID | 0/1 | Invalid * * [1] Indicates that the kernel can acquire the futex atomically. We - * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. + * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. * * [2] Valid, if TID does not belong to a kernel thread. If no matching * thread is found then it indicates that the owner TID has died. @@ -1502,8 +1503,10 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ */ newval = FUTEX_WAITERS | task_pid_vnr(new_owner); - if (unlikely(should_fail_futex(true))) + if (unlikely(should_fail_futex(true))) { ret = -EFAULT; + goto out_unlock; + } ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); if (!ret && (curval != uval)) { @@ -3797,6 +3800,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, t = timespec64_to_ktime(ts); if (cmd == FUTEX_WAIT) t = ktime_add_safe(ktime_get(), t); + else if (!(op & FUTEX_CLOCK_REALTIME)) + t = timens_ktime_to_host(CLOCK_MONOTONIC, t); tp = &t; } /* @@ -3989,6 +3994,8 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, t = timespec64_to_ktime(ts); if (cmd == FUTEX_WAIT) t = ktime_add_safe(ktime_get(), t); + else if (!(op & FUTEX_CLOCK_REALTIME)) + t = timens_ktime_to_host(CLOCK_MONOTONIC, t); tp = &t; } if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 908fdf5098c3..53c67c87f141 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -19,7 +19,9 @@ #include <linux/vmalloc.h> #include "gcov.h" -#if (__GNUC__ >= 7) +#if (__GNUC__ >= 10) +#define GCOV_COUNTERS 8 +#elif (__GNUC__ >= 7) #define GCOV_COUNTERS 9 #elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) #define GCOV_COUNTERS 10 diff --git a/kernel/groups.c b/kernel/groups.c index 6ee6691f6839..fe7e6385530e 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -178,7 +178,7 @@ bool may_setgroups(void) { struct user_namespace *user_ns = current_user_ns(); - return ns_capable(user_ns, CAP_SETGID) && + return ns_capable_setid(user_ns, CAP_SETGID) && userns_may_setgroups(user_ns); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 857f5f4c8098..b9b9618e1aca 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -945,6 +945,33 @@ void handle_percpu_devid_irq(struct irq_desc *desc) } /** + * handle_percpu_devid_fasteoi_ipi - Per CPU local IPI handler with per cpu + * dev ids + * @desc: the interrupt description structure for this irq + * + * The biggest difference with the IRQ version is that the interrupt is + * EOIed early, as the IPI could result in a context switch, and we need to + * make sure the IPI can fire again. We also assume that the arch code has + * registered an action. If not, we are positively doomed. + */ +void handle_percpu_devid_fasteoi_ipi(struct irq_desc *desc) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + struct irqaction *action = desc->action; + unsigned int irq = irq_desc_get_irq(desc); + irqreturn_t res; + + __kstat_incr_irqs_this_cpu(desc); + + if (chip->irq_eoi) + chip->irq_eoi(&desc->irq_data); + + trace_irq_handler_entry(irq, action); + res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); + trace_irq_handler_exit(irq, action, res); +} + +/** * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu * dev ids * @desc: the interrupt description structure for this irq @@ -1541,18 +1568,17 @@ EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent); */ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { - struct irq_data *pos = NULL; + struct irq_data *pos; -#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY - for (; data; data = data->parent_data) -#endif + for (pos = NULL; !pos && data; data = irqd_get_parent_data(data)) { if (data->chip && data->chip->irq_compose_msi_msg) pos = data; + } + if (!pos) return -ENOSYS; pos->chip->irq_compose_msi_msg(pos, msg); - return 0; } diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index b95ff5d5f4bd..e4cff358b437 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -57,6 +57,7 @@ static const struct irq_bit_descr irqchip_flags[] = { BIT_MASK_DESCR(IRQCHIP_EOI_THREADED), BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI), BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI), + BIT_MASK_DESCR(IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND), }; static void @@ -125,6 +126,8 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_DEFAULT_TRIGGER_SET), BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX), + + BIT_MASK_DESCR(IRQD_IRQ_ENABLED_ON_SUSPEND), }; static const struct irq_bit_descr irqdesc_states[] = { @@ -136,6 +139,7 @@ static const struct irq_bit_descr irqdesc_states[] = { BIT_MASK_DESCR(_IRQ_PER_CPU_DEVID), BIT_MASK_DESCR(_IRQ_IS_POLLED), BIT_MASK_DESCR(_IRQ_DISABLE_UNLAZY), + BIT_MASK_DESCR(_IRQ_HIDDEN), }; static const struct irq_bit_descr irqdesc_istates[] = { diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a8e14c80b405..762a928e18f9 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -173,7 +173,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags __irq_wake_thread(desc, action); - /* Fall through - to add to randomness */ + fallthrough; /* to add to randomness */ case IRQ_HANDLED: *flags |= action->flags; break; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 7db284b10ac9..54363527feea 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -473,6 +473,15 @@ static inline void irq_domain_deactivate_irq(struct irq_data *data) } #endif +static inline struct irq_data *irqd_get_parent_data(struct irq_data *irqd) +{ +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + return irqd->parent_data; +#else + return NULL; +#endif +} + #ifdef CONFIG_GENERIC_IRQ_DEBUGFS #include <linux/debugfs.h> diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 76cd7ebd1178..cf8b374b892d 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1136,6 +1136,17 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain, return irq_data; } +static void __irq_domain_free_hierarchy(struct irq_data *irq_data) +{ + struct irq_data *tmp; + + while (irq_data) { + tmp = irq_data; + irq_data = irq_data->parent_data; + kfree(tmp); + } +} + static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs) { struct irq_data *irq_data, *tmp; @@ -1147,12 +1158,83 @@ static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs) irq_data->parent_data = NULL; irq_data->domain = NULL; - while (tmp) { - irq_data = tmp; - tmp = tmp->parent_data; - kfree(irq_data); + __irq_domain_free_hierarchy(tmp); + } +} + +/** + * irq_domain_disconnect_hierarchy - Mark the first unused level of a hierarchy + * @domain: IRQ domain from which the hierarchy is to be disconnected + * @virq: IRQ number where the hierarchy is to be trimmed + * + * Marks the @virq level belonging to @domain as disconnected. + * Returns -EINVAL if @virq doesn't have a valid irq_data pointing + * to @domain. + * + * Its only use is to be able to trim levels of hierarchy that do not + * have any real meaning for this interrupt, and that the driver marks + * as such from its .alloc() callback. + */ +int irq_domain_disconnect_hierarchy(struct irq_domain *domain, + unsigned int virq) +{ + struct irq_data *irqd; + + irqd = irq_domain_get_irq_data(domain, virq); + if (!irqd) + return -EINVAL; + + irqd->chip = ERR_PTR(-ENOTCONN); + return 0; +} + +static int irq_domain_trim_hierarchy(unsigned int virq) +{ + struct irq_data *tail, *irqd, *irq_data; + + irq_data = irq_get_irq_data(virq); + tail = NULL; + + /* The first entry must have a valid irqchip */ + if (!irq_data->chip || IS_ERR(irq_data->chip)) + return -EINVAL; + + /* + * Validate that the irq_data chain is sane in the presence of + * a hierarchy trimming marker. + */ + for (irqd = irq_data->parent_data; irqd; irq_data = irqd, irqd = irqd->parent_data) { + /* Can't have a valid irqchip after a trim marker */ + if (irqd->chip && tail) + return -EINVAL; + + /* Can't have an empty irqchip before a trim marker */ + if (!irqd->chip && !tail) + return -EINVAL; + + if (IS_ERR(irqd->chip)) { + /* Only -ENOTCONN is a valid trim marker */ + if (PTR_ERR(irqd->chip) != -ENOTCONN) + return -EINVAL; + + tail = irq_data; } } + + /* No trim marker, nothing to do */ + if (!tail) + return 0; + + pr_info("IRQ%d: trimming hierarchy from %s\n", + virq, tail->parent_data->domain->name); + + /* Sever the inner part of the hierarchy... */ + irqd = tail; + tail = tail->parent_data; + irqd->parent_data = NULL; + __irq_domain_free_hierarchy(tail); + + return 0; } static int irq_domain_alloc_irq_data(struct irq_domain *domain, @@ -1362,6 +1444,15 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, mutex_unlock(&irq_domain_mutex); goto out_free_irq_data; } + + for (i = 0; i < nr_irqs; i++) { + ret = irq_domain_trim_hierarchy(virq + i); + if (ret) { + mutex_unlock(&irq_domain_mutex); + goto out_free_irq_data; + } + } + for (i = 0; i < nr_irqs; i++) irq_domain_insert_irq(virq + i); mutex_unlock(&irq_domain_mutex); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 52ac5391dcc6..c460e0496006 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -271,7 +271,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: cpumask_copy(desc->irq_common_data.affinity, mask); - /* fall through */ + fallthrough; case IRQ_SET_MASK_OK_NOCOPY: irq_validate_effective_affinity(data); irq_set_thread_affinity(desc); @@ -868,7 +868,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) case IRQ_SET_MASK_OK_DONE: irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); irqd_set(&desc->irq_data, flags); - /* fall through */ + fallthrough; case IRQ_SET_MASK_OK_NOCOPY: flags = irqd_get_trigger_type(&desc->irq_data); @@ -1162,7 +1162,7 @@ static int irq_thread(void *data) handler_fn = irq_thread_fn; init_task_work(&on_exit_work, irq_thread_dtor); - task_work_add(current, &on_exit_work, false); + task_work_add(current, &on_exit_work, TWA_NONE); irq_thread_check_affinity(desc, action); diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 30cc217b8631..651a4ad6d711 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -380,6 +380,13 @@ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, unsigned int cpu, bit; struct cpumap *cm; + /* + * Not required in theory, but matrix_find_best_cpu() uses + * for_each_cpu() which ignores the cpumask on UP . + */ + if (cpumask_empty(msk)) + return -EINVAL; + cpu = matrix_find_best_cpu(m, msk); if (cpu == UINT_MAX) return -ENOSPC; diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index eb95f6106a1e..2c0c4d6d0f83 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -187,7 +187,6 @@ static const struct irq_domain_ops msi_domain_ops = { .deactivate = msi_domain_deactivate, }; -#ifdef GENERIC_MSI_DOMAIN_OPS static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info, msi_alloc_info_t *arg) { @@ -206,11 +205,6 @@ static void msi_domain_ops_set_desc(msi_alloc_info_t *arg, { arg->desc = desc; } -#else -#define msi_domain_ops_get_hwirq NULL -#define msi_domain_ops_prepare NULL -#define msi_domain_ops_set_desc NULL -#endif /* !GENERIC_MSI_DOMAIN_OPS */ static int msi_domain_ops_init(struct irq_domain *domain, struct msi_domain_info *info, @@ -235,11 +229,13 @@ static int msi_domain_ops_check(struct irq_domain *domain, } static struct msi_domain_ops msi_domain_ops_default = { - .get_hwirq = msi_domain_ops_get_hwirq, - .msi_init = msi_domain_ops_init, - .msi_check = msi_domain_ops_check, - .msi_prepare = msi_domain_ops_prepare, - .set_desc = msi_domain_ops_set_desc, + .get_hwirq = msi_domain_ops_get_hwirq, + .msi_init = msi_domain_ops_init, + .msi_check = msi_domain_ops_check, + .msi_prepare = msi_domain_ops_prepare, + .set_desc = msi_domain_ops_set_desc, + .domain_alloc_irqs = __msi_domain_alloc_irqs, + .domain_free_irqs = __msi_domain_free_irqs, }; static void msi_domain_update_dom_ops(struct msi_domain_info *info) @@ -251,6 +247,14 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info) return; } + if (ops->domain_alloc_irqs == NULL) + ops->domain_alloc_irqs = msi_domain_ops_default.domain_alloc_irqs; + if (ops->domain_free_irqs == NULL) + ops->domain_free_irqs = msi_domain_ops_default.domain_free_irqs; + + if (!(info->flags & MSI_FLAG_USE_DEF_DOM_OPS)) + return; + if (ops->get_hwirq == NULL) ops->get_hwirq = msi_domain_ops_default.get_hwirq; if (ops->msi_init == NULL) @@ -284,8 +288,7 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, { struct irq_domain *domain; - if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS) - msi_domain_update_dom_ops(info); + msi_domain_update_dom_ops(info); if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) msi_domain_update_chip_ops(info); @@ -370,8 +373,13 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, { struct msi_desc *desc; - if (domain->bus_token != DOMAIN_BUS_PCI_MSI) + switch(domain->bus_token) { + case DOMAIN_BUS_PCI_MSI: + case DOMAIN_BUS_VMD_MSI: + break; + default: return false; + } if (!(info->flags & MSI_FLAG_MUST_REACTIVATE)) return false; @@ -387,17 +395,8 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit; } -/** - * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain - * @domain: The domain to allocate from - * @dev: Pointer to device struct of the device for which the interrupts - * are allocated - * @nvec: The number of interrupts to allocate - * - * Returns 0 on success or an error code. - */ -int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, - int nvec) +int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, + int nvec) { struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; @@ -491,12 +490,24 @@ cleanup: } /** - * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev - * @domain: The domain to managing the interrupts + * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain + * @domain: The domain to allocate from * @dev: Pointer to device struct of the device for which the interrupts - * are free + * are allocated + * @nvec: The number of interrupts to allocate + * + * Returns 0 on success or an error code. */ -void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, + int nvec) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + + return ops->domain_alloc_irqs(domain, dev, nvec); +} + +void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) { struct msi_desc *desc; @@ -514,6 +525,20 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) } /** + * __msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev + * @domain: The domain to managing the interrupts + * @dev: Pointer to device struct of the device for which the interrupts + * are free + */ +void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + + return ops->domain_free_irqs(domain, dev); +} + +/** * msi_get_domain_info - Get the MSI interrupt domain info for @domain * @domain: The interrupt domain to retrieve data from * diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index c6c7e187ae74..ce0adb22ee96 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -69,12 +69,26 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) static bool suspend_device_irq(struct irq_desc *desc) { + unsigned long chipflags = irq_desc_get_chip(desc)->flags; + struct irq_data *irqd = &desc->irq_data; + if (!desc->action || irq_desc_is_chained(desc) || desc->no_suspend_depth) return false; - if (irqd_is_wakeup_set(&desc->irq_data)) { - irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); + if (irqd_is_wakeup_set(irqd)) { + irqd_set(irqd, IRQD_WAKEUP_ARMED); + + if ((chipflags & IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND) && + irqd_irq_disabled(irqd)) { + /* + * Interrupt marked for wakeup is in disabled state. + * Enable interrupt here to unmask/enable in irqchip + * to be able to resume with such interrupts. + */ + __enable_irq(desc); + irqd_set(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND); + } /* * We return true here to force the caller to issue * synchronize_irq(). We need to make sure that the @@ -93,7 +107,7 @@ static bool suspend_device_irq(struct irq_desc *desc) * chip level. The chip implementation indicates that with * IRQCHIP_MASK_ON_SUSPEND. */ - if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) + if (chipflags & IRQCHIP_MASK_ON_SUSPEND) mask_irq(desc); return true; } @@ -137,7 +151,19 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs); static void resume_irq(struct irq_desc *desc) { - irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); + struct irq_data *irqd = &desc->irq_data; + + irqd_clear(irqd, IRQD_WAKEUP_ARMED); + + if (irqd_is_enabled_on_suspend(irqd)) { + /* + * Interrupt marked for wakeup was enabled during suspend + * entry. Disable such interrupts to restore them back to + * original state. + */ + __disable_irq(desc); + irqd_clear(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND); + } if (desc->istate & IRQS_SUSPENDED) goto resume; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 32c071d7bc03..72513ed2a5fc 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -485,7 +485,7 @@ int show_interrupts(struct seq_file *p, void *v) rcu_read_lock(); desc = irq_to_desc(i); - if (!desc) + if (!desc || irq_settings_is_hidden(desc)) goto outsparse; if (desc->kstat_irqs) diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index c48ce19a257f..8ccd32a0cc80 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -86,6 +86,18 @@ static int irq_sw_resend(struct irq_desc *desc) } #endif +static int try_retrigger(struct irq_desc *desc) +{ + if (desc->irq_data.chip->irq_retrigger) + return desc->irq_data.chip->irq_retrigger(&desc->irq_data); + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + return irq_chip_retrigger_hierarchy(&desc->irq_data); +#else + return 0; +#endif +} + /* * IRQ resend * @@ -113,8 +125,7 @@ int check_irq_resend(struct irq_desc *desc, bool inject) desc->istate &= ~IRQS_PENDING; - if (!desc->irq_data.chip->irq_retrigger || - !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) + if (!try_retrigger(desc)) err = irq_sw_resend(desc); /* If the retrigger was successfull, mark it with the REPLAY bit */ diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index e43795cd2ccf..403378b9947b 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -17,6 +17,7 @@ enum { _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, _IRQ_IS_POLLED = IRQ_IS_POLLED, _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY, + _IRQ_HIDDEN = IRQ_HIDDEN, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -31,6 +32,7 @@ enum { #define IRQ_PER_CPU_DEVID GOT_YOU_MORON #define IRQ_IS_POLLED GOT_YOU_MORON #define IRQ_DISABLE_UNLAZY GOT_YOU_MORON +#define IRQ_HIDDEN GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -167,3 +169,8 @@ static inline void irq_settings_clr_disable_unlazy(struct irq_desc *desc) { desc->status_use_accessors &= ~_IRQ_DISABLE_UNLAZY; } + +static inline bool irq_settings_is_hidden(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_HIDDEN; +} diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index e960d7ce7bcc..773b6105c4ae 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -604,7 +604,7 @@ int irq_timings_alloc(int irq) /* * Some platforms can have the same private interrupt per cpu, - * so this function may be be called several times with the + * so this function may be called several times with the * same interrupt number. Just bail out in case the per cpu * stat structure is already allocated. */ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index cdb3ffab128b..015ef903ce8c 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -19,7 +19,7 @@ #include <linux/cpu.h> #include <asm/sections.h> -/* mutex to protect coming/going of the the jump_label table */ +/* mutex to protect coming/going of the jump_label table */ static DEFINE_MUTEX(jump_label_mutex); void jump_label_lock(void) @@ -539,19 +539,25 @@ static void static_key_set_mod(struct static_key *key, static int __jump_label_mod_text_reserved(void *start, void *end) { struct module *mod; + int ret; preempt_disable(); mod = __module_text_address((unsigned long)start); WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + if (!try_module_get(mod)) + mod = NULL; preempt_enable(); if (!mod) return 0; - - return __jump_label_text_reserved(mod->jump_entries, + ret = __jump_label_text_reserved(mod->jump_entries, mod->jump_entries + mod->num_jump_entries, start, end); + + module_put(mod); + + return ret; } static void __jump_label_mod_update(struct static_key *key) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 95cb74f73292..fe9de067771c 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -40,10 +40,10 @@ extern const u8 kallsyms_names[] __weak; * has one (eg: FRV). */ extern const unsigned int kallsyms_num_syms -__attribute__((weak, section(".rodata"))); +__section(".rodata") __attribute__((weak)); extern const unsigned long kallsyms_relative_base -__attribute__((weak, section(".rodata"))); +__section(".rodata") __attribute__((weak)); extern const char kallsyms_token_table[] __weak; extern const u16 kallsyms_token_index[] __weak; @@ -684,12 +684,12 @@ bool kallsyms_show_value(const struct cred *cred) case 0: if (kallsyms_for_perf()) return true; - /* fallthrough */ + fallthrough; case 1: if (security_capable(cred, &init_user_ns, CAP_SYSLOG, CAP_OPT_NOAUDIT) == 0) return true; - /* fallthrough */ + fallthrough; default: return false; } diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 9147ff6a12e5..3994a217bde7 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) "kcsan: " fmt + #include <linux/atomic.h> #include <linux/bug.h> #include <linux/delay.h> @@ -98,6 +100,9 @@ static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1]; */ static DEFINE_PER_CPU(long, kcsan_skip); +/* For kcsan_prandom_u32_max(). */ +static DEFINE_PER_CPU(struct rnd_state, kcsan_rand_state); + static __always_inline atomic_long_t *find_watchpoint(unsigned long addr, size_t size, bool expect_write, @@ -223,7 +228,7 @@ is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) && (type & KCSAN_ACCESS_WRITE) && size <= sizeof(long) && - IS_ALIGNED((unsigned long)ptr, size)) + !(type & KCSAN_ACCESS_COMPOUND) && IS_ALIGNED((unsigned long)ptr, size)) return true; /* Assume aligned writes up to word size are atomic. */ if (ctx->atomic_next > 0) { @@ -269,11 +274,28 @@ should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx * return true; } +/* + * Returns a pseudo-random number in interval [0, ep_ro). See prandom_u32_max() + * for more details. + * + * The open-coded version here is using only safe primitives for all contexts + * where we can have KCSAN instrumentation. In particular, we cannot use + * prandom_u32() directly, as its tracepoint could cause recursion. + */ +static u32 kcsan_prandom_u32_max(u32 ep_ro) +{ + struct rnd_state *state = &get_cpu_var(kcsan_rand_state); + const u32 res = prandom_u32_state(state); + + put_cpu_var(kcsan_rand_state); + return (u32)(((u64) res * ep_ro) >> 32); +} + static inline void reset_kcsan_skip(void) { long skip_count = kcsan_skip_watch - (IS_ENABLED(CONFIG_KCSAN_SKIP_WATCH_RANDOMIZE) ? - prandom_u32_max(kcsan_skip_watch) : + kcsan_prandom_u32_max(kcsan_skip_watch) : 0); this_cpu_write(kcsan_skip, skip_count); } @@ -283,12 +305,18 @@ static __always_inline bool kcsan_is_enabled(void) return READ_ONCE(kcsan_enabled) && get_ctx()->disable_count == 0; } -static inline unsigned int get_delay(void) +/* Introduce delay depending on context and configuration. */ +static void delay_access(int type) { unsigned int delay = in_task() ? kcsan_udelay_task : kcsan_udelay_interrupt; - return delay - (IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ? - prandom_u32_max(delay) : - 0); + /* For certain access types, skew the random delay to be longer. */ + unsigned int skew_delay_order = + (type & (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_ASSERT)) ? 1 : 0; + + delay -= IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ? + kcsan_prandom_u32_max(delay >> skew_delay_order) : + 0; + udelay(delay); } void kcsan_save_irqtrace(struct task_struct *task) @@ -361,13 +389,13 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, * already removed the watchpoint, or another thread consumed * the watchpoint before this thread. */ - kcsan_counter_inc(KCSAN_COUNTER_REPORT_RACES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_REPORT_RACES]); } if ((type & KCSAN_ACCESS_ASSERT) != 0) - kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]); else - kcsan_counter_inc(KCSAN_COUNTER_DATA_RACES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_DATA_RACES]); user_access_restore(flags); } @@ -408,7 +436,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) goto out; if (!check_encodable((unsigned long)ptr, size)) { - kcsan_counter_inc(KCSAN_COUNTER_UNENCODABLE_ACCESSES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_UNENCODABLE_ACCESSES]); goto out; } @@ -428,12 +456,12 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * with which should_watch() returns true should be tweaked so * that this case happens very rarely. */ - kcsan_counter_inc(KCSAN_COUNTER_NO_CAPACITY); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_NO_CAPACITY]); goto out_unlock; } - kcsan_counter_inc(KCSAN_COUNTER_SETUP_WATCHPOINTS); - kcsan_counter_inc(KCSAN_COUNTER_USED_WATCHPOINTS); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_SETUP_WATCHPOINTS]); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]); /* * Read the current value, to later check and infer a race if the data @@ -459,7 +487,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (IS_ENABLED(CONFIG_KCSAN_DEBUG)) { kcsan_disable_current(); - pr_err("KCSAN: watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n", + pr_err("watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n", is_write ? "write" : "read", size, ptr, watchpoint_slot((unsigned long)ptr), encode_watchpoint((unsigned long)ptr, size, is_write)); @@ -470,7 +498,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * Delay this thread, to increase probability of observing a racy * conflicting access. */ - udelay(get_delay()); + delay_access(type); /* * Re-read value, and check if it is as expected; if not, we infer a @@ -535,16 +563,16 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * increment this counter. */ if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE) - kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]); kcsan_report(ptr, size, type, value_change, KCSAN_REPORT_RACE_SIGNAL, watchpoint - watchpoints); } else if (value_change == KCSAN_VALUE_CHANGE_TRUE) { /* Inferring a race, since the value should not have changed. */ - kcsan_counter_inc(KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN]); if (is_assert) - kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]); if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_TRUE, @@ -557,7 +585,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * reused after this point. */ remove_watchpoint(watchpoint); - kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS); + atomic_long_dec(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]); out_unlock: if (!kcsan_interrupt_watcher) local_irq_restore(irq_flags); @@ -614,13 +642,16 @@ void __init kcsan_init(void) BUG_ON(!in_task()); kcsan_debugfs_init(); + prandom_seed_full_state(&kcsan_rand_state); /* * We are in the init task, and no other tasks should be running; * WRITE_ONCE without memory barrier is sufficient. */ - if (kcsan_early_enable) + if (kcsan_early_enable) { + pr_info("enabled early\n"); WRITE_ONCE(kcsan_enabled, true); + } } /* === Exported interface =================================================== */ @@ -793,7 +824,17 @@ EXPORT_SYMBOL(__kcsan_check_access); EXPORT_SYMBOL(__tsan_write##size); \ void __tsan_unaligned_write##size(void *ptr) \ __alias(__tsan_write##size); \ - EXPORT_SYMBOL(__tsan_unaligned_write##size) + EXPORT_SYMBOL(__tsan_unaligned_write##size); \ + void __tsan_read_write##size(void *ptr); \ + void __tsan_read_write##size(void *ptr) \ + { \ + check_access(ptr, size, \ + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE); \ + } \ + EXPORT_SYMBOL(__tsan_read_write##size); \ + void __tsan_unaligned_read_write##size(void *ptr) \ + __alias(__tsan_read_write##size); \ + EXPORT_SYMBOL(__tsan_unaligned_read_write##size) DEFINE_TSAN_READ_WRITE(1); DEFINE_TSAN_READ_WRITE(2); @@ -879,3 +920,130 @@ void __tsan_init(void) { } EXPORT_SYMBOL(__tsan_init); + +/* + * Instrumentation for atomic builtins (__atomic_*, __sync_*). + * + * Normal kernel code _should not_ be using them directly, but some + * architectures may implement some or all atomics using the compilers' + * builtins. + * + * Note: If an architecture decides to fully implement atomics using the + * builtins, because they are implicitly instrumented by KCSAN (and KASAN, + * etc.), implementing the ARCH_ATOMIC interface (to get instrumentation via + * atomic-instrumented) is no longer necessary. + * + * TSAN instrumentation replaces atomic accesses with calls to any of the below + * functions, whose job is to also execute the operation itself. + */ + +#define DEFINE_TSAN_ATOMIC_LOAD_STORE(bits) \ + u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder); \ + u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder) \ + { \ + if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ + check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC); \ + } \ + return __atomic_load_n(ptr, memorder); \ + } \ + EXPORT_SYMBOL(__tsan_atomic##bits##_load); \ + void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder); \ + void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder) \ + { \ + if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ + check_access(ptr, bits / BITS_PER_BYTE, \ + KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ + } \ + __atomic_store_n(ptr, v, memorder); \ + } \ + EXPORT_SYMBOL(__tsan_atomic##bits##_store) + +#define DEFINE_TSAN_ATOMIC_RMW(op, bits, suffix) \ + u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder); \ + u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder) \ + { \ + if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ + check_access(ptr, bits / BITS_PER_BYTE, \ + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \ + KCSAN_ACCESS_ATOMIC); \ + } \ + return __atomic_##op##suffix(ptr, v, memorder); \ + } \ + EXPORT_SYMBOL(__tsan_atomic##bits##_##op) + +/* + * Note: CAS operations are always classified as write, even in case they + * fail. We cannot perform check_access() after a write, as it might lead to + * false positives, in cases such as: + * + * T0: __atomic_compare_exchange_n(&p->flag, &old, 1, ...) + * + * T1: if (__atomic_load_n(&p->flag, ...)) { + * modify *p; + * p->flag = 0; + * } + * + * The only downside is that, if there are 3 threads, with one CAS that + * succeeds, another CAS that fails, and an unmarked racing operation, we may + * point at the wrong CAS as the source of the race. However, if we assume that + * all CAS can succeed in some other execution, the data race is still valid. + */ +#define DEFINE_TSAN_ATOMIC_CMPXCHG(bits, strength, weak) \ + int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \ + u##bits val, int mo, int fail_mo); \ + int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \ + u##bits val, int mo, int fail_mo) \ + { \ + if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ + check_access(ptr, bits / BITS_PER_BYTE, \ + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \ + KCSAN_ACCESS_ATOMIC); \ + } \ + return __atomic_compare_exchange_n(ptr, exp, val, weak, mo, fail_mo); \ + } \ + EXPORT_SYMBOL(__tsan_atomic##bits##_compare_exchange_##strength) + +#define DEFINE_TSAN_ATOMIC_CMPXCHG_VAL(bits) \ + u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \ + int mo, int fail_mo); \ + u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \ + int mo, int fail_mo) \ + { \ + if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ + check_access(ptr, bits / BITS_PER_BYTE, \ + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \ + KCSAN_ACCESS_ATOMIC); \ + } \ + __atomic_compare_exchange_n(ptr, &exp, val, 0, mo, fail_mo); \ + return exp; \ + } \ + EXPORT_SYMBOL(__tsan_atomic##bits##_compare_exchange_val) + +#define DEFINE_TSAN_ATOMIC_OPS(bits) \ + DEFINE_TSAN_ATOMIC_LOAD_STORE(bits); \ + DEFINE_TSAN_ATOMIC_RMW(exchange, bits, _n); \ + DEFINE_TSAN_ATOMIC_RMW(fetch_add, bits, ); \ + DEFINE_TSAN_ATOMIC_RMW(fetch_sub, bits, ); \ + DEFINE_TSAN_ATOMIC_RMW(fetch_and, bits, ); \ + DEFINE_TSAN_ATOMIC_RMW(fetch_or, bits, ); \ + DEFINE_TSAN_ATOMIC_RMW(fetch_xor, bits, ); \ + DEFINE_TSAN_ATOMIC_RMW(fetch_nand, bits, ); \ + DEFINE_TSAN_ATOMIC_CMPXCHG(bits, strong, 0); \ + DEFINE_TSAN_ATOMIC_CMPXCHG(bits, weak, 1); \ + DEFINE_TSAN_ATOMIC_CMPXCHG_VAL(bits) + +DEFINE_TSAN_ATOMIC_OPS(8); +DEFINE_TSAN_ATOMIC_OPS(16); +DEFINE_TSAN_ATOMIC_OPS(32); +DEFINE_TSAN_ATOMIC_OPS(64); + +void __tsan_atomic_thread_fence(int memorder); +void __tsan_atomic_thread_fence(int memorder) +{ + __atomic_thread_fence(memorder); +} +EXPORT_SYMBOL(__tsan_atomic_thread_fence); + +void __tsan_atomic_signal_fence(int memorder); +void __tsan_atomic_signal_fence(int memorder) { } +EXPORT_SYMBOL(__tsan_atomic_signal_fence); diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 023e49c58d55..3c8093a371b1 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) "kcsan: " fmt + #include <linux/atomic.h> #include <linux/bsearch.h> #include <linux/bug.h> @@ -15,10 +17,19 @@ #include "kcsan.h" -/* - * Statistics counters. - */ -static atomic_long_t counters[KCSAN_COUNTER_COUNT]; +atomic_long_t kcsan_counters[KCSAN_COUNTER_COUNT]; +static const char *const counter_names[] = { + [KCSAN_COUNTER_USED_WATCHPOINTS] = "used_watchpoints", + [KCSAN_COUNTER_SETUP_WATCHPOINTS] = "setup_watchpoints", + [KCSAN_COUNTER_DATA_RACES] = "data_races", + [KCSAN_COUNTER_ASSERT_FAILURES] = "assert_failures", + [KCSAN_COUNTER_NO_CAPACITY] = "no_capacity", + [KCSAN_COUNTER_REPORT_RACES] = "report_races", + [KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN] = "races_unknown_origin", + [KCSAN_COUNTER_UNENCODABLE_ACCESSES] = "unencodable_accesses", + [KCSAN_COUNTER_ENCODING_FALSE_POSITIVES] = "encoding_false_positives", +}; +static_assert(ARRAY_SIZE(counter_names) == KCSAN_COUNTER_COUNT); /* * Addresses for filtering functions from reporting. This list can be used as a @@ -39,34 +50,6 @@ static struct { }; static DEFINE_SPINLOCK(report_filterlist_lock); -static const char *counter_to_name(enum kcsan_counter_id id) -{ - switch (id) { - case KCSAN_COUNTER_USED_WATCHPOINTS: return "used_watchpoints"; - case KCSAN_COUNTER_SETUP_WATCHPOINTS: return "setup_watchpoints"; - case KCSAN_COUNTER_DATA_RACES: return "data_races"; - case KCSAN_COUNTER_ASSERT_FAILURES: return "assert_failures"; - case KCSAN_COUNTER_NO_CAPACITY: return "no_capacity"; - case KCSAN_COUNTER_REPORT_RACES: return "report_races"; - case KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN: return "races_unknown_origin"; - case KCSAN_COUNTER_UNENCODABLE_ACCESSES: return "unencodable_accesses"; - case KCSAN_COUNTER_ENCODING_FALSE_POSITIVES: return "encoding_false_positives"; - case KCSAN_COUNTER_COUNT: - BUG(); - } - return NULL; -} - -void kcsan_counter_inc(enum kcsan_counter_id id) -{ - atomic_long_inc(&counters[id]); -} - -void kcsan_counter_dec(enum kcsan_counter_id id) -{ - atomic_long_dec(&counters[id]); -} - /* * The microbenchmark allows benchmarking KCSAN core runtime only. To run * multiple threads, pipe 'microbench=<iters>' from multiple tasks into the @@ -86,7 +69,7 @@ static noinline void microbenchmark(unsigned long iters) */ WRITE_ONCE(kcsan_enabled, false); - pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); + pr_info("%s begin | iters: %lu\n", __func__, iters); cycles = get_cycles(); while (iters--) { @@ -97,73 +80,13 @@ static noinline void microbenchmark(unsigned long iters) } cycles = get_cycles() - cycles; - pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles); + pr_info("%s end | cycles: %llu\n", __func__, cycles); WRITE_ONCE(kcsan_enabled, was_enabled); /* restore context */ current->kcsan_ctx = ctx_save; } -/* - * Simple test to create conflicting accesses. Write 'test=<iters>' to KCSAN's - * debugfs file from multiple tasks to generate real conflicts and show reports. - */ -static long test_dummy; -static long test_flags; -static long test_scoped; -static noinline void test_thread(unsigned long iters) -{ - const long CHANGE_BITS = 0xff00ff00ff00ff00L; - const struct kcsan_ctx ctx_save = current->kcsan_ctx; - cycles_t cycles; - - /* We may have been called from an atomic region; reset context. */ - memset(¤t->kcsan_ctx, 0, sizeof(current->kcsan_ctx)); - - pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); - pr_info("test_dummy@%px, test_flags@%px, test_scoped@%px,\n", - &test_dummy, &test_flags, &test_scoped); - - cycles = get_cycles(); - while (iters--) { - /* These all should generate reports. */ - __kcsan_check_read(&test_dummy, sizeof(test_dummy)); - ASSERT_EXCLUSIVE_WRITER(test_dummy); - ASSERT_EXCLUSIVE_ACCESS(test_dummy); - - ASSERT_EXCLUSIVE_BITS(test_flags, ~CHANGE_BITS); /* no report */ - __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */ - - ASSERT_EXCLUSIVE_BITS(test_flags, CHANGE_BITS); /* report */ - __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */ - - /* not actually instrumented */ - WRITE_ONCE(test_dummy, iters); /* to observe value-change */ - __kcsan_check_write(&test_dummy, sizeof(test_dummy)); - - test_flags ^= CHANGE_BITS; /* generate value-change */ - __kcsan_check_write(&test_flags, sizeof(test_flags)); - - BUG_ON(current->kcsan_ctx.scoped_accesses.prev); - { - /* Should generate reports anywhere in this block. */ - ASSERT_EXCLUSIVE_WRITER_SCOPED(test_scoped); - ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_scoped); - BUG_ON(!current->kcsan_ctx.scoped_accesses.prev); - /* Unrelated accesses. */ - __kcsan_check_access(&cycles, sizeof(cycles), 0); - __kcsan_check_access(&cycles, sizeof(cycles), KCSAN_ACCESS_ATOMIC); - } - BUG_ON(current->kcsan_ctx.scoped_accesses.prev); - } - cycles = get_cycles() - cycles; - - pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles); - - /* restore context */ - current->kcsan_ctx = ctx_save; -} - static int cmp_filterlist_addrs(const void *rhs, const void *lhs) { const unsigned long a = *(const unsigned long *)rhs; @@ -220,7 +143,7 @@ static ssize_t insert_report_filterlist(const char *func) ssize_t ret = 0; if (!addr) { - pr_err("KCSAN: could not find function: '%s'\n", func); + pr_err("could not find function: '%s'\n", func); return -ENOENT; } @@ -270,9 +193,10 @@ static int show_info(struct seq_file *file, void *v) /* show stats */ seq_printf(file, "enabled: %i\n", READ_ONCE(kcsan_enabled)); - for (i = 0; i < KCSAN_COUNTER_COUNT; ++i) - seq_printf(file, "%s: %ld\n", counter_to_name(i), - atomic_long_read(&counters[i])); + for (i = 0; i < KCSAN_COUNTER_COUNT; ++i) { + seq_printf(file, "%s: %ld\n", counter_names[i], + atomic_long_read(&kcsan_counters[i])); + } /* show filter functions, and filter type */ spin_lock_irqsave(&report_filterlist_lock, flags); @@ -307,18 +231,12 @@ debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *o WRITE_ONCE(kcsan_enabled, true); } else if (!strcmp(arg, "off")) { WRITE_ONCE(kcsan_enabled, false); - } else if (!strncmp(arg, "microbench=", sizeof("microbench=") - 1)) { + } else if (str_has_prefix(arg, "microbench=")) { unsigned long iters; - if (kstrtoul(&arg[sizeof("microbench=") - 1], 0, &iters)) + if (kstrtoul(&arg[strlen("microbench=")], 0, &iters)) return -EINVAL; microbenchmark(iters); - } else if (!strncmp(arg, "test=", sizeof("test=") - 1)) { - unsigned long iters; - - if (kstrtoul(&arg[sizeof("test=") - 1], 0, &iters)) - return -EINVAL; - test_thread(iters); } else if (!strcmp(arg, "whitelist")) { set_report_filterlist_whitelist(true); } else if (!strcmp(arg, "blacklist")) { diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h index f03562aaf2eb..1a6db2f797ac 100644 --- a/kernel/kcsan/encoding.h +++ b/kernel/kcsan/encoding.h @@ -32,7 +32,7 @@ * 1. different addresses but with the same encoded address race; * 2. and both map onto the same watchpoint slots; * - * Both these are assumed to be very unlikely. However, in case it still happens + * Both these are assumed to be very unlikely. However, in case it still * happens, the report logic will filter out the false positive (see report.c). */ #define WATCHPOINT_ADDR_BITS (BITS_PER_LONG-1 - WATCHPOINT_SIZE_BITS) diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c index fed6fcb5768c..ebe7fd245104 100644 --- a/kernel/kcsan/kcsan-test.c +++ b/kernel/kcsan/kcsan-test.c @@ -27,6 +27,12 @@ #include <linux/types.h> #include <trace/events/printk.h> +#ifdef CONFIG_CC_HAS_TSAN_COMPOUND_READ_BEFORE_WRITE +#define __KCSAN_ACCESS_RW(alt) (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE) +#else +#define __KCSAN_ACCESS_RW(alt) (alt) +#endif + /* Points to current test-case memory access "kernels". */ static void (*access_kernels[2])(void); @@ -186,20 +192,21 @@ static bool report_matches(const struct expect_report *r) /* Access 1 & 2 */ for (i = 0; i < 2; ++i) { + const int ty = r->access[i].type; const char *const access_type = - (r->access[i].type & KCSAN_ACCESS_ASSERT) ? - ((r->access[i].type & KCSAN_ACCESS_WRITE) ? - "assert no accesses" : - "assert no writes") : - ((r->access[i].type & KCSAN_ACCESS_WRITE) ? - "write" : - "read"); + (ty & KCSAN_ACCESS_ASSERT) ? + ((ty & KCSAN_ACCESS_WRITE) ? + "assert no accesses" : + "assert no writes") : + ((ty & KCSAN_ACCESS_WRITE) ? + ((ty & KCSAN_ACCESS_COMPOUND) ? + "read-write" : + "write") : + "read"); const char *const access_type_aux = - (r->access[i].type & KCSAN_ACCESS_ATOMIC) ? - " (marked)" : - ((r->access[i].type & KCSAN_ACCESS_SCOPED) ? - " (scoped)" : - ""); + (ty & KCSAN_ACCESS_ATOMIC) ? + " (marked)" : + ((ty & KCSAN_ACCESS_SCOPED) ? " (scoped)" : ""); if (i == 1) { /* Access 2 */ @@ -277,6 +284,12 @@ static noinline void test_kernel_write_atomic(void) WRITE_ONCE(test_var, READ_ONCE_NOCHECK(test_sink) + 1); } +static noinline void test_kernel_atomic_rmw(void) +{ + /* Use builtin, so we can set up the "bad" atomic/non-atomic scenario. */ + __atomic_fetch_add(&test_var, 1, __ATOMIC_RELAXED); +} + __no_kcsan static noinline void test_kernel_write_uninstrumented(void) { test_var++; } @@ -390,6 +403,15 @@ static noinline void test_kernel_seqlock_writer(void) write_sequnlock_irqrestore(&test_seqlock, flags); } +static noinline void test_kernel_atomic_builtins(void) +{ + /* + * Generate concurrent accesses, expecting no reports, ensuring KCSAN + * treats builtin atomics as actually atomic. + */ + __atomic_load_n(&test_var, __ATOMIC_RELAXED); +} + /* ===== Test cases ===== */ /* Simple test with normal data race. */ @@ -430,8 +452,8 @@ static void test_concurrent_races(struct kunit *test) const struct expect_report expect = { .access = { /* NULL will match any address. */ - { test_kernel_rmw_array, NULL, 0, KCSAN_ACCESS_WRITE }, - { test_kernel_rmw_array, NULL, 0, 0 }, + { test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) }, + { test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(0) }, }, }; static const struct expect_report never = { @@ -620,6 +642,29 @@ static void test_read_plain_atomic_write(struct kunit *test) KUNIT_EXPECT_TRUE(test, match_expect); } +/* Test that atomic RMWs generate correct report. */ +__no_kcsan +static void test_read_plain_atomic_rmw(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { test_kernel_atomic_rmw, &test_var, sizeof(test_var), + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC }, + }, + }; + bool match_expect = false; + + if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) + return; + + begin_test_checks(test_kernel_read, test_kernel_atomic_rmw); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + /* Zero-sized accesses should never cause data race reports. */ __no_kcsan static void test_zero_size_access(struct kunit *test) @@ -853,6 +898,59 @@ static void test_seqlock_noreport(struct kunit *test) } /* + * Test atomic builtins work and required instrumentation functions exist. We + * also test that KCSAN understands they're atomic by racing with them via + * test_kernel_atomic_builtins(), and expect no reports. + * + * The atomic builtins _SHOULD NOT_ be used in normal kernel code! + */ +static void test_atomic_builtins(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_atomic_builtins, test_kernel_atomic_builtins); + do { + long tmp; + + kcsan_enable_current(); + + __atomic_store_n(&test_var, 42L, __ATOMIC_RELAXED); + KUNIT_EXPECT_EQ(test, 42L, __atomic_load_n(&test_var, __ATOMIC_RELAXED)); + + KUNIT_EXPECT_EQ(test, 42L, __atomic_exchange_n(&test_var, 20, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 20L, test_var); + + tmp = 20L; + KUNIT_EXPECT_TRUE(test, __atomic_compare_exchange_n(&test_var, &tmp, 30L, + 0, __ATOMIC_RELAXED, + __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, tmp, 20L); + KUNIT_EXPECT_EQ(test, test_var, 30L); + KUNIT_EXPECT_FALSE(test, __atomic_compare_exchange_n(&test_var, &tmp, 40L, + 1, __ATOMIC_RELAXED, + __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, tmp, 30L); + KUNIT_EXPECT_EQ(test, test_var, 30L); + + KUNIT_EXPECT_EQ(test, 30L, __atomic_fetch_add(&test_var, 1, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 31L, __atomic_fetch_sub(&test_var, 1, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 30L, __atomic_fetch_and(&test_var, 0xf, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 14L, __atomic_fetch_xor(&test_var, 0xf, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 1L, __atomic_fetch_or(&test_var, 0xf0, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 241L, __atomic_fetch_nand(&test_var, 0xf, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, -2L, test_var); + + __atomic_thread_fence(__ATOMIC_SEQ_CST); + __atomic_signal_fence(__ATOMIC_SEQ_CST); + + kcsan_disable_current(); + + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* * Each test case is run with different numbers of threads. Until KUnit supports * passing arguments for each test case, we encode #threads in the test case * name (read by get_num_threads()). [The '-' was chosen as a stylistic @@ -880,6 +978,7 @@ static struct kunit_case kcsan_test_cases[] = { KCSAN_KUNIT_CASE(test_write_write_struct_part), KCSAN_KUNIT_CASE(test_read_atomic_write_atomic), KCSAN_KUNIT_CASE(test_read_plain_atomic_write), + KCSAN_KUNIT_CASE(test_read_plain_atomic_rmw), KCSAN_KUNIT_CASE(test_zero_size_access), KCSAN_KUNIT_CASE(test_data_race), KCSAN_KUNIT_CASE(test_assert_exclusive_writer), @@ -891,6 +990,7 @@ static struct kunit_case kcsan_test_cases[] = { KCSAN_KUNIT_CASE(test_assert_exclusive_access_scoped), KCSAN_KUNIT_CASE(test_jiffies_noreport), KCSAN_KUNIT_CASE(test_seqlock_noreport), + KCSAN_KUNIT_CASE(test_atomic_builtins), {}, }; diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 29480010dc30..8d4bf3431b3c 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -8,6 +8,7 @@ #ifndef _KERNEL_KCSAN_KCSAN_H #define _KERNEL_KCSAN_KCSAN_H +#include <linux/atomic.h> #include <linux/kcsan.h> #include <linux/sched.h> @@ -34,6 +35,10 @@ void kcsan_restore_irqtrace(struct task_struct *task); */ void kcsan_debugfs_init(void); +/* + * Statistics counters displayed via debugfs; should only be modified in + * slow-paths. + */ enum kcsan_counter_id { /* * Number of watchpoints currently in use. @@ -86,12 +91,7 @@ enum kcsan_counter_id { KCSAN_COUNTER_COUNT, /* number of counters */ }; - -/* - * Increment/decrement counter with given id; avoid calling these in fast-path. - */ -extern void kcsan_counter_inc(enum kcsan_counter_id id); -extern void kcsan_counter_dec(enum kcsan_counter_id id); +extern atomic_long_t kcsan_counters[KCSAN_COUNTER_COUNT]; /* * Returns true if data races in the function symbol that maps to func_addr diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 9d07e175de0f..d3bf87e6007c 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -228,6 +228,10 @@ static const char *get_access_type(int type) return "write"; case KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: return "write (marked)"; + case KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE: + return "read-write"; + case KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: + return "read-write (marked)"; case KCSAN_ACCESS_SCOPED: return "read (scoped)"; case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_ATOMIC: @@ -275,8 +279,8 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries cur = strnstr(buf, "kcsan_", len); if (cur) { - cur += sizeof("kcsan_") - 1; - if (strncmp(cur, "test", sizeof("test") - 1)) + cur += strlen("kcsan_"); + if (!str_has_prefix(cur, "test")) continue; /* KCSAN runtime function. */ /* KCSAN related test. */ } @@ -555,7 +559,7 @@ static bool prepare_report_consumer(unsigned long *flags, * If the actual accesses to not match, this was a false * positive due to watchpoint encoding. */ - kcsan_counter_inc(KCSAN_COUNTER_ENCODING_FALSE_POSITIVES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ENCODING_FALSE_POSITIVES]); goto discard; } diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c index d26a052d3383..d98bc208d06d 100644 --- a/kernel/kcsan/selftest.c +++ b/kernel/kcsan/selftest.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) "kcsan: " fmt + #include <linux/init.h> #include <linux/kernel.h> #include <linux/printk.h> @@ -116,16 +118,16 @@ static int __init kcsan_selftest(void) if (do_test()) \ ++passed; \ else \ - pr_err("KCSAN selftest: " #do_test " failed"); \ + pr_err("selftest: " #do_test " failed"); \ } while (0) RUN_TEST(test_requires); RUN_TEST(test_encode_decode); RUN_TEST(test_matching_access); - pr_info("KCSAN selftest: %d/%d tests passed\n", passed, total); + pr_info("selftest: %d/%d tests passed\n", passed, total); if (passed != total) - panic("KCSAN selftests failed"); + panic("selftests failed"); return 0; } postcore_initcall(kcsan_selftest); diff --git a/kernel/kexec.c b/kernel/kexec.c index f977786fe498..c82c6c06f051 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -205,7 +205,7 @@ static inline int kexec_load_check(unsigned long nr_segments, return -EPERM; /* Permit LSMs and IMA to fail the kexec */ - result = security_kernel_load_data(LOADING_KEXEC_IMAGE); + result = security_kernel_load_data(LOADING_KEXEC_IMAGE, false); if (result < 0) return result; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index c19c0dad1ebe..8798a8183974 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -36,7 +36,7 @@ #include <linux/syscore_ops.h> #include <linux/compiler.h> #include <linux/hugetlb.h> -#include <linux/frame.h> +#include <linux/objtool.h> #include <asm/page.h> #include <asm/sections.h> @@ -109,7 +109,7 @@ EXPORT_SYMBOL_GPL(kexec_crash_loaded); * defined more restrictively in <asm/kexec.h>. * * The code for the transition from the current kernel to the - * the new kernel is placed in the control_code_buffer, whose size + * new kernel is placed in the control_code_buffer, whose size * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single * page of memory is necessary, but some architectures require more. * Because this memory must be identity mapped in the transition from diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index ca40bef75a61..e21f6b9234f7 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -24,6 +24,7 @@ #include <linux/elf.h> #include <linux/elfcore.h> #include <linux/kernel.h> +#include <linux/kernel_read_file.h> #include <linux/syscalls.h> #include <linux/vmalloc.h> #include "kexec_internal.h" @@ -219,13 +220,12 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, { int ret; void *ldata; - loff_t size; - ret = kernel_read_file_from_fd(kernel_fd, &image->kernel_buf, - &size, INT_MAX, READING_KEXEC_IMAGE); - if (ret) + ret = kernel_read_file_from_fd(kernel_fd, 0, &image->kernel_buf, + INT_MAX, NULL, READING_KEXEC_IMAGE); + if (ret < 0) return ret; - image->kernel_buf_len = size; + image->kernel_buf_len = ret; /* Call arch image probe handlers */ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, @@ -241,12 +241,13 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, #endif /* It is possible that there no initramfs is being loaded */ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { - ret = kernel_read_file_from_fd(initrd_fd, &image->initrd_buf, - &size, INT_MAX, + ret = kernel_read_file_from_fd(initrd_fd, 0, &image->initrd_buf, + INT_MAX, NULL, READING_KEXEC_INITRAMFS); - if (ret) + if (ret < 0) goto out; - image->initrd_buf_len = size; + image->initrd_buf_len = ret; + ret = 0; } if (cmdline_len) { @@ -520,7 +521,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) /* Returning 0 will take to next memory range */ /* Don't use memory that will be detected and handled by a driver. */ - if (res->flags & IORESOURCE_MEM_DRIVER_MANAGED) + if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED) return 0; if (sz < kbuf->memsz) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 287b263c9cb9..8a12a25fa40d 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -36,6 +36,7 @@ #include <linux/cpu.h> #include <linux/jump_label.h> #include <linux/perf_event.h> +#include <linux/static_call.h> #include <asm/sections.h> #include <asm/cacheflush.h> @@ -1223,8 +1224,7 @@ void kprobes_inc_nmissed_count(struct kprobe *p) } NOKPROBE_SYMBOL(kprobes_inc_nmissed_count); -void recycle_rp_inst(struct kretprobe_instance *ri, - struct hlist_head *head) +static void recycle_rp_inst(struct kretprobe_instance *ri) { struct kretprobe *rp = ri->rp; @@ -1236,12 +1236,11 @@ void recycle_rp_inst(struct kretprobe_instance *ri, hlist_add_head(&ri->hlist, &rp->free_instances); raw_spin_unlock(&rp->lock); } else - /* Unregistering */ - hlist_add_head(&ri->hlist, head); + kfree_rcu(ri, rcu); } NOKPROBE_SYMBOL(recycle_rp_inst); -void kretprobe_hash_lock(struct task_struct *tsk, +static void kretprobe_hash_lock(struct task_struct *tsk, struct hlist_head **head, unsigned long *flags) __acquires(hlist_lock) { @@ -1263,7 +1262,7 @@ __acquires(hlist_lock) } NOKPROBE_SYMBOL(kretprobe_table_lock); -void kretprobe_hash_unlock(struct task_struct *tsk, +static void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) __releases(hlist_lock) { @@ -1284,7 +1283,7 @@ __releases(hlist_lock) } NOKPROBE_SYMBOL(kretprobe_table_unlock); -struct kprobe kprobe_busy = { +static struct kprobe kprobe_busy = { .addr = (void *) get_kprobe, }; @@ -1313,7 +1312,7 @@ void kprobe_busy_end(void) void kprobe_flush_task(struct task_struct *tk) { struct kretprobe_instance *ri; - struct hlist_head *head, empty_rp; + struct hlist_head *head; struct hlist_node *tmp; unsigned long hash, flags = 0; @@ -1323,19 +1322,14 @@ void kprobe_flush_task(struct task_struct *tk) kprobe_busy_begin(); - INIT_HLIST_HEAD(&empty_rp); hash = hash_ptr(tk, KPROBE_HASH_BITS); head = &kretprobe_inst_table[hash]; kretprobe_table_lock(hash, &flags); hlist_for_each_entry_safe(ri, tmp, head, hlist) { if (ri->task == tk) - recycle_rp_inst(ri, &empty_rp); + recycle_rp_inst(ri); } kretprobe_table_unlock(hash, &flags); - hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } kprobe_busy_end(); } @@ -1359,7 +1353,8 @@ static void cleanup_rp_inst(struct kretprobe *rp) struct hlist_node *next; struct hlist_head *head; - /* No race here */ + /* To avoid recursive kretprobe by NMI, set kprobe busy here */ + kprobe_busy_begin(); for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { kretprobe_table_lock(hash, &flags); head = &kretprobe_inst_table[hash]; @@ -1369,6 +1364,8 @@ static void cleanup_rp_inst(struct kretprobe *rp) } kretprobe_table_unlock(hash, &flags); } + kprobe_busy_end(); + free_rp_inst(rp); } NOKPROBE_SYMBOL(cleanup_rp_inst); @@ -1634,6 +1631,7 @@ static int check_kprobe_address_safe(struct kprobe *p, if (!kernel_text_address((unsigned long) p->addr) || within_kprobe_blacklist((unsigned long) p->addr) || jump_label_text_reserved(p->addr, p->addr) || + static_call_text_reserved(p->addr, p->addr) || find_bug((unsigned long)p->addr)) { ret = -EINVAL; goto out; @@ -1927,6 +1925,97 @@ unsigned long __weak arch_deref_entry_point(void *entry) } #ifdef CONFIG_KRETPROBES + +unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, + void *trampoline_address, + void *frame_pointer) +{ + struct kretprobe_instance *ri = NULL, *last = NULL; + struct hlist_head *head; + struct hlist_node *tmp; + unsigned long flags; + kprobe_opcode_t *correct_ret_addr = NULL; + bool skipped = false; + + kretprobe_hash_lock(current, &head, &flags); + + /* + * It is possible to have multiple instances associated with a given + * task either because multiple functions in the call path have + * return probes installed on them, and/or more than one + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always pushed into the head of the list + * - when multiple return probes are registered for the same + * function, the (chronologically) first instance's ret_addr + * will be the real return address, and all the rest will + * point to kretprobe_trampoline. + */ + hlist_for_each_entry(ri, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + /* + * Return probes must be pushed on this hash list correct + * order (same as return order) so that it can be popped + * correctly. However, if we find it is pushed it incorrect + * order, this means we find a function which should not be + * probed, because the wrong order entry is pushed on the + * path of processing other kretprobe itself. + */ + if (ri->fp != frame_pointer) { + if (!skipped) + pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n"); + skipped = true; + continue; + } + + correct_ret_addr = ri->ret_addr; + if (skipped) + pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n", + ri->rp->kp.addr); + + if (correct_ret_addr != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + BUG_ON(!correct_ret_addr || (correct_ret_addr == trampoline_address)); + last = ri; + + hlist_for_each_entry_safe(ri, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + if (ri->fp != frame_pointer) + continue; + + if (ri->rp && ri->rp->handler) { + struct kprobe *prev = kprobe_running(); + + __this_cpu_write(current_kprobe, &ri->rp->kp); + ri->ret_addr = correct_ret_addr; + ri->rp->handler(ri, regs); + __this_cpu_write(current_kprobe, prev); + } + + recycle_rp_inst(ri); + + if (ri == last) + break; + } + + kretprobe_hash_unlock(current, &flags); + + return (unsigned long)correct_ret_addr; +} +NOKPROBE_SYMBOL(__kretprobe_trampoline_handler) + /* * This kprobe pre_handler is registered with every kretprobe. When probe * hits it will set up the return probe. @@ -1937,17 +2026,6 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) unsigned long hash, flags = 0; struct kretprobe_instance *ri; - /* - * To avoid deadlocks, prohibit return probing in NMI contexts, - * just skip the probe and increase the (inexact) 'nmissed' - * statistical counter, so that the user is informed that - * something happened: - */ - if (unlikely(in_nmi())) { - rp->nmissed++; - return 0; - } - /* TODO: consider to only swap the RA after the last pre_handler fired */ hash = hash_ptr(current, KPROBE_HASH_BITS); raw_spin_lock_irqsave(&rp->lock, flags); @@ -2140,6 +2218,9 @@ static void kill_kprobe(struct kprobe *p) lockdep_assert_held(&kprobe_mutex); + if (WARN_ON_ONCE(kprobe_gone(p))) + return; + p->flags |= KPROBE_FLAG_GONE; if (kprobe_aggrprobe(p)) { /* @@ -2159,9 +2240,10 @@ static void kill_kprobe(struct kprobe *p) /* * The module is going away. We should disarm the kprobe which - * is using ftrace. + * is using ftrace, because ftrace framework is still available at + * MODULE_STATE_GOING notification. */ - if (kprobe_ftrace(p)) + if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed) disarm_kprobe_ftrace(p); } @@ -2419,7 +2501,10 @@ static int kprobes_module_callback(struct notifier_block *nb, mutex_lock(&kprobe_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry(p, head, hlist) + hlist_for_each_entry(p, head, hlist) { + if (kprobe_gone(p)) + continue; + if (within_module_init((unsigned long)p->addr, mod) || (checkcore && within_module_core((unsigned long)p->addr, mod))) { @@ -2436,6 +2521,7 @@ static int kprobes_module_callback(struct notifier_block *nb, */ kill_kprobe(p); } + } } if (val == MODULE_STATE_GOING) remove_module_kprobe_blacklist(mod); @@ -2452,6 +2538,28 @@ static struct notifier_block kprobe_module_nb = { extern unsigned long __start_kprobe_blacklist[]; extern unsigned long __stop_kprobe_blacklist[]; +void kprobe_free_init_mem(void) +{ + void *start = (void *)(&__init_begin); + void *end = (void *)(&__init_end); + struct hlist_head *head; + struct kprobe *p; + int i; + + mutex_lock(&kprobe_mutex); + + /* Kill all kprobes on initmem */ + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry(p, head, hlist) { + if (start <= (void *)p->addr && (void *)p->addr < end) + kill_kprobe(p); + } + } + + mutex_unlock(&kprobe_mutex); +} + static int __init init_kprobes(void) { int i, err = 0; @@ -2506,7 +2614,7 @@ static int __init init_kprobes(void) init_test_probes(); return err; } -subsys_initcall(init_kprobes); +early_initcall(init_kprobes); #ifdef CONFIG_DEBUG_FS static void report_probe(struct seq_file *pi, struct kprobe *p, diff --git a/kernel/kthread.c b/kernel/kthread.c index 3edaa380dc7b..e29773c82b70 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -775,7 +775,7 @@ EXPORT_SYMBOL(kthread_create_worker); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it - * it to a given CPU and the associated NUMA node. + * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). diff --git a/kernel/livepatch/state.c b/kernel/livepatch/state.c index 7ee19476de9d..2565d039ade0 100644 --- a/kernel/livepatch/state.c +++ b/kernel/livepatch/state.c @@ -55,7 +55,7 @@ EXPORT_SYMBOL_GPL(klp_get_state); * * The function can be called only during transition when a new * livepatch is being enabled or when such a transition is reverted. - * It is typically called only from from pre/post (un)patch + * It is typically called only from pre/post (un)patch * callbacks. * * Return: pointer to the latest struct klp_state from already diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 2fad21d345b0..b71ad8d9f1c9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -76,6 +76,23 @@ module_param(lock_stat, int, 0644); #define lock_stat 0 #endif +DEFINE_PER_CPU(unsigned int, lockdep_recursion); +EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion); + +static inline bool lockdep_enabled(void) +{ + if (!debug_locks) + return false; + + if (this_cpu_read(lockdep_recursion)) + return false; + + if (current->lockdep_recursion) + return false; + + return true; +} + /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. @@ -93,7 +110,7 @@ static inline void lockdep_lock(void) arch_spin_lock(&__lock); __owner = current; - current->lockdep_recursion++; + __this_cpu_inc(lockdep_recursion); } static inline void lockdep_unlock(void) @@ -101,7 +118,7 @@ static inline void lockdep_unlock(void) if (debug_locks && DEBUG_LOCKS_WARN_ON(__owner != current)) return; - current->lockdep_recursion--; + __this_cpu_dec(lockdep_recursion); __owner = NULL; arch_spin_unlock(&__lock); } @@ -372,6 +389,21 @@ static struct hlist_head classhash_table[CLASSHASH_SIZE]; static struct hlist_head chainhash_table[CHAINHASH_SIZE]; /* + * the id of held_lock + */ +static inline u16 hlock_id(struct held_lock *hlock) +{ + BUILD_BUG_ON(MAX_LOCKDEP_KEYS_BITS + 2 > 16); + + return (hlock->class_idx | (hlock->read << MAX_LOCKDEP_KEYS_BITS)); +} + +static inline unsigned int chain_hlock_class_idx(u16 hlock_id) +{ + return hlock_id & (MAX_LOCKDEP_KEYS - 1); +} + +/* * The hash key of the lock dependency chains is a hash itself too: * it's a hash of all locks taken up to that lock, including that lock. * It's a 64-bit hash, because it's important for the keys to be @@ -393,10 +425,15 @@ void lockdep_init_task(struct task_struct *task) task->lockdep_recursion = 0; } +static __always_inline void lockdep_recursion_inc(void) +{ + __this_cpu_inc(lockdep_recursion); +} + static __always_inline void lockdep_recursion_finish(void) { - if (WARN_ON_ONCE((--current->lockdep_recursion) & LOCKDEP_RECURSION_MASK)) - current->lockdep_recursion = 0; + if (WARN_ON_ONCE(__this_cpu_dec_return(lockdep_recursion))) + __this_cpu_write(lockdep_recursion, 0); } void lockdep_set_selftest_task(struct task_struct *task) @@ -585,6 +622,8 @@ static const char *usage_str[] = #include "lockdep_states.h" #undef LOCKDEP_STATE [LOCK_USED] = "INITIAL USE", + [LOCK_USED_READ] = "INITIAL READ USE", + /* abused as string storage for verify_lock_unused() */ [LOCK_USAGE_STATES] = "IN-NMI", }; #endif @@ -1320,7 +1359,7 @@ static struct lock_list *alloc_list_entry(void) */ static int add_lock_to_list(struct lock_class *this, struct lock_class *links_to, struct list_head *head, - unsigned long ip, int distance, + unsigned long ip, u16 distance, u8 dep, const struct lock_trace *trace) { struct lock_list *entry; @@ -1334,6 +1373,7 @@ static int add_lock_to_list(struct lock_class *this, entry->class = this; entry->links_to = links_to; + entry->dep = dep; entry->distance = distance; entry->trace = trace; /* @@ -1421,23 +1461,19 @@ static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) return (cq->rear - cq->front) & CQ_MASK; } -static inline void mark_lock_accessed(struct lock_list *lock, - struct lock_list *parent) +static inline void mark_lock_accessed(struct lock_list *lock) { - unsigned long nr; + lock->class->dep_gen_id = lockdep_dependency_gen_id; +} - nr = lock - list_entries; - WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */ +static inline void visit_lock_entry(struct lock_list *lock, + struct lock_list *parent) +{ lock->parent = parent; - lock->class->dep_gen_id = lockdep_dependency_gen_id; } static inline unsigned long lock_accessed(struct lock_list *lock) { - unsigned long nr; - - nr = lock - list_entries; - WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */ return lock->class->dep_gen_id == lockdep_dependency_gen_id; } @@ -1471,85 +1507,283 @@ static inline struct list_head *get_dep_list(struct lock_list *lock, int offset) return lock_class + offset; } +/* + * Return values of a bfs search: + * + * BFS_E* indicates an error + * BFS_R* indicates a result (match or not) + * + * BFS_EINVALIDNODE: Find a invalid node in the graph. + * + * BFS_EQUEUEFULL: The queue is full while doing the bfs. + * + * BFS_RMATCH: Find the matched node in the graph, and put that node into + * *@target_entry. + * + * BFS_RNOMATCH: Haven't found the matched node and keep *@target_entry + * _unchanged_. + */ +enum bfs_result { + BFS_EINVALIDNODE = -2, + BFS_EQUEUEFULL = -1, + BFS_RMATCH = 0, + BFS_RNOMATCH = 1, +}; + +/* + * bfs_result < 0 means error + */ +static inline bool bfs_error(enum bfs_result res) +{ + return res < 0; +} + +/* + * DEP_*_BIT in lock_list::dep + * + * For dependency @prev -> @next: + * + * SR: @prev is shared reader (->read != 0) and @next is recursive reader + * (->read == 2) + * ER: @prev is exclusive locker (->read == 0) and @next is recursive reader + * SN: @prev is shared reader and @next is non-recursive locker (->read != 2) + * EN: @prev is exclusive locker and @next is non-recursive locker + * + * Note that we define the value of DEP_*_BITs so that: + * bit0 is prev->read == 0 + * bit1 is next->read != 2 + */ +#define DEP_SR_BIT (0 + (0 << 1)) /* 0 */ +#define DEP_ER_BIT (1 + (0 << 1)) /* 1 */ +#define DEP_SN_BIT (0 + (1 << 1)) /* 2 */ +#define DEP_EN_BIT (1 + (1 << 1)) /* 3 */ + +#define DEP_SR_MASK (1U << (DEP_SR_BIT)) +#define DEP_ER_MASK (1U << (DEP_ER_BIT)) +#define DEP_SN_MASK (1U << (DEP_SN_BIT)) +#define DEP_EN_MASK (1U << (DEP_EN_BIT)) + +static inline unsigned int +__calc_dep_bit(struct held_lock *prev, struct held_lock *next) +{ + return (prev->read == 0) + ((next->read != 2) << 1); +} + +static inline u8 calc_dep(struct held_lock *prev, struct held_lock *next) +{ + return 1U << __calc_dep_bit(prev, next); +} + +/* + * calculate the dep_bit for backwards edges. We care about whether @prev is + * shared and whether @next is recursive. + */ +static inline unsigned int +__calc_dep_bitb(struct held_lock *prev, struct held_lock *next) +{ + return (next->read != 2) + ((prev->read == 0) << 1); +} + +static inline u8 calc_depb(struct held_lock *prev, struct held_lock *next) +{ + return 1U << __calc_dep_bitb(prev, next); +} /* - * Forward- or backward-dependency search, used for both circular dependency - * checking and hardirq-unsafe/softirq-unsafe checking. + * Initialize a lock_list entry @lock belonging to @class as the root for a BFS + * search. */ -static int __bfs(struct lock_list *source_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry, - int offset) +static inline void __bfs_init_root(struct lock_list *lock, + struct lock_class *class) { + lock->class = class; + lock->parent = NULL; + lock->only_xr = 0; +} + +/* + * Initialize a lock_list entry @lock based on a lock acquisition @hlock as the + * root for a BFS search. + * + * ->only_xr of the initial lock node is set to @hlock->read == 2, to make sure + * that <prev> -> @hlock and @hlock -> <whatever __bfs() found> is not -(*R)-> + * and -(S*)->. + */ +static inline void bfs_init_root(struct lock_list *lock, + struct held_lock *hlock) +{ + __bfs_init_root(lock, hlock_class(hlock)); + lock->only_xr = (hlock->read == 2); +} + +/* + * Similar to bfs_init_root() but initialize the root for backwards BFS. + * + * ->only_xr of the initial lock node is set to @hlock->read != 0, to make sure + * that <next> -> @hlock and @hlock -> <whatever backwards BFS found> is not + * -(*S)-> and -(R*)-> (reverse order of -(*R)-> and -(S*)->). + */ +static inline void bfs_init_rootb(struct lock_list *lock, + struct held_lock *hlock) +{ + __bfs_init_root(lock, hlock_class(hlock)); + lock->only_xr = (hlock->read != 0); +} + +static inline struct lock_list *__bfs_next(struct lock_list *lock, int offset) +{ + if (!lock || !lock->parent) + return NULL; + + return list_next_or_null_rcu(get_dep_list(lock->parent, offset), + &lock->entry, struct lock_list, entry); +} + +/* + * Breadth-First Search to find a strong path in the dependency graph. + * + * @source_entry: the source of the path we are searching for. + * @data: data used for the second parameter of @match function + * @match: match function for the search + * @target_entry: pointer to the target of a matched path + * @offset: the offset to struct lock_class to determine whether it is + * locks_after or locks_before + * + * We may have multiple edges (considering different kinds of dependencies, + * e.g. ER and SN) between two nodes in the dependency graph. But + * only the strong dependency path in the graph is relevant to deadlocks. A + * strong dependency path is a dependency path that doesn't have two adjacent + * dependencies as -(*R)-> -(S*)->, please see: + * + * Documentation/locking/lockdep-design.rst + * + * for more explanation of the definition of strong dependency paths + * + * In __bfs(), we only traverse in the strong dependency path: + * + * In lock_list::only_xr, we record whether the previous dependency only + * has -(*R)-> in the search, and if it does (prev only has -(*R)->), we + * filter out any -(S*)-> in the current dependency and after that, the + * ->only_xr is set according to whether we only have -(*R)-> left. + */ +static enum bfs_result __bfs(struct lock_list *source_entry, + void *data, + bool (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry, + int offset) +{ + struct circular_queue *cq = &lock_cq; + struct lock_list *lock = NULL; struct lock_list *entry; - struct lock_list *lock; struct list_head *head; - struct circular_queue *cq = &lock_cq; - int ret = 1; + unsigned int cq_depth; + bool first; lockdep_assert_locked(); - if (match(source_entry, data)) { - *target_entry = source_entry; - ret = 0; - goto exit; - } - - head = get_dep_list(source_entry, offset); - if (list_empty(head)) - goto exit; - __cq_init(cq); __cq_enqueue(cq, source_entry); - while ((lock = __cq_dequeue(cq))) { + while ((lock = __bfs_next(lock, offset)) || (lock = __cq_dequeue(cq))) { + if (!lock->class) + return BFS_EINVALIDNODE; + + /* + * Step 1: check whether we already finish on this one. + * + * If we have visited all the dependencies from this @lock to + * others (iow, if we have visited all lock_list entries in + * @lock->class->locks_{after,before}) we skip, otherwise go + * and visit all the dependencies in the list and mark this + * list accessed. + */ + if (lock_accessed(lock)) + continue; + else + mark_lock_accessed(lock); - if (!lock->class) { - ret = -2; - goto exit; + /* + * Step 2: check whether prev dependency and this form a strong + * dependency path. + */ + if (lock->parent) { /* Parent exists, check prev dependency */ + u8 dep = lock->dep; + bool prev_only_xr = lock->parent->only_xr; + + /* + * Mask out all -(S*)-> if we only have *R in previous + * step, because -(*R)-> -(S*)-> don't make up a strong + * dependency. + */ + if (prev_only_xr) + dep &= ~(DEP_SR_MASK | DEP_SN_MASK); + + /* If nothing left, we skip */ + if (!dep) + continue; + + /* If there are only -(*R)-> left, set that for the next step */ + lock->only_xr = !(dep & (DEP_SN_MASK | DEP_EN_MASK)); } - head = get_dep_list(lock, offset); + /* + * Step 3: we haven't visited this and there is a strong + * dependency path to this, so check with @match. + */ + if (match(lock, data)) { + *target_entry = lock; + return BFS_RMATCH; + } + /* + * Step 4: if not match, expand the path by adding the + * forward or backwards dependencis in the search + * + */ + first = true; + head = get_dep_list(lock, offset); list_for_each_entry_rcu(entry, head, entry) { - if (!lock_accessed(entry)) { - unsigned int cq_depth; - mark_lock_accessed(entry, lock); - if (match(entry, data)) { - *target_entry = entry; - ret = 0; - goto exit; - } + visit_lock_entry(entry, lock); - if (__cq_enqueue(cq, entry)) { - ret = -1; - goto exit; - } - cq_depth = __cq_get_elem_count(cq); - if (max_bfs_queue_depth < cq_depth) - max_bfs_queue_depth = cq_depth; - } + /* + * Note we only enqueue the first of the list into the + * queue, because we can always find a sibling + * dependency from one (see __bfs_next()), as a result + * the space of queue is saved. + */ + if (!first) + continue; + + first = false; + + if (__cq_enqueue(cq, entry)) + return BFS_EQUEUEFULL; + + cq_depth = __cq_get_elem_count(cq); + if (max_bfs_queue_depth < cq_depth) + max_bfs_queue_depth = cq_depth; } } -exit: - return ret; + + return BFS_RNOMATCH; } -static inline int __bfs_forwards(struct lock_list *src_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry) +static inline enum bfs_result +__bfs_forwards(struct lock_list *src_entry, + void *data, + bool (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry) { return __bfs(src_entry, data, match, target_entry, offsetof(struct lock_class, locks_after)); } -static inline int __bfs_backwards(struct lock_list *src_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry) +static inline enum bfs_result +__bfs_backwards(struct lock_list *src_entry, + void *data, + bool (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry) { return __bfs(src_entry, data, match, target_entry, offsetof(struct lock_class, locks_before)); @@ -1659,15 +1893,72 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, print_circular_bug_entry(entry, depth); } -static inline int class_equal(struct lock_list *entry, void *data) +/* + * We are about to add A -> B into the dependency graph, and in __bfs() a + * strong dependency path A -> .. -> B is found: hlock_class equals + * entry->class. + * + * If A -> .. -> B can replace A -> B in any __bfs() search (means the former + * is _stronger_ than or equal to the latter), we consider A -> B as redundant. + * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A + * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the + * dependency graph, as any strong path ..-> A -> B ->.. we can get with + * having dependency A -> B, we could already get a equivalent path ..-> A -> + * .. -> B -> .. with A -> .. -> B. Therefore A -> B is reduntant. + * + * We need to make sure both the start and the end of A -> .. -> B is not + * weaker than A -> B. For the start part, please see the comment in + * check_redundant(). For the end part, we need: + * + * Either + * + * a) A -> B is -(*R)-> (everything is not weaker than that) + * + * or + * + * b) A -> .. -> B is -(*N)-> (nothing is stronger than this) + * + */ +static inline bool hlock_equal(struct lock_list *entry, void *data) +{ + struct held_lock *hlock = (struct held_lock *)data; + + return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */ + (hlock->read == 2 || /* A -> B is -(*R)-> */ + !entry->only_xr); /* A -> .. -> B is -(*N)-> */ +} + +/* + * We are about to add B -> A into the dependency graph, and in __bfs() a + * strong dependency path A -> .. -> B is found: hlock_class equals + * entry->class. + * + * We will have a deadlock case (conflict) if A -> .. -> B -> A is a strong + * dependency cycle, that means: + * + * Either + * + * a) B -> A is -(E*)-> + * + * or + * + * b) A -> .. -> B is -(*N)-> (i.e. A -> .. -(*N)-> B) + * + * as then we don't have -(*R)-> -(S*)-> in the cycle. + */ +static inline bool hlock_conflict(struct lock_list *entry, void *data) { - return entry->class == data; + struct held_lock *hlock = (struct held_lock *)data; + + return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */ + (hlock->read == 0 || /* B -> A is -(E*)-> */ + !entry->only_xr); /* A -> .. -> B is -(*N)-> */ } static noinline void print_circular_bug(struct lock_list *this, - struct lock_list *target, - struct held_lock *check_src, - struct held_lock *check_tgt) + struct lock_list *target, + struct held_lock *check_src, + struct held_lock *check_tgt) { struct task_struct *curr = current; struct lock_list *parent; @@ -1714,10 +2005,10 @@ static noinline void print_bfs_bug(int ret) WARN(1, "lockdep bfs error:%d\n", ret); } -static int noop_count(struct lock_list *entry, void *data) +static bool noop_count(struct lock_list *entry, void *data) { (*(unsigned long *)data)++; - return 0; + return false; } static unsigned long __lockdep_count_forward_deps(struct lock_list *this) @@ -1734,8 +2025,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) unsigned long ret, flags; struct lock_list this; - this.parent = NULL; - this.class = class; + __bfs_init_root(&this, class); raw_local_irq_save(flags); lockdep_lock(); @@ -1761,8 +2051,7 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) unsigned long ret, flags; struct lock_list this; - this.parent = NULL; - this.class = class; + __bfs_init_root(&this, class); raw_local_irq_save(flags); lockdep_lock(); @@ -1775,18 +2064,18 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) /* * Check that the dependency graph starting at <src> can lead to - * <target> or not. Print an error and return 0 if it does. + * <target> or not. */ -static noinline int -check_path(struct lock_class *target, struct lock_list *src_entry, +static noinline enum bfs_result +check_path(struct held_lock *target, struct lock_list *src_entry, + bool (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry) { - int ret; + enum bfs_result ret; - ret = __bfs_forwards(src_entry, (void *)target, class_equal, - target_entry); + ret = __bfs_forwards(src_entry, target, match, target_entry); - if (unlikely(ret < 0)) + if (unlikely(bfs_error(ret))) print_bfs_bug(ret); return ret; @@ -1797,24 +2086,23 @@ check_path(struct lock_class *target, struct lock_list *src_entry, * lead to <target>. If it can, there is a circle when adding * <target> -> <src> dependency. * - * Print an error and return 0 if it does. + * Print an error and return BFS_RMATCH if it does. */ -static noinline int +static noinline enum bfs_result check_noncircular(struct held_lock *src, struct held_lock *target, struct lock_trace **const trace) { - int ret; + enum bfs_result ret; struct lock_list *target_entry; - struct lock_list src_entry = { - .class = hlock_class(src), - .parent = NULL, - }; + struct lock_list src_entry; + + bfs_init_root(&src_entry, src); debug_atomic_inc(nr_cyclic_checks); - ret = check_path(hlock_class(target), &src_entry, &target_entry); + ret = check_path(target, &src_entry, hlock_conflict, &target_entry); - if (unlikely(!ret)) { + if (unlikely(ret == BFS_RMATCH)) { if (!*trace) { /* * If save_trace fails here, the printing might @@ -1836,27 +2124,35 @@ check_noncircular(struct held_lock *src, struct held_lock *target, * <target> or not. If it can, <src> -> <target> dependency is already * in the graph. * - * Print an error and return 2 if it does or 1 if it does not. + * Return BFS_RMATCH if it does, or BFS_RMATCH if it does not, return BFS_E* if + * any error appears in the bfs search. */ -static noinline int +static noinline enum bfs_result check_redundant(struct held_lock *src, struct held_lock *target) { - int ret; + enum bfs_result ret; struct lock_list *target_entry; - struct lock_list src_entry = { - .class = hlock_class(src), - .parent = NULL, - }; + struct lock_list src_entry; + + bfs_init_root(&src_entry, src); + /* + * Special setup for check_redundant(). + * + * To report redundant, we need to find a strong dependency path that + * is equal to or stronger than <src> -> <target>. So if <src> is E, + * we need to let __bfs() only search for a path starting at a -(E*)->, + * we achieve this by setting the initial node's ->only_xr to true in + * that case. And if <prev> is S, we set initial ->only_xr to false + * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant. + */ + src_entry.only_xr = src->read == 0; debug_atomic_inc(nr_redundant_checks); - ret = check_path(hlock_class(target), &src_entry, &target_entry); + ret = check_path(target, &src_entry, hlock_equal, &target_entry); - if (!ret) { + if (ret == BFS_RMATCH) debug_atomic_inc(nr_redundant); - ret = 2; - } else if (ret < 0) - ret = 0; return ret; } @@ -1864,39 +2160,86 @@ check_redundant(struct held_lock *src, struct held_lock *target) #ifdef CONFIG_TRACE_IRQFLAGS -static inline int usage_accumulate(struct lock_list *entry, void *mask) -{ - *(unsigned long *)mask |= entry->class->usage_mask; - - return 0; -} - /* * Forwards and backwards subgraph searching, for the purposes of * proving that two subgraphs can be connected by a new dependency * without creating any illegal irq-safe -> irq-unsafe lock dependency. + * + * A irq safe->unsafe deadlock happens with the following conditions: + * + * 1) We have a strong dependency path A -> ... -> B + * + * 2) and we have ENABLED_IRQ usage of B and USED_IN_IRQ usage of A, therefore + * irq can create a new dependency B -> A (consider the case that a holder + * of B gets interrupted by an irq whose handler will try to acquire A). + * + * 3) the dependency circle A -> ... -> B -> A we get from 1) and 2) is a + * strong circle: + * + * For the usage bits of B: + * a) if A -> B is -(*N)->, then B -> A could be any type, so any + * ENABLED_IRQ usage suffices. + * b) if A -> B is -(*R)->, then B -> A must be -(E*)->, so only + * ENABLED_IRQ_*_READ usage suffices. + * + * For the usage bits of A: + * c) if A -> B is -(E*)->, then B -> A could be any type, so any + * USED_IN_IRQ usage suffices. + * d) if A -> B is -(S*)->, then B -> A must be -(*N)->, so only + * USED_IN_IRQ_*_READ usage suffices. */ -static inline int usage_match(struct lock_list *entry, void *mask) +/* + * There is a strong dependency path in the dependency graph: A -> B, and now + * we need to decide which usage bit of A should be accumulated to detect + * safe->unsafe bugs. + * + * Note that usage_accumulate() is used in backwards search, so ->only_xr + * stands for whether A -> B only has -(S*)-> (in this case ->only_xr is true). + * + * As above, if only_xr is false, which means A -> B has -(E*)-> dependency + * path, any usage of A should be considered. Otherwise, we should only + * consider _READ usage. + */ +static inline bool usage_accumulate(struct lock_list *entry, void *mask) +{ + if (!entry->only_xr) + *(unsigned long *)mask |= entry->class->usage_mask; + else /* Mask out _READ usage bits */ + *(unsigned long *)mask |= (entry->class->usage_mask & LOCKF_IRQ); + + return false; +} + +/* + * There is a strong dependency path in the dependency graph: A -> B, and now + * we need to decide which usage bit of B conflicts with the usage bits of A, + * i.e. which usage bit of B may introduce safe->unsafe deadlocks. + * + * As above, if only_xr is false, which means A -> B has -(*N)-> dependency + * path, any usage of B should be considered. Otherwise, we should only + * consider _READ usage. + */ +static inline bool usage_match(struct lock_list *entry, void *mask) { - return entry->class->usage_mask & *(unsigned long *)mask; + if (!entry->only_xr) + return !!(entry->class->usage_mask & *(unsigned long *)mask); + else /* Mask out _READ usage bits */ + return !!((entry->class->usage_mask & LOCKF_IRQ) & *(unsigned long *)mask); } /* * Find a node in the forwards-direction dependency sub-graph starting * at @root->class that matches @bit. * - * Return 0 if such a node exists in the subgraph, and put that node + * Return BFS_MATCH if such a node exists in the subgraph, and put that node * into *@target_entry. - * - * Return 1 otherwise and keep *@target_entry unchanged. - * Return <0 on error. */ -static int +static enum bfs_result find_usage_forwards(struct lock_list *root, unsigned long usage_mask, struct lock_list **target_entry) { - int result; + enum bfs_result result; debug_atomic_inc(nr_find_usage_forwards_checks); @@ -1908,18 +2251,12 @@ find_usage_forwards(struct lock_list *root, unsigned long usage_mask, /* * Find a node in the backwards-direction dependency sub-graph starting * at @root->class that matches @bit. - * - * Return 0 if such a node exists in the subgraph, and put that node - * into *@target_entry. - * - * Return 1 otherwise and keep *@target_entry unchanged. - * Return <0 on error. */ -static int +static enum bfs_result find_usage_backwards(struct lock_list *root, unsigned long usage_mask, struct lock_list **target_entry) { - int result; + enum bfs_result result; debug_atomic_inc(nr_find_usage_backwards_checks); @@ -1939,7 +2276,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) #endif printk(KERN_CONT " {\n"); - for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { + for (bit = 0; bit < LOCK_TRACE_STATES; bit++) { if (class->usage_mask & (1 << bit)) { int len = depth; @@ -2179,17 +2516,39 @@ static unsigned long invert_dir_mask(unsigned long mask) } /* - * As above, we clear bitnr0 (LOCK_*_READ off) with bitmask ops. First, for all - * bits with bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). - * And then mask out all bitnr0. + * Note that a LOCK_ENABLED_IRQ_*_READ usage and a LOCK_USED_IN_IRQ_*_READ + * usage may cause deadlock too, for example: + * + * P1 P2 + * <irq disabled> + * write_lock(l1); <irq enabled> + * read_lock(l2); + * write_lock(l2); + * <in irq> + * read_lock(l1); + * + * , in above case, l1 will be marked as LOCK_USED_IN_IRQ_HARDIRQ_READ and l2 + * will marked as LOCK_ENABLE_IRQ_HARDIRQ_READ, and this is a possible + * deadlock. + * + * In fact, all of the following cases may cause deadlocks: + * + * LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_* + * LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_* + * LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_*_READ + * LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_*_READ + * + * As a result, to calculate the "exclusive mask", first we invert the + * direction (USED_IN/ENABLED) of the original mask, and 1) for all bits with + * bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). 2) for all + * bits with bitnr0 cleared (LOCK_*_READ), add those with bitnr0 set (LOCK_*). */ static unsigned long exclusive_mask(unsigned long mask) { unsigned long excl = invert_dir_mask(mask); - /* Strip read */ excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK; - excl &= ~LOCKF_IRQ_READ; + excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK; return excl; } @@ -2206,6 +2565,7 @@ static unsigned long original_mask(unsigned long mask) unsigned long excl = invert_dir_mask(mask); /* Include read in existing usages */ + excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK; excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK; return excl; @@ -2220,14 +2580,24 @@ static int find_exclusive_match(unsigned long mask, enum lock_usage_bit *bitp, enum lock_usage_bit *excl_bitp) { - int bit, excl; + int bit, excl, excl_read; for_each_set_bit(bit, &mask, LOCK_USED) { + /* + * exclusive_bit() strips the read bit, however, + * LOCK_ENABLED_IRQ_*_READ may cause deadlocks too, so we need + * to search excl | LOCK_USAGE_READ_MASK as well. + */ excl = exclusive_bit(bit); + excl_read = excl | LOCK_USAGE_READ_MASK; if (excl_mask & lock_flag(excl)) { *bitp = bit; *excl_bitp = excl; return 0; + } else if (excl_mask & lock_flag(excl_read)) { + *bitp = bit; + *excl_bitp = excl_read; + return 0; } } return -1; @@ -2247,17 +2617,16 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, struct lock_list *target_entry1; struct lock_list *target_entry; struct lock_list this, that; - int ret; + enum bfs_result ret; /* * Step 1: gather all hard/soft IRQs usages backward in an * accumulated usage mask. */ - this.parent = NULL; - this.class = hlock_class(prev); + bfs_init_rootb(&this, prev); ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); - if (ret < 0) { + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } @@ -2272,16 +2641,15 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, */ forward_mask = exclusive_mask(usage_mask); - that.parent = NULL; - that.class = hlock_class(next); + bfs_init_root(&that, next); ret = find_usage_forwards(&that, forward_mask, &target_entry1); - if (ret < 0) { + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } - if (ret == 1) - return ret; + if (ret == BFS_RNOMATCH) + return 1; /* * Step 3: we found a bad match! Now retrieve a lock from the backward @@ -2291,11 +2659,11 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, backward_mask = original_mask(target_entry1->class->usage_mask); ret = find_usage_backwards(&this, backward_mask, &target_entry); - if (ret < 0) { + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } - if (DEBUG_LOCKS_WARN_ON(ret == 1)) + if (DEBUG_LOCKS_WARN_ON(ret == BFS_RNOMATCH)) return 1; /* @@ -2459,11 +2827,11 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, + struct held_lock *next, u16 distance, struct lock_trace **const trace) { struct lock_list *entry; - int ret; + enum bfs_result ret; if (!hlock_class(prev)->key || !hlock_class(next)->key) { /* @@ -2494,23 +2862,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * in the graph whose neighbours are to be checked. */ ret = check_noncircular(next, prev, trace); - if (unlikely(ret <= 0)) + if (unlikely(bfs_error(ret) || ret == BFS_RMATCH)) return 0; if (!check_irq_usage(curr, prev, next)) return 0; /* - * For recursive read-locks we do all the dependency checks, - * but we dont store read-triggered dependencies (only - * write-triggered dependencies). This ensures that only the - * write-side dependencies matter, and that if for example a - * write-lock never takes any other locks, then the reads are - * equivalent to a NOP. - */ - if (next->read == 2 || prev->read == 2) - return 1; - /* * Is the <prev> -> <next> dependency already present? * * (this may occur even though this is a new chain: consider @@ -2522,7 +2880,35 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, if (entry->class == hlock_class(next)) { if (distance == 1) entry->distance = 1; - return 1; + entry->dep |= calc_dep(prev, next); + + /* + * Also, update the reverse dependency in @next's + * ->locks_before list. + * + * Here we reuse @entry as the cursor, which is fine + * because we won't go to the next iteration of the + * outer loop: + * + * For normal cases, we return in the inner loop. + * + * If we fail to return, we have inconsistency, i.e. + * <prev>::locks_after contains <next> while + * <next>::locks_before doesn't contain <prev>. In + * that case, we return after the inner and indicate + * something is wrong. + */ + list_for_each_entry(entry, &hlock_class(next)->locks_before, entry) { + if (entry->class == hlock_class(prev)) { + if (distance == 1) + entry->distance = 1; + entry->dep |= calc_depb(prev, next); + return 1; + } + } + + /* <prev> is not found in <next>::locks_before */ + return 0; } } @@ -2531,8 +2917,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Is the <prev> -> <next> link redundant? */ ret = check_redundant(prev, next); - if (ret != 1) - return ret; + if (bfs_error(ret)) + return 0; + else if (ret == BFS_RMATCH) + return 2; #endif if (!*trace) { @@ -2547,14 +2935,18 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ ret = add_lock_to_list(hlock_class(next), hlock_class(prev), &hlock_class(prev)->locks_after, - next->acquire_ip, distance, *trace); + next->acquire_ip, distance, + calc_dep(prev, next), + *trace); if (!ret) return 0; ret = add_lock_to_list(hlock_class(prev), hlock_class(next), &hlock_class(next)->locks_before, - next->acquire_ip, distance, *trace); + next->acquire_ip, distance, + calc_depb(prev, next), + *trace); if (!ret) return 0; @@ -2590,16 +2982,11 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) goto out_bug; for (;;) { - int distance = curr->lockdep_depth - depth + 1; + u16 distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth - 1; - /* - * Only non-recursive-read entries get new dependencies - * added: - */ - if (hlock->read != 2 && hlock->check) { - int ret = check_prev_add(curr, hlock, next, distance, - &trace); + if (hlock->check) { + int ret = check_prev_add(curr, hlock, next, distance, &trace); if (!ret) return 0; @@ -2875,7 +3262,10 @@ static inline void free_chain_hlocks(int base, int size) struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) { - return lock_classes + chain_hlocks[chain->base + i]; + u16 chain_hlock = chain_hlocks[chain->base + i]; + unsigned int class_idx = chain_hlock_class_idx(chain_hlock); + + return lock_classes + class_idx - 1; } /* @@ -2901,12 +3291,12 @@ static inline int get_first_held_lock(struct task_struct *curr, /* * Returns the next chain_key iteration */ -static u64 print_chain_key_iteration(int class_idx, u64 chain_key) +static u64 print_chain_key_iteration(u16 hlock_id, u64 chain_key) { - u64 new_chain_key = iterate_chain_key(chain_key, class_idx); + u64 new_chain_key = iterate_chain_key(chain_key, hlock_id); - printk(" class_idx:%d -> chain_key:%016Lx", - class_idx, + printk(" hlock_id:%d -> chain_key:%016Lx", + (unsigned int)hlock_id, (unsigned long long)new_chain_key); return new_chain_key; } @@ -2923,12 +3313,12 @@ print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_ne hlock_next->irq_context); for (; i < depth; i++) { hlock = curr->held_locks + i; - chain_key = print_chain_key_iteration(hlock->class_idx, chain_key); + chain_key = print_chain_key_iteration(hlock_id(hlock), chain_key); print_lock(hlock); } - print_chain_key_iteration(hlock_next->class_idx, chain_key); + print_chain_key_iteration(hlock_id(hlock_next), chain_key); print_lock(hlock_next); } @@ -2936,14 +3326,14 @@ static void print_chain_keys_chain(struct lock_chain *chain) { int i; u64 chain_key = INITIAL_CHAIN_KEY; - int class_id; + u16 hlock_id; printk("depth: %u\n", chain->depth); for (i = 0; i < chain->depth; i++) { - class_id = chain_hlocks[chain->base + i]; - chain_key = print_chain_key_iteration(class_id, chain_key); + hlock_id = chain_hlocks[chain->base + i]; + chain_key = print_chain_key_iteration(hlock_id, chain_key); - print_lock_name(lock_classes + class_id); + print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id) - 1); printk("\n"); } } @@ -2992,7 +3382,7 @@ static int check_no_collision(struct task_struct *curr, } for (j = 0; j < chain->depth - 1; j++, i++) { - id = curr->held_locks[i].class_idx; + id = hlock_id(&curr->held_locks[i]); if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) { print_collision(curr, hlock, chain); @@ -3041,7 +3431,6 @@ static inline int add_chain_cache(struct task_struct *curr, struct held_lock *hlock, u64 chain_key) { - struct lock_class *class = hlock_class(hlock); struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; int i, j; @@ -3084,11 +3473,11 @@ static inline int add_chain_cache(struct task_struct *curr, chain->base = j; for (j = 0; j < chain->depth - 1; j++, i++) { - int lock_id = curr->held_locks[i].class_idx; + int lock_id = hlock_id(curr->held_locks + i); chain_hlocks[chain->base + j] = lock_id; } - chain_hlocks[chain->base + j] = class - lock_classes; + chain_hlocks[chain->base + j] = hlock_id(hlock); hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); inc_chains(chain->irq_context); @@ -3275,7 +3664,7 @@ static void check_chain_key(struct task_struct *curr) if (prev_hlock && (prev_hlock->irq_context != hlock->irq_context)) chain_key = INITIAL_CHAIN_KEY; - chain_key = iterate_chain_key(chain_key, hlock->class_idx); + chain_key = iterate_chain_key(chain_key, hlock_id(hlock)); prev_hlock = hlock; } if (chain_key != curr->curr_chain_key) { @@ -3434,24 +3823,32 @@ print_irq_inversion_bug(struct task_struct *curr, */ static int check_usage_forwards(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit bit, const char *irqclass) + enum lock_usage_bit bit) { - int ret; + enum bfs_result ret; struct lock_list root; struct lock_list *target_entry; + enum lock_usage_bit read_bit = bit + LOCK_USAGE_READ_MASK; + unsigned usage_mask = lock_flag(bit) | lock_flag(read_bit); - root.parent = NULL; - root.class = hlock_class(this); - ret = find_usage_forwards(&root, lock_flag(bit), &target_entry); - if (ret < 0) { + bfs_init_root(&root, this); + ret = find_usage_forwards(&root, usage_mask, &target_entry); + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } - if (ret == 1) - return ret; + if (ret == BFS_RNOMATCH) + return 1; + + /* Check whether write or read usage is the match */ + if (target_entry->class->usage_mask & lock_flag(bit)) { + print_irq_inversion_bug(curr, &root, target_entry, + this, 1, state_name(bit)); + } else { + print_irq_inversion_bug(curr, &root, target_entry, + this, 1, state_name(read_bit)); + } - print_irq_inversion_bug(curr, &root, target_entry, - this, 1, irqclass); return 0; } @@ -3461,24 +3858,32 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, */ static int check_usage_backwards(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit bit, const char *irqclass) + enum lock_usage_bit bit) { - int ret; + enum bfs_result ret; struct lock_list root; struct lock_list *target_entry; + enum lock_usage_bit read_bit = bit + LOCK_USAGE_READ_MASK; + unsigned usage_mask = lock_flag(bit) | lock_flag(read_bit); - root.parent = NULL; - root.class = hlock_class(this); - ret = find_usage_backwards(&root, lock_flag(bit), &target_entry); - if (ret < 0) { + bfs_init_rootb(&root, this); + ret = find_usage_backwards(&root, usage_mask, &target_entry); + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } - if (ret == 1) - return ret; + if (ret == BFS_RNOMATCH) + return 1; + + /* Check whether write or read usage is the match */ + if (target_entry->class->usage_mask & lock_flag(bit)) { + print_irq_inversion_bug(curr, &root, target_entry, + this, 0, state_name(bit)); + } else { + print_irq_inversion_bug(curr, &root, target_entry, + this, 0, state_name(read_bit)); + } - print_irq_inversion_bug(curr, &root, target_entry, - this, 0, irqclass); return 0; } @@ -3517,8 +3922,6 @@ static int SOFTIRQ_verbose(struct lock_class *class) return 0; } -#define STRICT_READ_CHECKS 1 - static int (*state_verbose_f[])(struct lock_class *class) = { #define LOCKDEP_STATE(__STATE) \ __STATE##_verbose, @@ -3544,16 +3947,6 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, int dir = new_bit & LOCK_USAGE_DIR_MASK; /* - * mark USED_IN has to look forwards -- to ensure no dependency - * has ENABLED state, which would allow recursion deadlocks. - * - * mark ENABLED has to look backwards -- to ensure no dependee - * has USED_IN state, which, again, would allow recursion deadlocks. - */ - check_usage_f usage = dir ? - check_usage_backwards : check_usage_forwards; - - /* * Validate that this particular lock does not have conflicting * usage states. */ @@ -3561,23 +3954,30 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, return 0; /* - * Validate that the lock dependencies don't have conflicting usage - * states. + * Check for read in write conflicts */ - if ((!read || STRICT_READ_CHECKS) && - !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK))) + if (!read && !valid_state(curr, this, new_bit, + excl_bit + LOCK_USAGE_READ_MASK)) return 0; + /* - * Check for read in write conflicts + * Validate that the lock dependencies don't have conflicting usage + * states. */ - if (!read) { - if (!valid_state(curr, this, new_bit, excl_bit + LOCK_USAGE_READ_MASK)) + if (dir) { + /* + * mark ENABLED has to look backwards -- to ensure no dependee + * has USED_IN state, which, again, would allow recursion deadlocks. + */ + if (!check_usage_backwards(curr, this, excl_bit)) return 0; - - if (STRICT_READ_CHECKS && - !usage(curr, this, excl_bit + LOCK_USAGE_READ_MASK, - state_name(new_bit + LOCK_USAGE_READ_MASK))) + } else { + /* + * mark USED_IN has to look forwards -- to ensure no dependency + * has ENABLED state, which would allow recursion deadlocks. + */ + if (!check_usage_forwards(curr, this, excl_bit)) return 0; } @@ -3657,7 +4057,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) if (unlikely(in_nmi())) return; - if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) + if (unlikely(this_cpu_read(lockdep_recursion))) return; if (unlikely(lockdep_hardirqs_enabled())) { @@ -3693,7 +4093,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) current->hardirq_chain_key = current->curr_chain_key; - current->lockdep_recursion++; + lockdep_recursion_inc(); __trace_hardirqs_on_caller(); lockdep_recursion_finish(); } @@ -3726,7 +4126,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) goto skip_checks; } - if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) + if (unlikely(this_cpu_read(lockdep_recursion))) return; if (lockdep_hardirqs_enabled()) { @@ -3756,7 +4156,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) skip_checks: /* we'll do an OFF -> ON transition: */ - this_cpu_write(hardirqs_enabled, 1); + __this_cpu_write(hardirqs_enabled, 1); trace->hardirq_enable_ip = ip; trace->hardirq_enable_event = ++trace->irq_events; debug_atomic_inc(hardirqs_on_events); @@ -3779,7 +4179,7 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) if (in_nmi()) { if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI)) return; - } else if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK) + } else if (__this_cpu_read(lockdep_recursion)) return; /* @@ -3795,7 +4195,7 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) /* * We have done an ON -> OFF transition: */ - this_cpu_write(hardirqs_enabled, 0); + __this_cpu_write(hardirqs_enabled, 0); trace->hardirq_disable_ip = ip; trace->hardirq_disable_event = ++trace->irq_events; debug_atomic_inc(hardirqs_off_events); @@ -3812,7 +4212,7 @@ void lockdep_softirqs_on(unsigned long ip) { struct irqtrace_events *trace = ¤t->irqtrace; - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; /* @@ -3827,7 +4227,7 @@ void lockdep_softirqs_on(unsigned long ip) return; } - current->lockdep_recursion++; + lockdep_recursion_inc(); /* * We'll do an OFF -> ON transition: */ @@ -3850,7 +4250,7 @@ void lockdep_softirqs_on(unsigned long ip) */ void lockdep_softirqs_off(unsigned long ip) { - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; /* @@ -3969,13 +4369,18 @@ static int separate_irq_context(struct task_struct *curr, static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit) { - unsigned int new_mask = 1 << new_bit, ret = 1; + unsigned int new_mask, ret = 1; if (new_bit >= LOCK_USAGE_STATES) { DEBUG_LOCKS_WARN_ON(1); return 0; } + if (new_bit == LOCK_USED && this->read) + new_bit = LOCK_USED_READ; + + new_mask = 1 << new_bit; + /* * If already set then do not dirty the cacheline, * nor do any checks: @@ -3988,26 +4393,26 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, /* * Make sure we didn't race: */ - if (unlikely(hlock_class(this)->usage_mask & new_mask)) { - graph_unlock(); - return 1; - } + if (unlikely(hlock_class(this)->usage_mask & new_mask)) + goto unlock; + + if (!hlock_class(this)->usage_mask) + debug_atomic_dec(nr_unused_locks); hlock_class(this)->usage_mask |= new_mask; - if (!(hlock_class(this)->usage_traces[new_bit] = save_trace())) - return 0; + if (new_bit < LOCK_TRACE_STATES) { + if (!(hlock_class(this)->usage_traces[new_bit] = save_trace())) + return 0; + } - switch (new_bit) { - case LOCK_USED: - debug_atomic_dec(nr_unused_locks); - break; - default: + if (new_bit < LOCK_USED) { ret = mark_lock_irq(curr, this, new_bit); if (!ret) return 0; } +unlock: graph_unlock(); /* @@ -4220,11 +4625,11 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, if (subclass) { unsigned long flags; - if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion)) + if (DEBUG_LOCKS_WARN_ON(!lockdep_enabled())) return; raw_local_irq_save(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); register_lock_class(lock, subclass, 1); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -4411,7 +4816,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, chain_key = INITIAL_CHAIN_KEY; chain_head = 1; } - chain_key = iterate_chain_key(chain_key, class_idx); + chain_key = iterate_chain_key(chain_key, hlock_id(hlock)); if (nest_lock && !__lock_is_held(nest_lock, -1)) { print_lock_nested_lock_not_held(curr, hlock, ip); @@ -4907,11 +5312,11 @@ void lock_set_class(struct lockdep_map *lock, const char *name, { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); check_flags(flags); if (__lock_set_class(lock, name, key, subclass, ip)) check_chain_key(current); @@ -4924,11 +5329,11 @@ void lock_downgrade(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); check_flags(flags); if (__lock_downgrade(lock, ip)) check_chain_key(current); @@ -4942,12 +5347,20 @@ static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock { #ifdef CONFIG_PROVE_LOCKING struct lock_class *class = look_up_lock_class(lock, subclass); + unsigned long mask = LOCKF_USED; /* if it doesn't have a class (yet), it certainly hasn't been used yet */ if (!class) return; - if (!(class->usage_mask & LOCK_USED)) + /* + * READ locks only conflict with USED, such that if we only ever use + * READ locks, there is no deadlock possible -- RCU. + */ + if (!hlock->read) + mask |= LOCKF_USED_READ; + + if (!(class->usage_mask & mask)) return; hlock->class_idx = class - lock_classes; @@ -4958,7 +5371,7 @@ static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock static bool lockdep_nmi(void) { - if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK) + if (raw_cpu_read(lockdep_recursion)) return false; if (!in_nmi()) @@ -4968,6 +5381,20 @@ static bool lockdep_nmi(void) } /* + * read_lock() is recursive if: + * 1. We force lockdep think this way in selftests or + * 2. The implementation is not queued read/write lock or + * 3. The locker is at an in_interrupt() context. + */ +bool read_lock_is_recursive(void) +{ + return force_read_lock_recursive || + !IS_ENABLED(CONFIG_QUEUED_RWLOCKS) || + in_interrupt(); +} +EXPORT_SYMBOL_GPL(read_lock_is_recursive); + +/* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: */ @@ -4977,7 +5404,12 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, { unsigned long flags; - if (unlikely(current->lockdep_recursion)) { + trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); + + if (!debug_locks) + return; + + if (unlikely(!lockdep_enabled())) { /* XXX allow trylock from NMI ?!? */ if (lockdep_nmi() && !trylock) { struct held_lock hlock; @@ -5000,8 +5432,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; - trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); + lockdep_recursion_inc(); __lock_acquire(lock, subclass, trylock, read, check, irqs_disabled_flags(flags), nest_lock, ip, 0, 0); lockdep_recursion_finish(); @@ -5013,13 +5444,15 @@ void lock_release(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + trace_lock_release(lock, ip); + + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; - trace_lock_release(lock, ip); + + lockdep_recursion_inc(); if (__lock_release(lock, ip)) check_chain_key(current); lockdep_recursion_finish(); @@ -5032,13 +5465,13 @@ noinstr int lock_is_held_type(const struct lockdep_map *lock, int read) unsigned long flags; int ret = 0; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return 1; /* avoid false negative lockdep_assert_held() */ raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); ret = __lock_is_held(lock, read); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5053,13 +5486,13 @@ struct pin_cookie lock_pin_lock(struct lockdep_map *lock) struct pin_cookie cookie = NIL_COOKIE; unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return cookie; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); cookie = __lock_pin_lock(lock); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5072,13 +5505,13 @@ void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); __lock_repin_lock(lock, cookie); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5089,13 +5522,13 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); __lock_unpin_lock(lock, cookie); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5205,8 +5638,6 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) hlock->holdtime_stamp = now; } - trace_lock_acquired(lock, ip); - stats = get_lock_stats(hlock_class(hlock)); if (waittime) { if (hlock->read) @@ -5225,16 +5656,14 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - if (unlikely(!lock_stat || !debug_locks)) - return; + trace_lock_acquired(lock, ip); - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lock_stat || !lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; - trace_lock_contended(lock, ip); + lockdep_recursion_inc(); __lock_contended(lock, ip); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5245,15 +5674,14 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - if (unlikely(!lock_stat || !debug_locks)) - return; + trace_lock_contended(lock, ip); - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lock_stat || !lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); __lock_acquired(lock, ip); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5292,7 +5720,7 @@ static void remove_class_from_lock_chain(struct pending_free *pf, int i; for (i = chain->base; i < chain->base + chain->depth; i++) { - if (chain_hlocks[i] != class - lock_classes) + if (chain_hlock_class_idx(chain_hlocks[i]) != class - lock_classes) continue; /* * Each lock class occurs at most once in a lock chain so once diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index baca699b94e9..de49f9e1c11b 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -19,9 +19,13 @@ enum lock_usage_bit { #include "lockdep_states.h" #undef LOCKDEP_STATE LOCK_USED, - LOCK_USAGE_STATES + LOCK_USED_READ, + LOCK_USAGE_STATES, }; +/* states after LOCK_USED_READ are not traced and printed */ +static_assert(LOCK_TRACE_STATES == LOCK_USAGE_STATES); + #define LOCK_USAGE_READ_MASK 1 #define LOCK_USAGE_DIR_MASK 2 #define LOCK_USAGE_STATE_MASK (~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK)) @@ -40,6 +44,7 @@ enum { #include "lockdep_states.h" #undef LOCKDEP_STATE __LOCKF(USED) + __LOCKF(USED_READ) }; #define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE | @@ -119,7 +124,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; -#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2) +#define LOCK_USAGE_CHARS (2*XXX_LOCK_USAGE_STATES + 1) extern void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 9cfa5e89cff7..62d215b2e39f 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -566,7 +566,7 @@ static struct lock_torture_ops rwsem_lock_ops = { #include <linux/percpu-rwsem.h> static struct percpu_rw_semaphore pcpu_rwsem; -void torture_percpu_rwsem_init(void) +static void torture_percpu_rwsem_init(void) { BUG_ON(percpu_init_rwsem(&pcpu_rwsem)); } diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 8bbafe3e5203..70a32a576f3f 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -45,7 +45,7 @@ EXPORT_SYMBOL_GPL(percpu_free_rwsem); static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem) { - __this_cpu_inc(*sem->read_count); + this_cpu_inc(*sem->read_count); /* * Due to having preemption disabled the decrement happens on @@ -71,7 +71,7 @@ static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem) if (likely(!atomic_read_acquire(&sem->block))) return true; - __this_cpu_dec(*sem->read_count); + this_cpu_dec(*sem->read_count); /* Prod writer to re-evaluate readers_active_check() */ rcuwait_wake_up(&sem->writer); diff --git a/kernel/module.c b/kernel/module.c index 1c5cff34d9f2..a4fa44a652a7 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -18,6 +18,7 @@ #include <linux/fs.h> #include <linux/sysfs.h> #include <linux/kernel.h> +#include <linux/kernel_read_file.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/elf.h> @@ -91,8 +92,9 @@ EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); /* Work queue for freeing init sections in success case */ -static struct work_struct init_free_wq; -static struct llist_head init_free_list; +static void do_free_init(struct work_struct *w); +static DECLARE_WORK(init_free_wq, do_free_init); +static LLIST_HEAD(init_free_list); #ifdef CONFIG_MODULES_TREE_LOOKUP @@ -2096,8 +2098,11 @@ static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, int i; for (i = 0; i < hdr->e_shnum; i++) { - if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) + if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) { + pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n", + mod->name, secstrings + sechdrs[i].sh_name, i); return -ENOEXEC; + } } return 0; @@ -3013,7 +3018,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, if (info->len < sizeof(*(info->hdr))) return -ENOEXEC; - err = security_kernel_load_data(LOADING_MODULE); + err = security_kernel_load_data(LOADING_MODULE, true); if (err) return err; @@ -3023,11 +3028,17 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, return -ENOMEM; if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) { - vfree(info->hdr); - return -EFAULT; + err = -EFAULT; + goto out; } - return 0; + err = security_kernel_post_load_data((char *)info->hdr, info->len, + LOADING_MODULE, "init_module"); +out: + if (err) + vfree(info->hdr); + + return err; } static void free_copy(struct load_info *info) @@ -3275,6 +3286,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(unsigned long), &mod->num_kprobe_blacklist); #endif +#ifdef CONFIG_HAVE_STATIC_CALL_INLINE + mod->static_call_sites = section_objs(info, ".static_call_sites", + sizeof(*mod->static_call_sites), + &mod->num_static_call_sites); +#endif mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); @@ -3579,14 +3595,6 @@ static void do_free_init(struct work_struct *w) } } -static int __init modules_wq_init(void) -{ - INIT_WORK(&init_free_wq, do_free_init); - init_llist_head(&init_free_list); - return 0; -} -module_init(modules_wq_init); - /* * This is where the real work happens. * @@ -3792,9 +3800,13 @@ static int prepare_coming_module(struct module *mod) if (err) return err; - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_COMING, mod); - return 0; + err = blocking_notifier_call_chain_robust(&module_notify_list, + MODULE_STATE_COMING, MODULE_STATE_GOING, mod); + err = notifier_to_errno(err); + if (err) + klp_module_going(mod); + + return err; } static int unknown_module_param_cb(char *param, char *val, const char *modname, @@ -3825,8 +3837,10 @@ static int load_module(struct load_info *info, const char __user *uargs, char *after_dashes; err = elf_header_check(info); - if (err) + if (err) { + pr_err("Module has invalid ELF header\n"); goto free_copy; + } err = setup_load_info(info, flags); if (err) @@ -3834,6 +3848,7 @@ static int load_module(struct load_info *info, const char __user *uargs, if (blacklisted(info->name)) { err = -EPERM; + pr_err("Module %s is blacklisted\n", info->name); goto free_copy; } @@ -4034,8 +4049,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) { struct load_info info = { }; - loff_t size; - void *hdr; + void *hdr = NULL; int err; err = may_init_module(); @@ -4048,12 +4062,12 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) |MODULE_INIT_IGNORE_VERMAGIC)) return -EINVAL; - err = kernel_read_file_from_fd(fd, &hdr, &size, INT_MAX, + err = kernel_read_file_from_fd(fd, 0, &hdr, INT_MAX, NULL, READING_MODULE); - if (err) + if (err < 0) return err; info.hdr = hdr; - info.len = size; + info.len = err; return load_module(&info, uargs, flags); } diff --git a/kernel/notifier.c b/kernel/notifier.c index 84c987dfbe03..1b019cbca594 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -94,6 +94,34 @@ static int notifier_call_chain(struct notifier_block **nl, } NOKPROBE_SYMBOL(notifier_call_chain); +/** + * notifier_call_chain_robust - Inform the registered notifiers about an event + * and rollback on error. + * @nl: Pointer to head of the blocking notifier chain + * @val_up: Value passed unmodified to the notifier function + * @val_down: Value passed unmodified to the notifier function when recovering + * from an error on @val_up + * @v Pointer passed unmodified to the notifier function + * + * NOTE: It is important the @nl chain doesn't change between the two + * invocations of notifier_call_chain() such that we visit the + * exact same notifier callbacks; this rules out any RCU usage. + * + * Returns: the return value of the @val_up call. + */ +static int notifier_call_chain_robust(struct notifier_block **nl, + unsigned long val_up, unsigned long val_down, + void *v) +{ + int ret, nr = 0; + + ret = notifier_call_chain(nl, val_up, v, -1, &nr); + if (ret & NOTIFY_STOP_MASK) + notifier_call_chain(nl, val_down, v, nr-1, NULL); + + return ret; +} + /* * Atomic notifier chain routines. Registration and unregistration * use a spinlock, and call_chain is synchronized by RCU (no locks). @@ -144,13 +172,30 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, } EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); +int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh, + unsigned long val_up, unsigned long val_down, void *v) +{ + unsigned long flags; + int ret; + + /* + * Musn't use RCU; because then the notifier list can + * change between the up and down traversal. + */ + spin_lock_irqsave(&nh->lock, flags); + ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v); + spin_unlock_irqrestore(&nh->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(atomic_notifier_call_chain_robust); +NOKPROBE_SYMBOL(atomic_notifier_call_chain_robust); + /** - * __atomic_notifier_call_chain - Call functions in an atomic notifier chain + * atomic_notifier_call_chain - Call functions in an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See the comment for notifier_call_chain. - * @nr_calls: See the comment for notifier_call_chain. * * Calls each function in a notifier chain in turn. The functions * run in an atomic context, so they must not block. @@ -163,24 +208,16 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); * Otherwise the return value is the return value * of the last notifier function called. */ -int __atomic_notifier_call_chain(struct atomic_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) +int atomic_notifier_call_chain(struct atomic_notifier_head *nh, + unsigned long val, void *v) { int ret; rcu_read_lock(); - ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); + ret = notifier_call_chain(&nh->head, val, v, -1, NULL); rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); -NOKPROBE_SYMBOL(__atomic_notifier_call_chain); -int atomic_notifier_call_chain(struct atomic_notifier_head *nh, - unsigned long val, void *v) -{ - return __atomic_notifier_call_chain(nh, val, v, -1, NULL); + return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); NOKPROBE_SYMBOL(atomic_notifier_call_chain); @@ -250,13 +287,30 @@ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, } EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); +int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh, + unsigned long val_up, unsigned long val_down, void *v) +{ + int ret = NOTIFY_DONE; + + /* + * We check the head outside the lock, but if this access is + * racy then it does not matter what the result of the test + * is, we re-check the list after having taken the lock anyway: + */ + if (rcu_access_pointer(nh->head)) { + down_read(&nh->rwsem); + ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v); + up_read(&nh->rwsem); + } + return ret; +} +EXPORT_SYMBOL_GPL(blocking_notifier_call_chain_robust); + /** - * __blocking_notifier_call_chain - Call functions in a blocking notifier chain + * blocking_notifier_call_chain - Call functions in a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See comment for notifier_call_chain. - * @nr_calls: See comment for notifier_call_chain. * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. @@ -268,9 +322,8 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); * Otherwise the return value is the return value * of the last notifier function called. */ -int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) +int blocking_notifier_call_chain(struct blocking_notifier_head *nh, + unsigned long val, void *v) { int ret = NOTIFY_DONE; @@ -281,19 +334,11 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, */ if (rcu_access_pointer(nh->head)) { down_read(&nh->rwsem); - ret = notifier_call_chain(&nh->head, val, v, nr_to_call, - nr_calls); + ret = notifier_call_chain(&nh->head, val, v, -1, NULL); up_read(&nh->rwsem); } return ret; } -EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain); - -int blocking_notifier_call_chain(struct blocking_notifier_head *nh, - unsigned long val, void *v) -{ - return __blocking_notifier_call_chain(nh, val, v, -1, NULL); -} EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); /* @@ -335,13 +380,18 @@ int raw_notifier_chain_unregister(struct raw_notifier_head *nh, } EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); +int raw_notifier_call_chain_robust(struct raw_notifier_head *nh, + unsigned long val_up, unsigned long val_down, void *v) +{ + return notifier_call_chain_robust(&nh->head, val_up, val_down, v); +} +EXPORT_SYMBOL_GPL(raw_notifier_call_chain_robust); + /** - * __raw_notifier_call_chain - Call functions in a raw notifier chain + * raw_notifier_call_chain - Call functions in a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See comment for notifier_call_chain. - * @nr_calls: See comment for notifier_call_chain * * Calls each function in a notifier chain in turn. The functions * run in an undefined context. @@ -354,18 +404,10 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); * Otherwise the return value is the return value * of the last notifier function called. */ -int __raw_notifier_call_chain(struct raw_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) -{ - return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); -} -EXPORT_SYMBOL_GPL(__raw_notifier_call_chain); - int raw_notifier_call_chain(struct raw_notifier_head *nh, unsigned long val, void *v) { - return __raw_notifier_call_chain(nh, val, v, -1, NULL); + return notifier_call_chain(&nh->head, val, v, -1, NULL); } EXPORT_SYMBOL_GPL(raw_notifier_call_chain); @@ -437,12 +479,10 @@ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); /** - * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain + * srcu_notifier_call_chain - Call functions in an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See comment for notifier_call_chain. - * @nr_calls: See comment for notifier_call_chain * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. @@ -454,25 +494,17 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); * Otherwise the return value is the return value * of the last notifier function called. */ -int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) +int srcu_notifier_call_chain(struct srcu_notifier_head *nh, + unsigned long val, void *v) { int ret; int idx; idx = srcu_read_lock(&nh->srcu); - ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); + ret = notifier_call_chain(&nh->head, val, v, -1, NULL); srcu_read_unlock(&nh->srcu, idx); return ret; } -EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain); - -int srcu_notifier_call_chain(struct srcu_notifier_head *nh, - unsigned long val, void *v) -{ - return __srcu_notifier_call_chain(nh, val, v, -1, NULL); -} EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); /** diff --git a/kernel/padata.c b/kernel/padata.c index 16cb894dc272..d4d3ba6e1728 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -215,12 +215,13 @@ int padata_do_parallel(struct padata_shell *ps, padata->pd = pd; padata->cb_cpu = *cb_cpu; - rcu_read_unlock_bh(); - spin_lock(&padata_works_lock); padata->seq_nr = ++pd->seq_nr; pw = padata_work_alloc(); spin_unlock(&padata_works_lock); + + rcu_read_unlock_bh(); + if (pw) { padata_work_init(pw, padata_parallel_worker, padata, 0); queue_work(pinst->parallel_wq, &pw->pw_work); diff --git a/kernel/panic.c b/kernel/panic.c index aef8872ba843..396142ee43fd 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -589,6 +589,11 @@ void __warn(const char *file, int line, void *caller, unsigned taint, if (args) vprintk(args->fmt, args->args); + print_modules(); + + if (regs) + show_regs(regs); + if (panic_on_warn) { /* * This thread may hit another WARN() in the panic path. @@ -600,12 +605,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, panic("panic_on_warn set ...\n"); } - print_modules(); - - if (regs) - show_regs(regs); - else - dump_stack(); + dump_stack(); print_irqtrace_events(current); diff --git a/kernel/params.c b/kernel/params.c index 8e56f8b12d8f..164d79330849 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -233,14 +233,15 @@ char *parse_args(const char *doing, EXPORT_SYMBOL(param_ops_##name) -STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); -STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); -STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); -STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); -STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); -STANDARD_PARAM_DEF(long, long, "%li", kstrtol); -STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); -STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); +STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); +STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); +STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); +STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); +STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); +STANDARD_PARAM_DEF(long, long, "%li", kstrtol); +STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); +STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); +STANDARD_PARAM_DEF(hexint, unsigned int, "%#08x", kstrtouint); int param_set_charp(const char *val, const struct kernel_param *kp) { @@ -529,7 +530,7 @@ struct module_param_attrs { unsigned int num; struct attribute_group grp; - struct param_attribute attrs[0]; + struct param_attribute attrs[]; }; #ifdef CONFIG_SYSFS diff --git a/kernel/pid.c b/kernel/pid.c index b2562a7ce525..a96bc4bf4f86 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -43,6 +43,7 @@ #include <linux/sched/task.h> #include <linux/idr.h> #include <net/sock.h> +#include <uapi/linux/pidfd.h> struct pid init_struct_pid = { .count = REFCOUNT_INIT(1), @@ -519,10 +520,30 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) return idr_get_next(&ns->idr, &nr); } +struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) +{ + struct fd f; + struct pid *pid; + + f = fdget(fd); + if (!f.file) + return ERR_PTR(-EBADF); + + pid = pidfd_pid(f.file); + if (!IS_ERR(pid)) { + get_pid(pid); + *flags = f.file->f_flags; + } + + fdput(f); + return pid; +} + /** * pidfd_create() - Create a new pid file descriptor. * - * @pid: struct pid that the pidfd will reference + * @pid: struct pid that the pidfd will reference + * @flags: flags to pass * * This creates a new pid file descriptor with the O_CLOEXEC flag set. * @@ -532,12 +553,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. */ -static int pidfd_create(struct pid *pid) +static int pidfd_create(struct pid *pid, unsigned int flags) { int fd; fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), - O_RDWR | O_CLOEXEC); + flags | O_RDWR | O_CLOEXEC); if (fd < 0) put_pid(pid); @@ -565,7 +586,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) int fd; struct pid *p; - if (flags) + if (flags & ~PIDFD_NONBLOCK) return -EINVAL; if (pid <= 0) @@ -576,7 +597,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) return -ESRCH; if (pid_has_task(p, PIDTYPE_TGID)) - fd = pidfd_create(p); + fd = pidfd_create(p, flags); else fd = -EINVAL; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ac135bd600eb..9de21803a8ae 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -233,7 +233,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * to pid_ns->child_reaper. Thus pidns->child_reaper needs to * stay valid until they all go away. * - * The code relies on the the pid_ns->child_reaper ignoring + * The code relies on the pid_ns->child_reaper ignoring * SIGCHILD to cause those EXIT_ZOMBIE processes to be * autoreaped if reparented. * diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index f33769f97aca..2fc7d509a34f 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -659,7 +659,7 @@ static void power_down(void) break; case HIBERNATION_PLATFORM: hibernation_platform_enter(); - /* Fall through */ + fallthrough; case HIBERNATION_SHUTDOWN: if (pm_power_off) kernel_power_off(); @@ -706,8 +706,8 @@ static int load_image_and_restore(void) */ int hibernate(void) { - int error, nr_calls = 0; bool snapshot_test = false; + int error; if (!hibernation_available()) { pm_pr_dbg("Hibernation not available.\n"); @@ -723,11 +723,9 @@ int hibernate(void) pr_info("hibernation entry\n"); pm_prepare_console(); - error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); - if (error) { - nr_calls--; - goto Exit; - } + error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); + if (error) + goto Restore; ksys_sync_helper(); @@ -785,7 +783,8 @@ int hibernate(void) /* Don't bother checking whether freezer_test_done is true */ freezer_test_done = false; Exit: - __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL); + pm_notifier_call_chain(PM_POST_HIBERNATION); + Restore: pm_restore_console(); hibernate_release(); Unlock: @@ -804,7 +803,7 @@ int hibernate(void) */ int hibernate_quiet_exec(int (*func)(void *data), void *data) { - int error, nr_calls = 0; + int error; lock_system_sleep(); @@ -815,11 +814,9 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data) pm_prepare_console(); - error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); - if (error) { - nr_calls--; - goto exit; - } + error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); + if (error) + goto restore; error = freeze_processes(); if (error) @@ -880,8 +877,9 @@ thaw: thaw_processes(); exit: - __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL); + pm_notifier_call_chain(PM_POST_HIBERNATION); +restore: pm_restore_console(); hibernate_release(); @@ -910,7 +908,7 @@ EXPORT_SYMBOL_GPL(hibernate_quiet_exec); */ static int software_resume(void) { - int error, nr_calls = 0; + int error; /* * If the user said "noresume".. bail out early. @@ -948,17 +946,6 @@ static int software_resume(void) /* Check if the device is there */ swsusp_resume_device = name_to_dev_t(resume_file); - - /* - * name_to_dev_t is ineffective to verify parition if resume_file is in - * integer format. (e.g. major:minor) - */ - if (isdigit(resume_file[0]) && resume_wait) { - int partno; - while (!get_gendisk(swsusp_resume_device, &partno)) - msleep(10); - } - if (!swsusp_resume_device) { /* * Some device discovery might still be in progress; we need @@ -997,11 +984,9 @@ static int software_resume(void) pr_info("resume from hibernation\n"); pm_prepare_console(); - error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls); - if (error) { - nr_calls--; - goto Close_Finish; - } + error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE); + if (error) + goto Restore; pm_pr_dbg("Preparing processes for hibernation restore.\n"); error = freeze_processes(); @@ -1017,7 +1002,8 @@ static int software_resume(void) error = load_image_and_restore(); thaw_processes(); Finish: - __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); + pm_notifier_call_chain(PM_POST_RESTORE); + Restore: pm_restore_console(); pr_info("resume failed (%d)\n", error); hibernate_release(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 40f86ec4ab30..0aefd6f57e0a 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -80,18 +80,18 @@ int unregister_pm_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_pm_notifier); -int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls) +int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down) { int ret; - ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL, - nr_to_call, nr_calls); + ret = blocking_notifier_call_chain_robust(&pm_chain_head, val_up, val_down, NULL); return notifier_to_errno(ret); } + int pm_notifier_call_chain(unsigned long val) { - return __pm_notifier_call_chain(val, -1, NULL); + return blocking_notifier_call_chain(&pm_chain_head, val, NULL); } /* If set, devices may be suspended and resumed asynchronously. */ diff --git a/kernel/power/power.h b/kernel/power/power.h index 32fc89ac96c3..24f12d534515 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -210,8 +210,7 @@ static inline void suspend_test_finish(const char *label) {} #ifdef CONFIG_PM_SLEEP /* kernel/power/main.c */ -extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call, - int *nr_calls); +extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down); extern int pm_notifier_call_chain(unsigned long val); #endif diff --git a/kernel/power/process.c b/kernel/power/process.c index 4b6a54da7e65..45b054b7b5ec 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -146,7 +146,7 @@ int freeze_processes(void) BUG_ON(in_atomic()); /* - * Now that the whole userspace is frozen we need to disbale + * Now that the whole userspace is frozen we need to disable * the OOM killer to disallow any further interference with * killable tasks. There is no guarantee oom victims will * ever reach a point they go away we have to wait with a timeout. diff --git a/kernel/power/qos.c b/kernel/power/qos.c index db0bed2cae26..ec7e1e85923e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -119,7 +119,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, * and add, then see if the aggregate has changed. */ plist_del(node, &c->list); - /* fall through */ + fallthrough; case PM_QOS_ADD_REQ: plist_node_init(node, new_value); plist_add(node, &c->list); @@ -188,7 +188,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf, break; case PM_QOS_UPDATE_REQ: pm_qos_flags_remove_req(pqf, req); - /* fall through */ + fallthrough; case PM_QOS_ADD_REQ: req->flags = val; INIT_LIST_HEAD(&req->node); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d25749bce7cf..46b1804c1ddf 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -735,7 +735,7 @@ zone_found: */ /* - * If the zone we wish to scan is the the current zone and the + * If the zone we wish to scan is the current zone and the * pfn falls into the current node then we do not need to walk * the tree. */ diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 8b1bb5ee7e5d..32391acc806b 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -342,18 +342,16 @@ static int suspend_test(int level) */ static int suspend_prepare(suspend_state_t state) { - int error, nr_calls = 0; + int error; if (!sleep_state_supported(state)) return -EPERM; pm_prepare_console(); - error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls); - if (error) { - nr_calls--; - goto Finish; - } + error = pm_notifier_call_chain_robust(PM_SUSPEND_PREPARE, PM_POST_SUSPEND); + if (error) + goto Restore; trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); @@ -363,8 +361,8 @@ static int suspend_prepare(suspend_state_t state) suspend_stats.failed_freeze++; dpm_save_failed_step(SUSPEND_FREEZE); - Finish: - __pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL); + pm_notifier_call_chain(PM_POST_SUSPEND); + Restore: pm_restore_console(); return error; } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 01e2858b5fe3..c73f2e295167 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -226,6 +226,7 @@ struct hib_bio_batch { atomic_t count; wait_queue_head_t wait; blk_status_t error; + struct blk_plug plug; }; static void hib_init_batch(struct hib_bio_batch *hb) @@ -233,6 +234,12 @@ static void hib_init_batch(struct hib_bio_batch *hb) atomic_set(&hb->count, 0); init_waitqueue_head(&hb->wait); hb->error = BLK_STS_OK; + blk_start_plug(&hb->plug); +} + +static void hib_finish_batch(struct hib_bio_batch *hb) +{ + blk_finish_plug(&hb->plug); } static void hib_end_io(struct bio *bio) @@ -294,6 +301,10 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, static blk_status_t hib_wait_io(struct hib_bio_batch *hb) { + /* + * We are relying on the behavior of blk_plug that a thread with + * a plug will flush the plug list before sleeping. + */ wait_event(hb->wait, atomic_read(&hb->count) == 0); return blk_status_to_errno(hb->error); } @@ -335,26 +346,23 @@ static int swsusp_swap_check(void) { int res; - res = swap_type_of(swsusp_resume_device, swsusp_resume_block, - &hib_resume_bdev); + if (swsusp_resume_device) + res = swap_type_of(swsusp_resume_device, swsusp_resume_block); + else + res = find_first_swap(&swsusp_resume_device); if (res < 0) return res; - root_swap = res; - res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL); - if (res) - return res; + + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, FMODE_WRITE, + NULL); + if (IS_ERR(hib_resume_bdev)) + return PTR_ERR(hib_resume_bdev); res = set_blocksize(hib_resume_bdev, PAGE_SIZE); if (res < 0) blkdev_put(hib_resume_bdev, FMODE_WRITE); - /* - * Update the resume device to the one actually used, - * so the test_resume mode can use it in case it is - * invoked from hibernate() to test the snapshot. - */ - swsusp_resume_device = hib_resume_bdev->bd_dev; return res; } @@ -561,6 +569,7 @@ static int save_image(struct swap_map_handle *handle, nr_pages++; } err2 = hib_wait_io(&hb); + hib_finish_batch(&hb); stop = ktime_get(); if (!ret) ret = err2; @@ -854,6 +863,7 @@ out_finish: pr_info("Image saving done\n"); swsusp_show_speed(start, stop, nr_to_write, "Wrote"); out_clean: + hib_finish_batch(&hb); if (crc) { if (crc->thr) kthread_stop(crc->thr); @@ -1084,6 +1094,7 @@ static int load_image(struct swap_map_handle *handle, nr_pages++; } err2 = hib_wait_io(&hb); + hib_finish_batch(&hb); stop = ktime_get(); if (!ret) ret = err2; @@ -1447,6 +1458,7 @@ out_finish: } swsusp_show_speed(start, stop, nr_to_read, "Read"); out_clean: + hib_finish_batch(&hb); for (i = 0; i < ring_size; i++) free_page((unsigned long)page[i]); if (crc) { diff --git a/kernel/power/user.c b/kernel/power/user.c index d5eedc2baa2a..740723bb3885 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -35,18 +35,18 @@ static struct snapshot_data { bool ready; bool platform_support; bool free_bitmaps; - struct inode *bd_inode; + dev_t dev; } snapshot_state; -int is_hibernate_resume_dev(const struct inode *bd_inode) +int is_hibernate_resume_dev(dev_t dev) { - return hibernation_available() && snapshot_state.bd_inode == bd_inode; + return hibernation_available() && snapshot_state.dev == dev; } static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; - int error, nr_calls = 0; + int error; if (!hibernation_available()) return -EPERM; @@ -69,13 +69,10 @@ static int snapshot_open(struct inode *inode, struct file *filp) memset(&data->handle, 0, sizeof(struct snapshot_handle)); if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { /* Hibernating. The image device should be accessible. */ - data->swap = swsusp_resume_device ? - swap_type_of(swsusp_resume_device, 0, NULL) : -1; + data->swap = swap_type_of(swsusp_resume_device, 0); data->mode = O_RDONLY; data->free_bitmaps = false; - error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); - if (error) - __pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL); + error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); } else { /* * Resuming. We may need to wait for the image device to @@ -85,15 +82,11 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->swap = -1; data->mode = O_WRONLY; - error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls); + error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE); if (!error) { error = create_basic_memory_bitmaps(); data->free_bitmaps = !error; - } else - nr_calls--; - - if (error) - __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); + } } if (error) hibernate_release(); @@ -101,7 +94,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->frozen = false; data->ready = false; data->platform_support = false; - data->bd_inode = NULL; + data->dev = 0; Unlock: unlock_system_sleep(); @@ -117,7 +110,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) swsusp_free(); data = filp->private_data; - data->bd_inode = NULL; + data->dev = 0; free_all_swap_pages(data->swap); if (data->frozen) { pm_restore_gfp_mask(); @@ -210,7 +203,6 @@ struct compat_resume_swap_area { static int snapshot_set_swap_area(struct snapshot_data *data, void __user *argp) { - struct block_device *bdev; sector_t offset; dev_t swdev; @@ -237,16 +229,10 @@ static int snapshot_set_swap_area(struct snapshot_data *data, * User space encodes device types as two-byte values, * so we need to recode them */ - if (!swdev) { - data->swap = -1; - return -EINVAL; - } - data->swap = swap_type_of(swdev, offset, &bdev); + data->swap = swap_type_of(swdev, offset); if (data->swap < 0) - return -ENODEV; - - data->bd_inode = bdev->bd_inode; - bdput(bdev); + return swdev ? -ENODEV : -EINVAL; + data->dev = swdev; return 0; } diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 4d052fc6bcde..eee3dc9b60a9 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -2,3 +2,4 @@ obj-y = printk.o obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o +obj-$(CONFIG_PRINTK) += printk_ringbuffer.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 660f9a6bf73a..3a8fd491758c 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -14,9 +14,9 @@ extern raw_spinlock_t logbuf_lock; -__printf(5, 0) +__printf(4, 0) int vprintk_store(int facility, int level, - const char *dict, size_t dictlen, + const struct dev_printk_info *dev_info, const char *fmt, va_list args); __printf(1, 0) int vprintk_default(const char *fmt, va_list args); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9b75f6bfc333..fe64a49344bf 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -55,6 +55,7 @@ #define CREATE_TRACE_POINTS #include <trace/events/printk.h> +#include "printk_ringbuffer.h" #include "console_cmdline.h" #include "braille.h" #include "internal.h" @@ -294,30 +295,22 @@ enum con_msg_format_flags { static int console_msg_format = MSG_FORMAT_DEFAULT; /* - * The printk log buffer consists of a chain of concatenated variable - * length records. Every record starts with a record header, containing - * the overall length of the record. + * The printk log buffer consists of a sequenced collection of records, each + * containing variable length message text. Every record also contains its + * own meta-data (@info). * - * The heads to the first and last entry in the buffer, as well as the - * sequence numbers of these entries are maintained when messages are - * stored. + * Every record meta-data carries the timestamp in microseconds, as well as + * the standard userspace syslog level and syslog facility. The usual kernel + * messages use LOG_KERN; userspace-injected messages always carry a matching + * syslog facility, by default LOG_USER. The origin of every message can be + * reliably determined that way. * - * If the heads indicate available messages, the length in the header - * tells the start next message. A length == 0 for the next message - * indicates a wrap-around to the beginning of the buffer. + * The human readable log message of a record is available in @text, the + * length of the message text in @text_len. The stored message is not + * terminated. * - * Every record carries the monotonic timestamp in microseconds, as well as - * the standard userspace syslog level and syslog facility. The usual - * kernel messages use LOG_KERN; userspace-injected messages always carry - * a matching syslog facility, by default LOG_USER. The origin of every - * message can be reliably determined that way. - * - * The human readable log message directly follows the message header. The - * length of the message text is stored in the header, the stored message - * is not terminated. - * - * Optionally, a message can carry a dictionary of properties (key/value pairs), - * to provide userspace with a machine-readable message context. + * Optionally, a record can carry a dictionary of properties (key/value + * pairs), to provide userspace with a machine-readable message context. * * Examples for well-defined, commonly used property names are: * DEVICE=b12:8 device identifier @@ -327,25 +320,22 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; * +sound:card0 subsystem:devname * SUBSYSTEM=pci driver-core subsystem name * - * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value - * follows directly after a '=' character. Every property is terminated by - * a '\0' character. The last property is not terminated. - * - * Example of a message structure: - * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec - * 0008 34 00 record is 52 bytes long - * 000a 0b 00 text is 11 bytes long - * 000c 1f 00 dictionary is 23 bytes long - * 000e 03 00 LOG_KERN (facility) LOG_ERR (level) - * 0010 69 74 27 73 20 61 20 6c "it's a l" - * 69 6e 65 "ine" - * 001b 44 45 56 49 43 "DEVIC" - * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D" - * 52 49 56 45 52 3d 62 75 "RIVER=bu" - * 67 "g" - * 0032 00 00 00 padding to next message header - * - * The 'struct printk_log' buffer header must never be directly exported to + * Valid characters in property names are [a-zA-Z0-9.-_]. Property names + * and values are terminated by a '\0' character. + * + * Example of record values: + * record.text_buf = "it's a line" (unterminated) + * record.info.seq = 56 + * record.info.ts_nsec = 36863 + * record.info.text_len = 11 + * record.info.facility = 0 (LOG_KERN) + * record.info.flags = 0 + * record.info.level = 3 (LOG_ERR) + * record.info.caller_id = 299 (task 299) + * record.info.dev_info.subsystem = "pci" (terminated) + * record.info.dev_info.device = "+pci:0000:00:01.0" (terminated) + * + * The 'struct printk_info' buffer must never be directly exported to * userspace, it is a kernel-private implementation detail that might * need to be changed in the future, when the requirements change. * @@ -365,23 +355,6 @@ enum log_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; -struct printk_log { - u64 ts_nsec; /* timestamp in nanoseconds */ - u16 len; /* length of entire record */ - u16 text_len; /* length of text buffer */ - u16 dict_len; /* length of dictionary buffer */ - u8 facility; /* syslog facility */ - u8 flags:5; /* internal record flags */ - u8 level:3; /* syslog level */ -#ifdef CONFIG_PRINTK_CALLER - u32 caller_id; /* thread id or processor id */ -#endif -} -#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS -__packed __aligned(4) -#endif -; - /* * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken * within the scheduler's rq lock. It must be released before calling @@ -421,26 +394,16 @@ DEFINE_RAW_SPINLOCK(logbuf_lock); DECLARE_WAIT_QUEUE_HEAD(log_wait); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; -static u32 syslog_idx; static size_t syslog_partial; static bool syslog_time; -/* index and sequence number of the first record stored in the buffer */ -static u64 log_first_seq; -static u32 log_first_idx; - -/* index and sequence number of the next record to store in the buffer */ -static u64 log_next_seq; -static u32 log_next_idx; - /* the next printk record to write to the console */ static u64 console_seq; -static u32 console_idx; static u64 exclusive_console_stop_seq; +static unsigned long console_dropped; /* the next printk record to read after the last 'clear' command */ static u64 clear_seq; -static u32 clear_idx; #ifdef CONFIG_PRINTK_CALLER #define PREFIX_MAX 48 @@ -453,7 +416,7 @@ static u32 clear_idx; #define LOG_FACILITY(v) ((v) >> 3 & 0xff) /* record buffer */ -#define LOG_ALIGN __alignof__(struct printk_log) +#define LOG_ALIGN __alignof__(unsigned long) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) #define LOG_BUF_LEN_MAX (u32)(1 << 31) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); @@ -461,6 +424,23 @@ static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; /* + * Define the average message size. This only affects the number of + * descriptors that will be available. Underestimating is better than + * overestimating (too many available descriptors is better than not enough). + */ +#define PRB_AVGBITS 5 /* 32 character average length */ + +#if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS +#error CONFIG_LOG_BUF_SHIFT value too small. +#endif +_DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS, + PRB_AVGBITS, &__log_buf[0]); + +static struct printk_ringbuffer printk_rb_dynamic; + +static struct printk_ringbuffer *prb = &printk_rb_static; + +/* * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before * per_cpu_areas are initialised. This variable is set to true when * it's safe to access per-CPU data. @@ -484,108 +464,6 @@ u32 log_buf_len_get(void) return log_buf_len; } -/* human readable text of the record */ -static char *log_text(const struct printk_log *msg) -{ - return (char *)msg + sizeof(struct printk_log); -} - -/* optional key/value pair dictionary attached to the record */ -static char *log_dict(const struct printk_log *msg) -{ - return (char *)msg + sizeof(struct printk_log) + msg->text_len; -} - -/* get record by index; idx must point to valid msg */ -static struct printk_log *log_from_idx(u32 idx) -{ - struct printk_log *msg = (struct printk_log *)(log_buf + idx); - - /* - * A length == 0 record is the end of buffer marker. Wrap around and - * read the message at the start of the buffer. - */ - if (!msg->len) - return (struct printk_log *)log_buf; - return msg; -} - -/* get next record; idx must point to valid msg */ -static u32 log_next(u32 idx) -{ - struct printk_log *msg = (struct printk_log *)(log_buf + idx); - - /* length == 0 indicates the end of the buffer; wrap */ - /* - * A length == 0 record is the end of buffer marker. Wrap around and - * read the message at the start of the buffer as *this* one, and - * return the one after that. - */ - if (!msg->len) { - msg = (struct printk_log *)log_buf; - return msg->len; - } - return idx + msg->len; -} - -/* - * Check whether there is enough free space for the given message. - * - * The same values of first_idx and next_idx mean that the buffer - * is either empty or full. - * - * If the buffer is empty, we must respect the position of the indexes. - * They cannot be reset to the beginning of the buffer. - */ -static int logbuf_has_space(u32 msg_size, bool empty) -{ - u32 free; - - if (log_next_idx > log_first_idx || empty) - free = max(log_buf_len - log_next_idx, log_first_idx); - else - free = log_first_idx - log_next_idx; - - /* - * We need space also for an empty header that signalizes wrapping - * of the buffer. - */ - return free >= msg_size + sizeof(struct printk_log); -} - -static int log_make_free_space(u32 msg_size) -{ - while (log_first_seq < log_next_seq && - !logbuf_has_space(msg_size, false)) { - /* drop old messages until we have enough contiguous space */ - log_first_idx = log_next(log_first_idx); - log_first_seq++; - } - - if (clear_seq < log_first_seq) { - clear_seq = log_first_seq; - clear_idx = log_first_idx; - } - - /* sequence numbers are equal, so the log buffer is empty */ - if (logbuf_has_space(msg_size, log_first_seq == log_next_seq)) - return 0; - - return -ENOMEM; -} - -/* compute the message size including the padding bytes */ -static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) -{ - u32 size; - - size = sizeof(struct printk_log) + text_len + dict_len; - *pad_len = (-size) & (LOG_ALIGN - 1); - size += *pad_len; - - return size; -} - /* * Define how much of the log buffer we could take at maximum. The value * must be greater than two. Note that only half of the buffer is available @@ -594,84 +472,69 @@ static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) #define MAX_LOG_TAKE_PART 4 static const char trunc_msg[] = "<truncated>"; -static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, - u16 *dict_len, u32 *pad_len) +static void truncate_msg(u16 *text_len, u16 *trunc_msg_len) { /* * The message should not take the whole buffer. Otherwise, it might * get removed too soon. */ u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; + if (*text_len > max_text_len) *text_len = max_text_len; - /* enable the warning message */ + + /* enable the warning message (if there is room) */ *trunc_msg_len = strlen(trunc_msg); - /* disable the "dict" completely */ - *dict_len = 0; - /* compute the size again, count also the warning message */ - return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len); + if (*text_len >= *trunc_msg_len) + *text_len -= *trunc_msg_len; + else + *trunc_msg_len = 0; } /* insert record into the buffer, discard old ones, update heads */ static int log_store(u32 caller_id, int facility, int level, enum log_flags flags, u64 ts_nsec, - const char *dict, u16 dict_len, + const struct dev_printk_info *dev_info, const char *text, u16 text_len) { - struct printk_log *msg; - u32 size, pad_len; + struct prb_reserved_entry e; + struct printk_record r; u16 trunc_msg_len = 0; - /* number of '\0' padding bytes to next message */ - size = msg_used_size(text_len, dict_len, &pad_len); + prb_rec_init_wr(&r, text_len); - if (log_make_free_space(size)) { + if (!prb_reserve(&e, prb, &r)) { /* truncate the message if it is too long for empty buffer */ - size = truncate_msg(&text_len, &trunc_msg_len, - &dict_len, &pad_len); + truncate_msg(&text_len, &trunc_msg_len); + prb_rec_init_wr(&r, text_len + trunc_msg_len); /* survive when the log buffer is too small for trunc_msg */ - if (log_make_free_space(size)) + if (!prb_reserve(&e, prb, &r)) return 0; } - if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { - /* - * This message + an additional empty header does not fit - * at the end of the buffer. Add an empty header with len == 0 - * to signify a wrap around. - */ - memset(log_buf + log_next_idx, 0, sizeof(struct printk_log)); - log_next_idx = 0; - } - /* fill message */ - msg = (struct printk_log *)(log_buf + log_next_idx); - memcpy(log_text(msg), text, text_len); - msg->text_len = text_len; - if (trunc_msg_len) { - memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len); - msg->text_len += trunc_msg_len; - } - memcpy(log_dict(msg), dict, dict_len); - msg->dict_len = dict_len; - msg->facility = facility; - msg->level = level & 7; - msg->flags = flags & 0x1f; + memcpy(&r.text_buf[0], text, text_len); + if (trunc_msg_len) + memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); + r.info->text_len = text_len + trunc_msg_len; + r.info->facility = facility; + r.info->level = level & 7; + r.info->flags = flags & 0x1f; if (ts_nsec > 0) - msg->ts_nsec = ts_nsec; + r.info->ts_nsec = ts_nsec; else - msg->ts_nsec = local_clock(); -#ifdef CONFIG_PRINTK_CALLER - msg->caller_id = caller_id; -#endif - memset(log_dict(msg) + dict_len, 0, pad_len); - msg->len = size; + r.info->ts_nsec = local_clock(); + r.info->caller_id = caller_id; + if (dev_info) + memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); /* insert message */ - log_next_idx += msg->len; - log_next_seq++; + if ((flags & LOG_CONT) || !(flags & LOG_NEWLINE)) + prb_commit(&e); + else + prb_final_commit(&e); - return msg->text_len; + return (text_len + trunc_msg_len); } int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); @@ -723,13 +586,13 @@ static void append_char(char **pp, char *e, char c) *(*pp)++ = c; } -static ssize_t msg_print_ext_header(char *buf, size_t size, - struct printk_log *msg, u64 seq) +static ssize_t info_print_ext_header(char *buf, size_t size, + struct printk_info *info) { - u64 ts_usec = msg->ts_nsec; + u64 ts_usec = info->ts_nsec; char caller[20]; #ifdef CONFIG_PRINTK_CALLER - u32 id = msg->caller_id; + u32 id = info->caller_id; snprintf(caller, sizeof(caller), ",caller=%c%u", id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); @@ -740,13 +603,13 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, do_div(ts_usec, 1000); return scnprintf(buf, size, "%u,%llu,%llu,%c%s;", - (msg->facility << 3) | msg->level, seq, ts_usec, - msg->flags & LOG_CONT ? 'c' : '-', caller); + (info->facility << 3) | info->level, info->seq, + ts_usec, info->flags & LOG_CONT ? 'c' : '-', caller); } -static ssize_t msg_print_ext_body(char *buf, size_t size, - char *dict, size_t dict_len, - char *text, size_t text_len) +static ssize_t msg_add_ext_text(char *buf, size_t size, + const char *text, size_t text_len, + unsigned char endc) { char *p = buf, *e = buf + size; size_t i; @@ -760,45 +623,56 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, else append_char(&p, e, c); } - append_char(&p, e, '\n'); + append_char(&p, e, endc); - if (dict_len) { - bool line = true; + return p - buf; +} - for (i = 0; i < dict_len; i++) { - unsigned char c = dict[i]; +static ssize_t msg_add_dict_text(char *buf, size_t size, + const char *key, const char *val) +{ + size_t val_len = strlen(val); + ssize_t len; - if (line) { - append_char(&p, e, ' '); - line = false; - } + if (!val_len) + return 0; - if (c == '\0') { - append_char(&p, e, '\n'); - line = true; - continue; - } + len = msg_add_ext_text(buf, size, "", 0, ' '); /* dict prefix */ + len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '='); + len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n'); - if (c < ' ' || c >= 127 || c == '\\') { - p += scnprintf(p, e - p, "\\x%02x", c); - continue; - } + return len; +} - append_char(&p, e, c); - } - append_char(&p, e, '\n'); - } +static ssize_t msg_print_ext_body(char *buf, size_t size, + char *text, size_t text_len, + struct dev_printk_info *dev_info) +{ + ssize_t len; - return p - buf; + len = msg_add_ext_text(buf, size, text, text_len, '\n'); + + if (!dev_info) + goto out; + + len += msg_add_dict_text(buf + len, size - len, "SUBSYSTEM", + dev_info->subsystem); + len += msg_add_dict_text(buf + len, size - len, "DEVICE", + dev_info->device); +out: + return len; } /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { u64 seq; - u32 idx; struct ratelimit_state rs; struct mutex lock; char buf[CONSOLE_EXT_LOG_MAX]; + + struct printk_info info; + char text_buf[CONSOLE_EXT_LOG_MAX]; + struct printk_record record; }; static __printf(3, 4) __cold @@ -808,7 +682,7 @@ int devkmsg_emit(int facility, int level, const char *fmt, ...) int r; va_start(args, fmt); - r = vprintk_emit(facility, level, NULL, 0, fmt, args); + r = vprintk_emit(facility, level, NULL, fmt, args); va_end(args); return r; @@ -881,7 +755,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct devkmsg_user *user = file->private_data; - struct printk_log *msg; + struct printk_record *r = &user->record; size_t len; ssize_t ret; @@ -893,7 +767,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, return ret; logbuf_lock_irq(); - while (user->seq == log_next_seq) { + if (!prb_read_valid(prb, user->seq, r)) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; logbuf_unlock_irq(); @@ -902,30 +776,26 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, logbuf_unlock_irq(); ret = wait_event_interruptible(log_wait, - user->seq != log_next_seq); + prb_read_valid(prb, user->seq, r)); if (ret) goto out; logbuf_lock_irq(); } - if (user->seq < log_first_seq) { + if (user->seq < prb_first_valid_seq(prb)) { /* our last seen message is gone, return error and reset */ - user->idx = log_first_idx; - user->seq = log_first_seq; + user->seq = prb_first_valid_seq(prb); ret = -EPIPE; logbuf_unlock_irq(); goto out; } - msg = log_from_idx(user->idx); - len = msg_print_ext_header(user->buf, sizeof(user->buf), - msg, user->seq); + len = info_print_ext_header(user->buf, sizeof(user->buf), r->info); len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len, - log_dict(msg), msg->dict_len, - log_text(msg), msg->text_len); + &r->text_buf[0], r->info->text_len, + &r->info->dev_info); - user->idx = log_next(user->idx); - user->seq++; + user->seq = r->info->seq + 1; logbuf_unlock_irq(); if (len > count) { @@ -965,8 +835,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) switch (whence) { case SEEK_SET: /* the first record */ - user->idx = log_first_idx; - user->seq = log_first_seq; + user->seq = prb_first_valid_seq(prb); break; case SEEK_DATA: /* @@ -974,13 +843,11 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) * like issued by 'dmesg -c'. Reading /dev/kmsg itself * changes no global state, and does not clear anything. */ - user->idx = clear_idx; user->seq = clear_seq; break; case SEEK_END: /* after the last record */ - user->idx = log_next_idx; - user->seq = log_next_seq; + user->seq = prb_next_seq(prb); break; default: ret = -EINVAL; @@ -1000,9 +867,9 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); logbuf_lock_irq(); - if (user->seq < log_next_seq) { + if (prb_read_valid(prb, user->seq, NULL)) { /* return error when data has vanished underneath us */ - if (user->seq < log_first_seq) + if (user->seq < prb_first_valid_seq(prb)) ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; else ret = EPOLLIN|EPOLLRDNORM; @@ -1037,9 +904,11 @@ static int devkmsg_open(struct inode *inode, struct file *file) mutex_init(&user->lock); + prb_rec_init_rd(&user->record, &user->info, + &user->text_buf[0], sizeof(user->text_buf)); + logbuf_lock_irq(); - user->idx = log_first_idx; - user->seq = log_first_seq; + user->seq = prb_first_valid_seq(prb); logbuf_unlock_irq(); file->private_data = user; @@ -1080,23 +949,58 @@ const struct file_operations kmsg_fops = { */ void log_buf_vmcoreinfo_setup(void) { - VMCOREINFO_SYMBOL(log_buf); - VMCOREINFO_SYMBOL(log_buf_len); - VMCOREINFO_SYMBOL(log_first_idx); - VMCOREINFO_SYMBOL(clear_idx); - VMCOREINFO_SYMBOL(log_next_idx); + struct dev_printk_info *dev_info = NULL; + + VMCOREINFO_SYMBOL(prb); + VMCOREINFO_SYMBOL(printk_rb_static); + VMCOREINFO_SYMBOL(clear_seq); + /* - * Export struct printk_log size and field offsets. User space tools can + * Export struct size and field offsets. User space tools can * parse it and detect any changes to structure down the line. */ - VMCOREINFO_STRUCT_SIZE(printk_log); - VMCOREINFO_OFFSET(printk_log, ts_nsec); - VMCOREINFO_OFFSET(printk_log, len); - VMCOREINFO_OFFSET(printk_log, text_len); - VMCOREINFO_OFFSET(printk_log, dict_len); -#ifdef CONFIG_PRINTK_CALLER - VMCOREINFO_OFFSET(printk_log, caller_id); -#endif + + VMCOREINFO_STRUCT_SIZE(printk_ringbuffer); + VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring); + VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring); + VMCOREINFO_OFFSET(printk_ringbuffer, fail); + + VMCOREINFO_STRUCT_SIZE(prb_desc_ring); + VMCOREINFO_OFFSET(prb_desc_ring, count_bits); + VMCOREINFO_OFFSET(prb_desc_ring, descs); + VMCOREINFO_OFFSET(prb_desc_ring, infos); + VMCOREINFO_OFFSET(prb_desc_ring, head_id); + VMCOREINFO_OFFSET(prb_desc_ring, tail_id); + + VMCOREINFO_STRUCT_SIZE(prb_desc); + VMCOREINFO_OFFSET(prb_desc, state_var); + VMCOREINFO_OFFSET(prb_desc, text_blk_lpos); + + VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos); + VMCOREINFO_OFFSET(prb_data_blk_lpos, begin); + VMCOREINFO_OFFSET(prb_data_blk_lpos, next); + + VMCOREINFO_STRUCT_SIZE(printk_info); + VMCOREINFO_OFFSET(printk_info, seq); + VMCOREINFO_OFFSET(printk_info, ts_nsec); + VMCOREINFO_OFFSET(printk_info, text_len); + VMCOREINFO_OFFSET(printk_info, caller_id); + VMCOREINFO_OFFSET(printk_info, dev_info); + + VMCOREINFO_STRUCT_SIZE(dev_printk_info); + VMCOREINFO_OFFSET(dev_printk_info, subsystem); + VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem)); + VMCOREINFO_OFFSET(dev_printk_info, device); + VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device)); + + VMCOREINFO_STRUCT_SIZE(prb_data_ring); + VMCOREINFO_OFFSET(prb_data_ring, size_bits); + VMCOREINFO_OFFSET(prb_data_ring, data); + VMCOREINFO_OFFSET(prb_data_ring, head_lpos); + VMCOREINFO_OFFSET(prb_data_ring, tail_lpos); + + VMCOREINFO_SIZE(atomic_long_t); + VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter); } #endif @@ -1174,11 +1078,46 @@ static void __init set_percpu_data_ready(void) __printk_percpu_data_ready = true; } +static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, + struct printk_record *r) +{ + struct prb_reserved_entry e; + struct printk_record dest_r; + + prb_rec_init_wr(&dest_r, r->info->text_len); + + if (!prb_reserve(&e, rb, &dest_r)) + return 0; + + memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len); + dest_r.info->text_len = r->info->text_len; + dest_r.info->facility = r->info->facility; + dest_r.info->level = r->info->level; + dest_r.info->flags = r->info->flags; + dest_r.info->ts_nsec = r->info->ts_nsec; + dest_r.info->caller_id = r->info->caller_id; + memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info)); + + prb_final_commit(&e); + + return prb_record_text_space(&e); +} + +static char setup_text_buf[LOG_LINE_MAX] __initdata; + void __init setup_log_buf(int early) { + struct printk_info *new_infos; + unsigned int new_descs_count; + struct prb_desc *new_descs; + struct printk_info info; + struct printk_record r; + size_t new_descs_size; + size_t new_infos_size; unsigned long flags; char *new_log_buf; unsigned int free; + u64 seq; /* * Some archs call setup_log_buf() multiple times - first is very @@ -1197,24 +1136,75 @@ void __init setup_log_buf(int early) if (!new_log_buf_len) return; + new_descs_count = new_log_buf_len >> PRB_AVGBITS; + if (new_descs_count == 0) { + pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len); + return; + } + new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); if (unlikely(!new_log_buf)) { - pr_err("log_buf_len: %lu bytes not available\n", - new_log_buf_len); + pr_err("log_buf_len: %lu text bytes not available\n", + new_log_buf_len); return; } + new_descs_size = new_descs_count * sizeof(struct prb_desc); + new_descs = memblock_alloc(new_descs_size, LOG_ALIGN); + if (unlikely(!new_descs)) { + pr_err("log_buf_len: %zu desc bytes not available\n", + new_descs_size); + goto err_free_log_buf; + } + + new_infos_size = new_descs_count * sizeof(struct printk_info); + new_infos = memblock_alloc(new_infos_size, LOG_ALIGN); + if (unlikely(!new_infos)) { + pr_err("log_buf_len: %zu info bytes not available\n", + new_infos_size); + goto err_free_descs; + } + + prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf)); + + prb_init(&printk_rb_dynamic, + new_log_buf, ilog2(new_log_buf_len), + new_descs, ilog2(new_descs_count), + new_infos); + logbuf_lock_irqsave(flags); + log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; - free = __LOG_BUF_LEN - log_next_idx; - memcpy(log_buf, __log_buf, __LOG_BUF_LEN); + + free = __LOG_BUF_LEN; + prb_for_each_record(0, &printk_rb_static, seq, &r) + free -= add_to_rb(&printk_rb_dynamic, &r); + + /* + * This is early enough that everything is still running on the + * boot CPU and interrupts are disabled. So no new messages will + * appear during the transition to the dynamic buffer. + */ + prb = &printk_rb_dynamic; + logbuf_unlock_irqrestore(flags); + if (seq != prb_next_seq(&printk_rb_static)) { + pr_err("dropped %llu messages\n", + prb_next_seq(&printk_rb_static) - seq); + } + pr_info("log_buf_len: %u bytes\n", log_buf_len); pr_info("early log buf free: %u(%u%%)\n", free, (free * 100) / __LOG_BUF_LEN); + return; + +err_free_descs: + memblock_free(__pa(new_descs), new_descs_size); +err_free_log_buf: + memblock_free(__pa(new_log_buf), new_log_buf_len); } static bool __read_mostly ignore_loglevel; @@ -1321,18 +1311,18 @@ static size_t print_caller(u32 id, char *buf) #define print_caller(id, buf) 0 #endif -static size_t print_prefix(const struct printk_log *msg, bool syslog, - bool time, char *buf) +static size_t info_print_prefix(const struct printk_info *info, bool syslog, + bool time, char *buf) { size_t len = 0; if (syslog) - len = print_syslog((msg->facility << 3) | msg->level, buf); + len = print_syslog((info->facility << 3) | info->level, buf); if (time) - len += print_time(msg->ts_nsec, buf + len); + len += print_time(info->ts_nsec, buf + len); - len += print_caller(msg->caller_id, buf + len); + len += print_caller(info->caller_id, buf + len); if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) { buf[len++] = ' '; @@ -1342,72 +1332,150 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog, return len; } -static size_t msg_print_text(const struct printk_log *msg, bool syslog, - bool time, char *buf, size_t size) +/* + * Prepare the record for printing. The text is shifted within the given + * buffer to avoid a need for another one. The following operations are + * done: + * + * - Add prefix for each line. + * - Add the trailing newline that has been removed in vprintk_store(). + * - Drop truncated lines that do not longer fit into the buffer. + * + * Return: The length of the updated/prepared text, including the added + * prefixes and the newline. The dropped line(s) are not counted. + */ +static size_t record_print_text(struct printk_record *r, bool syslog, + bool time) { - const char *text = log_text(msg); - size_t text_size = msg->text_len; - size_t len = 0; + size_t text_len = r->info->text_len; + size_t buf_size = r->text_buf_size; + char *text = r->text_buf; char prefix[PREFIX_MAX]; - const size_t prefix_len = print_prefix(msg, syslog, time, prefix); + bool truncated = false; + size_t prefix_len; + size_t line_len; + size_t len = 0; + char *next; + + /* + * If the message was truncated because the buffer was not large + * enough, treat the available text as if it were the full text. + */ + if (text_len > buf_size) + text_len = buf_size; - do { - const char *next = memchr(text, '\n', text_size); - size_t text_len; + prefix_len = info_print_prefix(r->info, syslog, time, prefix); + /* + * @text_len: bytes of unprocessed text + * @line_len: bytes of current line _without_ newline + * @text: pointer to beginning of current line + * @len: number of bytes prepared in r->text_buf + */ + for (;;) { + next = memchr(text, '\n', text_len); if (next) { - text_len = next - text; - next++; - text_size -= next - text; + line_len = next - text; } else { - text_len = text_size; + /* Drop truncated line(s). */ + if (truncated) + break; + line_len = text_len; } - if (buf) { - if (prefix_len + text_len + 1 >= size - len) + /* + * Truncate the text if there is not enough space to add the + * prefix and a trailing newline. + */ + if (len + prefix_len + text_len + 1 > buf_size) { + /* Drop even the current line if no space. */ + if (len + prefix_len + line_len + 1 > buf_size) break; - memcpy(buf + len, prefix, prefix_len); - len += prefix_len; - memcpy(buf + len, text, text_len); - len += text_len; - buf[len++] = '\n'; - } else { - /* SYSLOG_ACTION_* buffer size only calculation */ - len += prefix_len + text_len + 1; + text_len = buf_size - len - prefix_len - 1; + truncated = true; } - text = next; - } while (text); + memmove(text + prefix_len, text, text_len); + memcpy(text, prefix, prefix_len); + + len += prefix_len + line_len + 1; + + if (text_len == line_len) { + /* + * Add the trailing newline removed in + * vprintk_store(). + */ + text[prefix_len + line_len] = '\n'; + break; + } + + /* + * Advance beyond the added prefix and the related line with + * its newline. + */ + text += prefix_len + line_len + 1; + + /* + * The remaining text has only decreased by the line with its + * newline. + * + * Note that @text_len can become zero. It happens when @text + * ended with a newline (either due to truncation or the + * original string ending with "\n\n"). The loop is correctly + * repeated and (if not truncated) an empty line with a prefix + * will be prepared. + */ + text_len -= line_len + 1; + } return len; } +static size_t get_record_print_text_size(struct printk_info *info, + unsigned int line_count, + bool syslog, bool time) +{ + char prefix[PREFIX_MAX]; + size_t prefix_len; + + prefix_len = info_print_prefix(info, syslog, time, prefix); + + /* + * Each line will be preceded with a prefix. The intermediate + * newlines are already within the text, but a final trailing + * newline will be added. + */ + return ((prefix_len * line_count) + info->text_len + 1); +} + static int syslog_print(char __user *buf, int size) { + struct printk_info info; + struct printk_record r; char *text; - struct printk_log *msg; int len = 0; text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; + prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); + while (size > 0) { size_t n; size_t skip; logbuf_lock_irq(); - if (syslog_seq < log_first_seq) { - /* messages are gone, move to first one */ - syslog_seq = log_first_seq; - syslog_idx = log_first_idx; - syslog_partial = 0; - } - if (syslog_seq == log_next_seq) { + if (!prb_read_valid(prb, syslog_seq, &r)) { logbuf_unlock_irq(); break; } + if (r.info->seq != syslog_seq) { + /* message is gone, move to next valid one */ + syslog_seq = r.info->seq; + syslog_partial = 0; + } /* * To keep reading/counting partial line consistent, @@ -1417,13 +1485,10 @@ static int syslog_print(char __user *buf, int size) syslog_time = printk_time; skip = syslog_partial; - msg = log_from_idx(syslog_idx); - n = msg_print_text(msg, true, syslog_time, text, - LOG_LINE_MAX + PREFIX_MAX); + n = record_print_text(&r, true, syslog_time); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ - syslog_idx = log_next(syslog_idx); - syslog_seq++; + syslog_seq = r.info->seq + 1; n -= syslog_partial; syslog_partial = 0; } else if (!len){ @@ -1454,11 +1519,12 @@ static int syslog_print(char __user *buf, int size) static int syslog_print_all(char __user *buf, int size, bool clear) { + struct printk_info info; + unsigned int line_count; + struct printk_record r; char *text; int len = 0; - u64 next_seq; u64 seq; - u32 idx; bool time; text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); @@ -1471,38 +1537,28 @@ static int syslog_print_all(char __user *buf, int size, bool clear) * Find first record that fits, including all following records, * into the user-provided buffer for this dump. */ - seq = clear_seq; - idx = clear_idx; - while (seq < log_next_seq) { - struct printk_log *msg = log_from_idx(idx); - - len += msg_print_text(msg, true, time, NULL, 0); - idx = log_next(idx); - seq++; - } + prb_for_each_info(clear_seq, prb, seq, &info, &line_count) + len += get_record_print_text_size(&info, line_count, true, time); /* move first record forward until length fits into the buffer */ - seq = clear_seq; - idx = clear_idx; - while (len > size && seq < log_next_seq) { - struct printk_log *msg = log_from_idx(idx); - - len -= msg_print_text(msg, true, time, NULL, 0); - idx = log_next(idx); - seq++; + prb_for_each_info(clear_seq, prb, seq, &info, &line_count) { + if (len <= size) + break; + len -= get_record_print_text_size(&info, line_count, true, time); } - /* last message fitting into this dump */ - next_seq = log_next_seq; + prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); len = 0; - while (len >= 0 && seq < next_seq) { - struct printk_log *msg = log_from_idx(idx); - int textlen = msg_print_text(msg, true, time, text, - LOG_LINE_MAX + PREFIX_MAX); + prb_for_each_record(seq, prb, seq, &r) { + int textlen; - idx = log_next(idx); - seq++; + textlen = record_print_text(&r, true, time); + + if (len + textlen > size) { + seq--; + break; + } logbuf_unlock_irq(); if (copy_to_user(buf + len, text, textlen)) @@ -1511,17 +1567,12 @@ static int syslog_print_all(char __user *buf, int size, bool clear) len += textlen; logbuf_lock_irq(); - if (seq < log_first_seq) { - /* messages are gone, move to next one */ - seq = log_first_seq; - idx = log_first_idx; - } + if (len < 0) + break; } - if (clear) { - clear_seq = log_next_seq; - clear_idx = log_next_idx; - } + if (clear) + clear_seq = seq; logbuf_unlock_irq(); kfree(text); @@ -1531,8 +1582,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) static void syslog_clear(void) { logbuf_lock_irq(); - clear_seq = log_next_seq; - clear_idx = log_next_idx; + clear_seq = prb_next_seq(prb); logbuf_unlock_irq(); } @@ -1559,7 +1609,7 @@ int do_syslog(int type, char __user *buf, int len, int source) if (!access_ok(buf, len)) return -EFAULT; error = wait_event_interruptible(log_wait, - syslog_seq != log_next_seq); + prb_read_valid(prb, syslog_seq, NULL)); if (error) return error; error = syslog_print(buf, len); @@ -1567,7 +1617,7 @@ int do_syslog(int type, char __user *buf, int len, int source) /* Read/clear last kernel messages */ case SYSLOG_ACTION_READ_CLEAR: clear = true; - /* FALL THRU */ + fallthrough; /* Read last kernel messages */ case SYSLOG_ACTION_READ_ALL: if (!buf || len < 0) @@ -1608,10 +1658,9 @@ int do_syslog(int type, char __user *buf, int len, int source) /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: logbuf_lock_irq(); - if (syslog_seq < log_first_seq) { + if (syslog_seq < prb_first_valid_seq(prb)) { /* messages are gone, move to first one */ - syslog_seq = log_first_seq; - syslog_idx = log_first_idx; + syslog_seq = prb_first_valid_seq(prb); syslog_partial = 0; } if (source == SYSLOG_FROM_PROC) { @@ -1620,20 +1669,18 @@ int do_syslog(int type, char __user *buf, int len, int source) * for pending data, not the size; return the count of * records, not the length. */ - error = log_next_seq - syslog_seq; + error = prb_next_seq(prb) - syslog_seq; } else { - u64 seq = syslog_seq; - u32 idx = syslog_idx; bool time = syslog_partial ? syslog_time : printk_time; - - while (seq < log_next_seq) { - struct printk_log *msg = log_from_idx(idx); - - error += msg_print_text(msg, true, time, NULL, - 0); + struct printk_info info; + unsigned int line_count; + u64 seq; + + prb_for_each_info(syslog_seq, prb, seq, &info, + &line_count) { + error += get_record_print_text_size(&info, line_count, + true, time); time = printk_time; - idx = log_next(idx); - seq++; } error -= syslog_partial; } @@ -1804,10 +1851,22 @@ static int console_trylock_spinning(void) static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) { + static char dropped_text[64]; + size_t dropped_len = 0; struct console *con; trace_console_rcuidle(text, len); + if (!console_drivers) + return; + + if (console_dropped) { + dropped_len = snprintf(dropped_text, sizeof(dropped_text), + "** %lu printk messages dropped **\n", + console_dropped); + console_dropped = 0; + } + for_each_console(con) { if (exclusive_console && con != exclusive_console) continue; @@ -1820,8 +1879,11 @@ static void call_console_drivers(const char *ext_text, size_t ext_len, continue; if (con->flags & CON_EXTENDED) con->write(con, ext_text, ext_len); - else + else { + if (dropped_len) + con->write(con, dropped_text, dropped_len); con->write(con, text, len); + } } } @@ -1845,97 +1907,38 @@ static inline u32 printk_caller_id(void) 0x80000000 + raw_smp_processor_id(); } -/* - * Continuation lines are buffered, and not committed to the record buffer - * until the line is complete, or a race forces it. The line fragments - * though, are printed immediately to the consoles to ensure everything has - * reached the console in case of a kernel crash. - */ -static struct cont { - char buf[LOG_LINE_MAX]; - size_t len; /* length == 0 means unused buffer */ - u32 caller_id; /* printk_caller_id() of first print */ - u64 ts_nsec; /* time of first print */ - u8 level; /* log level of first message */ - u8 facility; /* log facility of first message */ - enum log_flags flags; /* prefix, newline flags */ -} cont; - -static void cont_flush(void) -{ - if (cont.len == 0) - return; - - log_store(cont.caller_id, cont.facility, cont.level, cont.flags, - cont.ts_nsec, NULL, 0, cont.buf, cont.len); - cont.len = 0; -} - -static bool cont_add(u32 caller_id, int facility, int level, - enum log_flags flags, const char *text, size_t len) -{ - /* If the line gets too long, split it up in separate records. */ - if (cont.len + len > sizeof(cont.buf)) { - cont_flush(); - return false; - } - - if (!cont.len) { - cont.facility = facility; - cont.level = level; - cont.caller_id = caller_id; - cont.ts_nsec = local_clock(); - cont.flags = flags; - } - - memcpy(cont.buf + cont.len, text, len); - cont.len += len; - - // The original flags come from the first line, - // but later continuations can add a newline. - if (flags & LOG_NEWLINE) { - cont.flags |= LOG_NEWLINE; - cont_flush(); - } - - return true; -} - -static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) +static size_t log_output(int facility, int level, enum log_flags lflags, + const struct dev_printk_info *dev_info, + char *text, size_t text_len) { const u32 caller_id = printk_caller_id(); - /* - * If an earlier line was buffered, and we're a continuation - * write from the same context, try to add it to the buffer. - */ - if (cont.len) { - if (cont.caller_id == caller_id && (lflags & LOG_CONT)) { - if (cont_add(caller_id, facility, level, lflags, text, text_len)) - return text_len; - } - /* Otherwise, make sure it's flushed */ - cont_flush(); - } - - /* Skip empty continuation lines that couldn't be added - they just flush */ - if (!text_len && (lflags & LOG_CONT)) - return 0; - - /* If it doesn't end in a newline, try to buffer the current line */ - if (!(lflags & LOG_NEWLINE)) { - if (cont_add(caller_id, facility, level, lflags, text, text_len)) + if (lflags & LOG_CONT) { + struct prb_reserved_entry e; + struct printk_record r; + + prb_rec_init_wr(&r, text_len); + if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { + memcpy(&r.text_buf[r.info->text_len], text, text_len); + r.info->text_len += text_len; + if (lflags & LOG_NEWLINE) { + r.info->flags |= LOG_NEWLINE; + prb_final_commit(&e); + } else { + prb_commit(&e); + } return text_len; + } } /* Store it in the record log */ return log_store(caller_id, facility, level, lflags, 0, - dict, dictlen, text, text_len); + dev_info, text, text_len); } /* Must be called under logbuf_lock. */ int vprintk_store(int facility, int level, - const char *dict, size_t dictlen, + const struct dev_printk_info *dev_info, const char *fmt, va_list args) { static char textbuf[LOG_LINE_MAX]; @@ -1977,21 +1980,19 @@ int vprintk_store(int facility, int level, if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; - if (dict) + if (dev_info) lflags |= LOG_NEWLINE; - return log_output(facility, level, lflags, - dict, dictlen, text, text_len); + return log_output(facility, level, lflags, dev_info, text, text_len); } asmlinkage int vprintk_emit(int facility, int level, - const char *dict, size_t dictlen, + const struct dev_printk_info *dev_info, const char *fmt, va_list args) { int printed_len; - bool in_sched = false, pending_output; + bool in_sched = false; unsigned long flags; - u64 curr_log_seq; /* Suppress unimportant messages after panic happens */ if (unlikely(suppress_printk)) @@ -2007,13 +2008,11 @@ asmlinkage int vprintk_emit(int facility, int level, /* This stops the holder of console_sem just where we want him */ logbuf_lock_irqsave(flags); - curr_log_seq = log_next_seq; - printed_len = vprintk_store(facility, level, dict, dictlen, fmt, args); - pending_output = (curr_log_seq != log_next_seq); + printed_len = vprintk_store(facility, level, dev_info, fmt, args); logbuf_unlock_irqrestore(flags); /* If called from the scheduler, we can not call up(). */ - if (!in_sched && pending_output) { + if (!in_sched) { /* * Disable preemption to avoid being preempted while holding * console_sem which would prevent anyone from printing to @@ -2030,8 +2029,7 @@ asmlinkage int vprintk_emit(int facility, int level, preempt_enable(); } - if (pending_output) - wake_up_klogd(); + wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); @@ -2044,7 +2042,7 @@ EXPORT_SYMBOL(vprintk); int vprintk_default(const char *fmt, va_list args) { - return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); } EXPORT_SYMBOL_GPL(vprintk_default); @@ -2088,30 +2086,31 @@ EXPORT_SYMBOL(printk); #define PREFIX_MAX 0 #define printk_time false +#define prb_read_valid(rb, seq, r) false +#define prb_first_valid_seq(rb) 0 + static u64 syslog_seq; -static u32 syslog_idx; static u64 console_seq; -static u32 console_idx; static u64 exclusive_console_stop_seq; -static u64 log_first_seq; -static u32 log_first_idx; -static u64 log_next_seq; -static char *log_text(const struct printk_log *msg) { return NULL; } -static char *log_dict(const struct printk_log *msg) { return NULL; } -static struct printk_log *log_from_idx(u32 idx) { return NULL; } -static u32 log_next(u32 idx) { return 0; } -static ssize_t msg_print_ext_header(char *buf, size_t size, - struct printk_log *msg, - u64 seq) { return 0; } +static unsigned long console_dropped; + +static size_t record_print_text(const struct printk_record *r, + bool syslog, bool time) +{ + return 0; +} +static ssize_t info_print_ext_header(char *buf, size_t size, + struct printk_info *info) +{ + return 0; +} static ssize_t msg_print_ext_body(char *buf, size_t size, - char *dict, size_t dict_len, - char *text, size_t text_len) { return 0; } + char *text, size_t text_len, + struct dev_printk_info *dev_info) { return 0; } static void console_lock_spinning_enable(void) { } static int console_lock_spinning_disable_and_check(void) { return 0; } static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} -static size_t msg_print_text(const struct printk_log *msg, bool syslog, - bool time, char *buf, size_t size) { return 0; } static bool suppress_message_printing(int level) { return false; } #endif /* CONFIG_PRINTK */ @@ -2398,12 +2397,16 @@ void console_unlock(void) static char text[LOG_LINE_MAX + PREFIX_MAX]; unsigned long flags; bool do_cond_resched, retry; + struct printk_info info; + struct printk_record r; if (console_suspended) { up_console_sem(); return; } + prb_rec_init_rd(&r, &info, text, sizeof(text)); + /* * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may @@ -2416,7 +2419,7 @@ void console_unlock(void) * * console_trylock() is not able to detect the preemptive * context reliably. Therefore the value must be stored before - * and cleared after the the "again" goto label. + * and cleared after the "again" goto label. */ do_cond_resched = console_may_schedule; again: @@ -2434,35 +2437,26 @@ again: } for (;;) { - struct printk_log *msg; size_t ext_len = 0; size_t len; printk_safe_enter_irqsave(flags); raw_spin_lock(&logbuf_lock); - if (console_seq < log_first_seq) { - len = snprintf(text, sizeof(text), - "** %llu printk messages dropped **\n", - log_first_seq - console_seq); - - /* messages are gone, move to first one */ - console_seq = log_first_seq; - console_idx = log_first_idx; - } else { - len = 0; - } skip: - if (console_seq == log_next_seq) + if (!prb_read_valid(prb, console_seq, &r)) break; - msg = log_from_idx(console_idx); - if (suppress_message_printing(msg->level)) { + if (console_seq != r.info->seq) { + console_dropped += r.info->seq - console_seq; + console_seq = r.info->seq; + } + + if (suppress_message_printing(r.info->level)) { /* * Skip record we have buffered and already printed * directly to the console when we received it, and * record that has level above the console loglevel. */ - console_idx = log_next(console_idx); console_seq++; goto skip; } @@ -2473,19 +2467,23 @@ skip: exclusive_console = NULL; } - len += msg_print_text(msg, - console_msg_format & MSG_FORMAT_SYSLOG, - printk_time, text + len, sizeof(text) - len); + /* + * Handle extended console text first because later + * record_print_text() will modify the record buffer in-place. + */ if (nr_ext_console_drivers) { - ext_len = msg_print_ext_header(ext_text, + ext_len = info_print_ext_header(ext_text, sizeof(ext_text), - msg, console_seq); + r.info); ext_len += msg_print_ext_body(ext_text + ext_len, sizeof(ext_text) - ext_len, - log_dict(msg), msg->dict_len, - log_text(msg), msg->text_len); + &r.text_buf[0], + r.info->text_len, + &r.info->dev_info); } - console_idx = log_next(console_idx); + len = record_print_text(&r, + console_msg_format & MSG_FORMAT_SYSLOG, + printk_time); console_seq++; raw_spin_unlock(&logbuf_lock); @@ -2525,7 +2523,7 @@ skip: * flush, no worries. */ raw_spin_lock(&logbuf_lock); - retry = console_seq != log_next_seq; + retry = prb_read_valid(prb, console_seq, NULL); raw_spin_unlock(&logbuf_lock); printk_safe_exit_irqrestore(flags); @@ -2594,8 +2592,7 @@ void console_flush_on_panic(enum con_flush_mode mode) unsigned long flags; logbuf_lock_irqsave(flags); - console_seq = log_first_seq; - console_idx = log_first_idx; + console_seq = prb_first_valid_seq(prb); logbuf_unlock_irqrestore(flags); } console_unlock(); @@ -2838,7 +2835,6 @@ void register_console(struct console *newcon) exclusive_console = newcon; exclusive_console_stop_seq = console_seq; console_seq = syslog_seq; - console_idx = syslog_idx; logbuf_unlock_irqrestore(flags); } console_unlock(); @@ -3062,7 +3058,7 @@ int vprintk_deferred(const char *fmt, va_list args) { int r; - r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); + r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); defer_console_output(); return r; @@ -3227,9 +3223,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) logbuf_lock_irqsave(flags); dumper->cur_seq = clear_seq; - dumper->cur_idx = clear_idx; - dumper->next_seq = log_next_seq; - dumper->next_idx = log_next_idx; + dumper->next_seq = prb_next_seq(prb); logbuf_unlock_irqrestore(flags); /* invoke dumper which will iterate over records */ @@ -3263,28 +3257,33 @@ void kmsg_dump(enum kmsg_dump_reason reason) bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, char *line, size_t size, size_t *len) { - struct printk_log *msg; + struct printk_info info; + unsigned int line_count; + struct printk_record r; size_t l = 0; bool ret = false; + prb_rec_init_rd(&r, &info, line, size); + if (!dumper->active) goto out; - if (dumper->cur_seq < log_first_seq) { - /* messages are gone, move to first available one */ - dumper->cur_seq = log_first_seq; - dumper->cur_idx = log_first_idx; - } - - /* last entry */ - if (dumper->cur_seq >= log_next_seq) - goto out; + /* Read text or count text lines? */ + if (line) { + if (!prb_read_valid(prb, dumper->cur_seq, &r)) + goto out; + l = record_print_text(&r, syslog, printk_time); + } else { + if (!prb_read_valid_info(prb, dumper->cur_seq, + &info, &line_count)) { + goto out; + } + l = get_record_print_text_size(&info, line_count, syslog, + printk_time); - msg = log_from_idx(dumper->cur_idx); - l = msg_print_text(msg, syslog, printk_time, line, size); + } - dumper->cur_idx = log_next(dumper->cur_idx); - dumper->cur_seq++; + dumper->cur_seq = r.info->seq + 1; ret = true; out: if (len) @@ -3332,7 +3331,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line); * @len: length of line placed into buffer * * Start at the end of the kmsg buffer and fill the provided buffer - * with as many of the the *youngest* kmsg records that fit into it. + * with as many of the *youngest* kmsg records that fit into it. * If the buffer is large enough, all available kmsg records will be * copied with a single call. * @@ -3345,23 +3344,25 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line); bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, char *buf, size_t size, size_t *len) { + struct printk_info info; + unsigned int line_count; + struct printk_record r; unsigned long flags; u64 seq; - u32 idx; u64 next_seq; - u32 next_idx; size_t l = 0; bool ret = false; bool time = printk_time; - if (!dumper->active) + prb_rec_init_rd(&r, &info, buf, size); + + if (!dumper->active || !buf || !size) goto out; logbuf_lock_irqsave(flags); - if (dumper->cur_seq < log_first_seq) { + if (dumper->cur_seq < prb_first_valid_seq(prb)) { /* messages are gone, move to first available one */ - dumper->cur_seq = log_first_seq; - dumper->cur_idx = log_first_idx; + dumper->cur_seq = prb_first_valid_seq(prb); } /* last entry */ @@ -3372,41 +3373,41 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, /* calculate length of entire buffer */ seq = dumper->cur_seq; - idx = dumper->cur_idx; - while (seq < dumper->next_seq) { - struct printk_log *msg = log_from_idx(idx); - - l += msg_print_text(msg, true, time, NULL, 0); - idx = log_next(idx); - seq++; + while (prb_read_valid_info(prb, seq, &info, &line_count)) { + if (r.info->seq >= dumper->next_seq) + break; + l += get_record_print_text_size(&info, line_count, true, time); + seq = r.info->seq + 1; } /* move first record forward until length fits into the buffer */ seq = dumper->cur_seq; - idx = dumper->cur_idx; - while (l >= size && seq < dumper->next_seq) { - struct printk_log *msg = log_from_idx(idx); - - l -= msg_print_text(msg, true, time, NULL, 0); - idx = log_next(idx); - seq++; + while (l >= size && prb_read_valid_info(prb, seq, + &info, &line_count)) { + if (r.info->seq >= dumper->next_seq) + break; + l -= get_record_print_text_size(&info, line_count, true, time); + seq = r.info->seq + 1; } /* last message in next interation */ next_seq = seq; - next_idx = idx; + /* actually read text into the buffer now */ l = 0; - while (seq < dumper->next_seq) { - struct printk_log *msg = log_from_idx(idx); + while (prb_read_valid(prb, seq, &r)) { + if (r.info->seq >= dumper->next_seq) + break; + + l += record_print_text(&r, syslog, time); + + /* adjust record to store to remaining buffer space */ + prb_rec_init_rd(&r, &info, buf + l, size - l); - l += msg_print_text(msg, syslog, time, buf + l, size - l); - idx = log_next(idx); - seq++; + seq = r.info->seq + 1; } dumper->next_seq = next_seq; - dumper->next_idx = next_idx; ret = true; logbuf_unlock_irqrestore(flags); out: @@ -3429,9 +3430,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) { dumper->cur_seq = clear_seq; - dumper->cur_idx = clear_idx; - dumper->next_seq = log_next_seq; - dumper->next_idx = log_next_idx; + dumper->next_seq = prb_next_seq(prb); } /** diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c new file mode 100644 index 000000000000..6b1525685277 --- /dev/null +++ b/kernel/printk/printk_ringbuffer.c @@ -0,0 +1,2086 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/kernel.h> +#include <linux/irqflags.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/bug.h> +#include "printk_ringbuffer.h" + +/** + * DOC: printk_ringbuffer overview + * + * Data Structure + * -------------- + * The printk_ringbuffer is made up of 3 internal ringbuffers: + * + * desc_ring + * A ring of descriptors and their meta data (such as sequence number, + * timestamp, loglevel, etc.) as well as internal state information about + * the record and logical positions specifying where in the other + * ringbuffer the text strings are located. + * + * text_data_ring + * A ring of data blocks. A data block consists of an unsigned long + * integer (ID) that maps to a desc_ring index followed by the text + * string of the record. + * + * The internal state information of a descriptor is the key element to allow + * readers and writers to locklessly synchronize access to the data. + * + * Implementation + * -------------- + * + * Descriptor Ring + * ~~~~~~~~~~~~~~~ + * The descriptor ring is an array of descriptors. A descriptor contains + * essential meta data to track the data of a printk record using + * blk_lpos structs pointing to associated text data blocks (see + * "Data Rings" below). Each descriptor is assigned an ID that maps + * directly to index values of the descriptor array and has a state. The ID + * and the state are bitwise combined into a single descriptor field named + * @state_var, allowing ID and state to be synchronously and atomically + * updated. + * + * Descriptors have four states: + * + * reserved + * A writer is modifying the record. + * + * committed + * The record and all its data are written. A writer can reopen the + * descriptor (transitioning it back to reserved), but in the committed + * state the data is consistent. + * + * finalized + * The record and all its data are complete and available for reading. A + * writer cannot reopen the descriptor. + * + * reusable + * The record exists, but its text and/or meta data may no longer be + * available. + * + * Querying the @state_var of a record requires providing the ID of the + * descriptor to query. This can yield a possible fifth (pseudo) state: + * + * miss + * The descriptor being queried has an unexpected ID. + * + * The descriptor ring has a @tail_id that contains the ID of the oldest + * descriptor and @head_id that contains the ID of the newest descriptor. + * + * When a new descriptor should be created (and the ring is full), the tail + * descriptor is invalidated by first transitioning to the reusable state and + * then invalidating all tail data blocks up to and including the data blocks + * associated with the tail descriptor (for the text ring). Then + * @tail_id is advanced, followed by advancing @head_id. And finally the + * @state_var of the new descriptor is initialized to the new ID and reserved + * state. + * + * The @tail_id can only be advanced if the new @tail_id would be in the + * committed or reusable queried state. This makes it possible that a valid + * sequence number of the tail is always available. + * + * Descriptor Finalization + * ~~~~~~~~~~~~~~~~~~~~~~~ + * When a writer calls the commit function prb_commit(), record data is + * fully stored and is consistent within the ringbuffer. However, a writer can + * reopen that record, claiming exclusive access (as with prb_reserve()), and + * modify that record. When finished, the writer must again commit the record. + * + * In order for a record to be made available to readers (and also become + * recyclable for writers), it must be finalized. A finalized record cannot be + * reopened and can never become "unfinalized". Record finalization can occur + * in three different scenarios: + * + * 1) A writer can simultaneously commit and finalize its record by calling + * prb_final_commit() instead of prb_commit(). + * + * 2) When a new record is reserved and the previous record has been + * committed via prb_commit(), that previous record is automatically + * finalized. + * + * 3) When a record is committed via prb_commit() and a newer record + * already exists, the record being committed is automatically finalized. + * + * Data Ring + * ~~~~~~~~~ + * The text data ring is a byte array composed of data blocks. Data blocks are + * referenced by blk_lpos structs that point to the logical position of the + * beginning of a data block and the beginning of the next adjacent data + * block. Logical positions are mapped directly to index values of the byte + * array ringbuffer. + * + * Each data block consists of an ID followed by the writer data. The ID is + * the identifier of a descriptor that is associated with the data block. A + * given data block is considered valid if all of the following conditions + * are met: + * + * 1) The descriptor associated with the data block is in the committed + * or finalized queried state. + * + * 2) The blk_lpos struct within the descriptor associated with the data + * block references back to the same data block. + * + * 3) The data block is within the head/tail logical position range. + * + * If the writer data of a data block would extend beyond the end of the + * byte array, only the ID of the data block is stored at the logical + * position and the full data block (ID and writer data) is stored at the + * beginning of the byte array. The referencing blk_lpos will point to the + * ID before the wrap and the next data block will be at the logical + * position adjacent the full data block after the wrap. + * + * Data rings have a @tail_lpos that points to the beginning of the oldest + * data block and a @head_lpos that points to the logical position of the + * next (not yet existing) data block. + * + * When a new data block should be created (and the ring is full), tail data + * blocks will first be invalidated by putting their associated descriptors + * into the reusable state and then pushing the @tail_lpos forward beyond + * them. Then the @head_lpos is pushed forward and is associated with a new + * descriptor. If a data block is not valid, the @tail_lpos cannot be + * advanced beyond it. + * + * Info Array + * ~~~~~~~~~~ + * The general meta data of printk records are stored in printk_info structs, + * stored in an array with the same number of elements as the descriptor ring. + * Each info corresponds to the descriptor of the same index in the + * descriptor ring. Info validity is confirmed by evaluating the corresponding + * descriptor before and after loading the info. + * + * Usage + * ----- + * Here are some simple examples demonstrating writers and readers. For the + * examples a global ringbuffer (test_rb) is available (which is not the + * actual ringbuffer used by printk):: + * + * DEFINE_PRINTKRB(test_rb, 15, 5); + * + * This ringbuffer allows up to 32768 records (2 ^ 15) and has a size of + * 1 MiB (2 ^ (15 + 5)) for text data. + * + * Sample writer code:: + * + * const char *textstr = "message text"; + * struct prb_reserved_entry e; + * struct printk_record r; + * + * // specify how much to allocate + * prb_rec_init_wr(&r, strlen(textstr) + 1); + * + * if (prb_reserve(&e, &test_rb, &r)) { + * snprintf(r.text_buf, r.text_buf_size, "%s", textstr); + * + * r.info->text_len = strlen(textstr); + * r.info->ts_nsec = local_clock(); + * r.info->caller_id = printk_caller_id(); + * + * // commit and finalize the record + * prb_final_commit(&e); + * } + * + * Note that additional writer functions are available to extend a record + * after it has been committed but not yet finalized. This can be done as + * long as no new records have been reserved and the caller is the same. + * + * Sample writer code (record extending):: + * + * // alternate rest of previous example + * + * r.info->text_len = strlen(textstr); + * r.info->ts_nsec = local_clock(); + * r.info->caller_id = printk_caller_id(); + * + * // commit the record (but do not finalize yet) + * prb_commit(&e); + * } + * + * ... + * + * // specify additional 5 bytes text space to extend + * prb_rec_init_wr(&r, 5); + * + * // try to extend, but only if it does not exceed 32 bytes + * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id()), 32) { + * snprintf(&r.text_buf[r.info->text_len], + * r.text_buf_size - r.info->text_len, "hello"); + * + * r.info->text_len += 5; + * + * // commit and finalize the record + * prb_final_commit(&e); + * } + * + * Sample reader code:: + * + * struct printk_info info; + * struct printk_record r; + * char text_buf[32]; + * u64 seq; + * + * prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf)); + * + * prb_for_each_record(0, &test_rb, &seq, &r) { + * if (info.seq != seq) + * pr_warn("lost %llu records\n", info.seq - seq); + * + * if (info.text_len > r.text_buf_size) { + * pr_warn("record %llu text truncated\n", info.seq); + * text_buf[r.text_buf_size - 1] = 0; + * } + * + * pr_info("%llu: %llu: %s\n", info.seq, info.ts_nsec, + * &text_buf[0]); + * } + * + * Note that additional less convenient reader functions are available to + * allow complex record access. + * + * ABA Issues + * ~~~~~~~~~~ + * To help avoid ABA issues, descriptors are referenced by IDs (array index + * values combined with tagged bits counting array wraps) and data blocks are + * referenced by logical positions (array index values combined with tagged + * bits counting array wraps). However, on 32-bit systems the number of + * tagged bits is relatively small such that an ABA incident is (at least + * theoretically) possible. For example, if 4 million maximally sized (1KiB) + * printk messages were to occur in NMI context on a 32-bit system, the + * interrupted context would not be able to recognize that the 32-bit integer + * completely wrapped and thus represents a different data block than the one + * the interrupted context expects. + * + * To help combat this possibility, additional state checking is performed + * (such as using cmpxchg() even though set() would suffice). These extra + * checks are commented as such and will hopefully catch any ABA issue that + * a 32-bit system might experience. + * + * Memory Barriers + * ~~~~~~~~~~~~~~~ + * Multiple memory barriers are used. To simplify proving correctness and + * generating litmus tests, lines of code related to memory barriers + * (loads, stores, and the associated memory barriers) are labeled:: + * + * LMM(function:letter) + * + * Comments reference the labels using only the "function:letter" part. + * + * The memory barrier pairs and their ordering are: + * + * desc_reserve:D / desc_reserve:B + * push descriptor tail (id), then push descriptor head (id) + * + * desc_reserve:D / data_push_tail:B + * push data tail (lpos), then set new descriptor reserved (state) + * + * desc_reserve:D / desc_push_tail:C + * push descriptor tail (id), then set new descriptor reserved (state) + * + * desc_reserve:D / prb_first_seq:C + * push descriptor tail (id), then set new descriptor reserved (state) + * + * desc_reserve:F / desc_read:D + * set new descriptor id and reserved (state), then allow writer changes + * + * data_alloc:A (or data_realloc:A) / desc_read:D + * set old descriptor reusable (state), then modify new data block area + * + * data_alloc:A (or data_realloc:A) / data_push_tail:B + * push data tail (lpos), then modify new data block area + * + * _prb_commit:B / desc_read:B + * store writer changes, then set new descriptor committed (state) + * + * desc_reopen_last:A / _prb_commit:B + * set descriptor reserved (state), then read descriptor data + * + * _prb_commit:B / desc_reserve:D + * set new descriptor committed (state), then check descriptor head (id) + * + * data_push_tail:D / data_push_tail:A + * set descriptor reusable (state), then push data tail (lpos) + * + * desc_push_tail:B / desc_reserve:D + * set descriptor reusable (state), then push descriptor tail (id) + */ + +#define DATA_SIZE(data_ring) _DATA_SIZE((data_ring)->size_bits) +#define DATA_SIZE_MASK(data_ring) (DATA_SIZE(data_ring) - 1) + +#define DESCS_COUNT(desc_ring) _DESCS_COUNT((desc_ring)->count_bits) +#define DESCS_COUNT_MASK(desc_ring) (DESCS_COUNT(desc_ring) - 1) + +/* Determine the data array index from a logical position. */ +#define DATA_INDEX(data_ring, lpos) ((lpos) & DATA_SIZE_MASK(data_ring)) + +/* Determine the desc array index from an ID or sequence number. */ +#define DESC_INDEX(desc_ring, n) ((n) & DESCS_COUNT_MASK(desc_ring)) + +/* Determine how many times the data array has wrapped. */ +#define DATA_WRAPS(data_ring, lpos) ((lpos) >> (data_ring)->size_bits) + +/* Determine if a logical position refers to a data-less block. */ +#define LPOS_DATALESS(lpos) ((lpos) & 1UL) +#define BLK_DATALESS(blk) (LPOS_DATALESS((blk)->begin) && \ + LPOS_DATALESS((blk)->next)) + +/* Get the logical position at index 0 of the current wrap. */ +#define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \ +((lpos) & ~DATA_SIZE_MASK(data_ring)) + +/* Get the ID for the same index of the previous wrap as the given ID. */ +#define DESC_ID_PREV_WRAP(desc_ring, id) \ +DESC_ID((id) - DESCS_COUNT(desc_ring)) + +/* + * A data block: mapped directly to the beginning of the data block area + * specified as a logical position within the data ring. + * + * @id: the ID of the associated descriptor + * @data: the writer data + * + * Note that the size of a data block is only known by its associated + * descriptor. + */ +struct prb_data_block { + unsigned long id; + char data[]; +}; + +/* + * Return the descriptor associated with @n. @n can be either a + * descriptor ID or a sequence number. + */ +static struct prb_desc *to_desc(struct prb_desc_ring *desc_ring, u64 n) +{ + return &desc_ring->descs[DESC_INDEX(desc_ring, n)]; +} + +/* + * Return the printk_info associated with @n. @n can be either a + * descriptor ID or a sequence number. + */ +static struct printk_info *to_info(struct prb_desc_ring *desc_ring, u64 n) +{ + return &desc_ring->infos[DESC_INDEX(desc_ring, n)]; +} + +static struct prb_data_block *to_block(struct prb_data_ring *data_ring, + unsigned long begin_lpos) +{ + return (void *)&data_ring->data[DATA_INDEX(data_ring, begin_lpos)]; +} + +/* + * Increase the data size to account for data block meta data plus any + * padding so that the adjacent data block is aligned on the ID size. + */ +static unsigned int to_blk_size(unsigned int size) +{ + struct prb_data_block *db = NULL; + + size += sizeof(*db); + size = ALIGN(size, sizeof(db->id)); + return size; +} + +/* + * Sanity checker for reserve size. The ringbuffer code assumes that a data + * block does not exceed the maximum possible size that could fit within the + * ringbuffer. This function provides that basic size check so that the + * assumption is safe. + */ +static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size) +{ + struct prb_data_block *db = NULL; + + if (size == 0) + return true; + + /* + * Ensure the alignment padded size could possibly fit in the data + * array. The largest possible data block must still leave room for + * at least the ID of the next block. + */ + size = to_blk_size(size); + if (size > DATA_SIZE(data_ring) - sizeof(db->id)) + return false; + + return true; +} + +/* Query the state of a descriptor. */ +static enum desc_state get_desc_state(unsigned long id, + unsigned long state_val) +{ + if (id != DESC_ID(state_val)) + return desc_miss; + + return DESC_STATE(state_val); +} + +/* + * Get a copy of a specified descriptor and return its queried state. If the + * descriptor is in an inconsistent state (miss or reserved), the caller can + * only expect the descriptor's @state_var field to be valid. + * + * The sequence number and caller_id can be optionally retrieved. Like all + * non-state_var data, they are only valid if the descriptor is in a + * consistent state. + */ +static enum desc_state desc_read(struct prb_desc_ring *desc_ring, + unsigned long id, struct prb_desc *desc_out, + u64 *seq_out, u32 *caller_id_out) +{ + struct printk_info *info = to_info(desc_ring, id); + struct prb_desc *desc = to_desc(desc_ring, id); + atomic_long_t *state_var = &desc->state_var; + enum desc_state d_state; + unsigned long state_val; + + /* Check the descriptor state. */ + state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */ + d_state = get_desc_state(id, state_val); + if (d_state == desc_miss || d_state == desc_reserved) { + /* + * The descriptor is in an inconsistent state. Set at least + * @state_var so that the caller can see the details of + * the inconsistent state. + */ + goto out; + } + + /* + * Guarantee the state is loaded before copying the descriptor + * content. This avoids copying obsolete descriptor content that might + * not apply to the descriptor state. This pairs with _prb_commit:B. + * + * Memory barrier involvement: + * + * If desc_read:A reads from _prb_commit:B, then desc_read:C reads + * from _prb_commit:A. + * + * Relies on: + * + * WMB from _prb_commit:A to _prb_commit:B + * matching + * RMB from desc_read:A to desc_read:C + */ + smp_rmb(); /* LMM(desc_read:B) */ + + /* + * Copy the descriptor data. The data is not valid until the + * state has been re-checked. A memcpy() for all of @desc + * cannot be used because of the atomic_t @state_var field. + */ + memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, + sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */ + if (seq_out) + *seq_out = info->seq; /* also part of desc_read:C */ + if (caller_id_out) + *caller_id_out = info->caller_id; /* also part of desc_read:C */ + + /* + * 1. Guarantee the descriptor content is loaded before re-checking + * the state. This avoids reading an obsolete descriptor state + * that may not apply to the copied content. This pairs with + * desc_reserve:F. + * + * Memory barrier involvement: + * + * If desc_read:C reads from desc_reserve:G, then desc_read:E + * reads from desc_reserve:F. + * + * Relies on: + * + * WMB from desc_reserve:F to desc_reserve:G + * matching + * RMB from desc_read:C to desc_read:E + * + * 2. Guarantee the record data is loaded before re-checking the + * state. This avoids reading an obsolete descriptor state that may + * not apply to the copied data. This pairs with data_alloc:A and + * data_realloc:A. + * + * Memory barrier involvement: + * + * If copy_data:A reads from data_alloc:B, then desc_read:E + * reads from desc_make_reusable:A. + * + * Relies on: + * + * MB from desc_make_reusable:A to data_alloc:B + * matching + * RMB from desc_read:C to desc_read:E + * + * Note: desc_make_reusable:A and data_alloc:B can be different + * CPUs. However, the data_alloc:B CPU (which performs the + * full memory barrier) must have previously seen + * desc_make_reusable:A. + */ + smp_rmb(); /* LMM(desc_read:D) */ + + /* + * The data has been copied. Return the current descriptor state, + * which may have changed since the load above. + */ + state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */ + d_state = get_desc_state(id, state_val); +out: + atomic_long_set(&desc_out->state_var, state_val); + return d_state; +} + +/* + * Take a specified descriptor out of the finalized state by attempting + * the transition from finalized to reusable. Either this context or some + * other context will have been successful. + */ +static void desc_make_reusable(struct prb_desc_ring *desc_ring, + unsigned long id) +{ + unsigned long val_finalized = DESC_SV(id, desc_finalized); + unsigned long val_reusable = DESC_SV(id, desc_reusable); + struct prb_desc *desc = to_desc(desc_ring, id); + atomic_long_t *state_var = &desc->state_var; + + atomic_long_cmpxchg_relaxed(state_var, val_finalized, + val_reusable); /* LMM(desc_make_reusable:A) */ +} + +/* + * Given the text data ring, put the associated descriptor of each + * data block from @lpos_begin until @lpos_end into the reusable state. + * + * If there is any problem making the associated descriptor reusable, either + * the descriptor has not yet been finalized or another writer context has + * already pushed the tail lpos past the problematic data block. Regardless, + * on error the caller can re-load the tail lpos to determine the situation. + */ +static bool data_make_reusable(struct printk_ringbuffer *rb, + struct prb_data_ring *data_ring, + unsigned long lpos_begin, + unsigned long lpos_end, + unsigned long *lpos_out) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct prb_data_block *blk; + enum desc_state d_state; + struct prb_desc desc; + struct prb_data_blk_lpos *blk_lpos = &desc.text_blk_lpos; + unsigned long id; + + /* Loop until @lpos_begin has advanced to or beyond @lpos_end. */ + while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) { + blk = to_block(data_ring, lpos_begin); + + /* + * Load the block ID from the data block. This is a data race + * against a writer that may have newly reserved this data + * area. If the loaded value matches a valid descriptor ID, + * the blk_lpos of that descriptor will be checked to make + * sure it points back to this data block. If the check fails, + * the data area has been recycled by another writer. + */ + id = blk->id; /* LMM(data_make_reusable:A) */ + + d_state = desc_read(desc_ring, id, &desc, + NULL, NULL); /* LMM(data_make_reusable:B) */ + + switch (d_state) { + case desc_miss: + case desc_reserved: + case desc_committed: + return false; + case desc_finalized: + /* + * This data block is invalid if the descriptor + * does not point back to it. + */ + if (blk_lpos->begin != lpos_begin) + return false; + desc_make_reusable(desc_ring, id); + break; + case desc_reusable: + /* + * This data block is invalid if the descriptor + * does not point back to it. + */ + if (blk_lpos->begin != lpos_begin) + return false; + break; + } + + /* Advance @lpos_begin to the next data block. */ + lpos_begin = blk_lpos->next; + } + + *lpos_out = lpos_begin; + return true; +} + +/* + * Advance the data ring tail to at least @lpos. This function puts + * descriptors into the reusable state if the tail is pushed beyond + * their associated data block. + */ +static bool data_push_tail(struct printk_ringbuffer *rb, + struct prb_data_ring *data_ring, + unsigned long lpos) +{ + unsigned long tail_lpos_new; + unsigned long tail_lpos; + unsigned long next_lpos; + + /* If @lpos is from a data-less block, there is nothing to do. */ + if (LPOS_DATALESS(lpos)) + return true; + + /* + * Any descriptor states that have transitioned to reusable due to the + * data tail being pushed to this loaded value will be visible to this + * CPU. This pairs with data_push_tail:D. + * + * Memory barrier involvement: + * + * If data_push_tail:A reads from data_push_tail:D, then this CPU can + * see desc_make_reusable:A. + * + * Relies on: + * + * MB from desc_make_reusable:A to data_push_tail:D + * matches + * READFROM from data_push_tail:D to data_push_tail:A + * thus + * READFROM from desc_make_reusable:A to this CPU + */ + tail_lpos = atomic_long_read(&data_ring->tail_lpos); /* LMM(data_push_tail:A) */ + + /* + * Loop until the tail lpos is at or beyond @lpos. This condition + * may already be satisfied, resulting in no full memory barrier + * from data_push_tail:D being performed. However, since this CPU + * sees the new tail lpos, any descriptor states that transitioned to + * the reusable state must already be visible. + */ + while ((lpos - tail_lpos) - 1 < DATA_SIZE(data_ring)) { + /* + * Make all descriptors reusable that are associated with + * data blocks before @lpos. + */ + if (!data_make_reusable(rb, data_ring, tail_lpos, lpos, + &next_lpos)) { + /* + * 1. Guarantee the block ID loaded in + * data_make_reusable() is performed before + * reloading the tail lpos. The failed + * data_make_reusable() may be due to a newly + * recycled data area causing the tail lpos to + * have been previously pushed. This pairs with + * data_alloc:A and data_realloc:A. + * + * Memory barrier involvement: + * + * If data_make_reusable:A reads from data_alloc:B, + * then data_push_tail:C reads from + * data_push_tail:D. + * + * Relies on: + * + * MB from data_push_tail:D to data_alloc:B + * matching + * RMB from data_make_reusable:A to + * data_push_tail:C + * + * Note: data_push_tail:D and data_alloc:B can be + * different CPUs. However, the data_alloc:B + * CPU (which performs the full memory + * barrier) must have previously seen + * data_push_tail:D. + * + * 2. Guarantee the descriptor state loaded in + * data_make_reusable() is performed before + * reloading the tail lpos. The failed + * data_make_reusable() may be due to a newly + * recycled descriptor causing the tail lpos to + * have been previously pushed. This pairs with + * desc_reserve:D. + * + * Memory barrier involvement: + * + * If data_make_reusable:B reads from + * desc_reserve:F, then data_push_tail:C reads + * from data_push_tail:D. + * + * Relies on: + * + * MB from data_push_tail:D to desc_reserve:F + * matching + * RMB from data_make_reusable:B to + * data_push_tail:C + * + * Note: data_push_tail:D and desc_reserve:F can + * be different CPUs. However, the + * desc_reserve:F CPU (which performs the + * full memory barrier) must have previously + * seen data_push_tail:D. + */ + smp_rmb(); /* LMM(data_push_tail:B) */ + + tail_lpos_new = atomic_long_read(&data_ring->tail_lpos + ); /* LMM(data_push_tail:C) */ + if (tail_lpos_new == tail_lpos) + return false; + + /* Another CPU pushed the tail. Try again. */ + tail_lpos = tail_lpos_new; + continue; + } + + /* + * Guarantee any descriptor states that have transitioned to + * reusable are stored before pushing the tail lpos. A full + * memory barrier is needed since other CPUs may have made + * the descriptor states reusable. This pairs with + * data_push_tail:A. + */ + if (atomic_long_try_cmpxchg(&data_ring->tail_lpos, &tail_lpos, + next_lpos)) { /* LMM(data_push_tail:D) */ + break; + } + } + + return true; +} + +/* + * Advance the desc ring tail. This function advances the tail by one + * descriptor, thus invalidating the oldest descriptor. Before advancing + * the tail, the tail descriptor is made reusable and all data blocks up to + * and including the descriptor's data block are invalidated (i.e. the data + * ring tail is pushed past the data block of the descriptor being made + * reusable). + */ +static bool desc_push_tail(struct printk_ringbuffer *rb, + unsigned long tail_id) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + enum desc_state d_state; + struct prb_desc desc; + + d_state = desc_read(desc_ring, tail_id, &desc, NULL, NULL); + + switch (d_state) { + case desc_miss: + /* + * If the ID is exactly 1 wrap behind the expected, it is + * in the process of being reserved by another writer and + * must be considered reserved. + */ + if (DESC_ID(atomic_long_read(&desc.state_var)) == + DESC_ID_PREV_WRAP(desc_ring, tail_id)) { + return false; + } + + /* + * The ID has changed. Another writer must have pushed the + * tail and recycled the descriptor already. Success is + * returned because the caller is only interested in the + * specified tail being pushed, which it was. + */ + return true; + case desc_reserved: + case desc_committed: + return false; + case desc_finalized: + desc_make_reusable(desc_ring, tail_id); + break; + case desc_reusable: + break; + } + + /* + * Data blocks must be invalidated before their associated + * descriptor can be made available for recycling. Invalidating + * them later is not possible because there is no way to trust + * data blocks once their associated descriptor is gone. + */ + + if (!data_push_tail(rb, &rb->text_data_ring, desc.text_blk_lpos.next)) + return false; + + /* + * Check the next descriptor after @tail_id before pushing the tail + * to it because the tail must always be in a finalized or reusable + * state. The implementation of prb_first_seq() relies on this. + * + * A successful read implies that the next descriptor is less than or + * equal to @head_id so there is no risk of pushing the tail past the + * head. + */ + d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc, + NULL, NULL); /* LMM(desc_push_tail:A) */ + + if (d_state == desc_finalized || d_state == desc_reusable) { + /* + * Guarantee any descriptor states that have transitioned to + * reusable are stored before pushing the tail ID. This allows + * verifying the recycled descriptor state. A full memory + * barrier is needed since other CPUs may have made the + * descriptor states reusable. This pairs with desc_reserve:D. + */ + atomic_long_cmpxchg(&desc_ring->tail_id, tail_id, + DESC_ID(tail_id + 1)); /* LMM(desc_push_tail:B) */ + } else { + /* + * Guarantee the last state load from desc_read() is before + * reloading @tail_id in order to see a new tail ID in the + * case that the descriptor has been recycled. This pairs + * with desc_reserve:D. + * + * Memory barrier involvement: + * + * If desc_push_tail:A reads from desc_reserve:F, then + * desc_push_tail:D reads from desc_push_tail:B. + * + * Relies on: + * + * MB from desc_push_tail:B to desc_reserve:F + * matching + * RMB from desc_push_tail:A to desc_push_tail:D + * + * Note: desc_push_tail:B and desc_reserve:F can be different + * CPUs. However, the desc_reserve:F CPU (which performs + * the full memory barrier) must have previously seen + * desc_push_tail:B. + */ + smp_rmb(); /* LMM(desc_push_tail:C) */ + + /* + * Re-check the tail ID. The descriptor following @tail_id is + * not in an allowed tail state. But if the tail has since + * been moved by another CPU, then it does not matter. + */ + if (atomic_long_read(&desc_ring->tail_id) == tail_id) /* LMM(desc_push_tail:D) */ + return false; + } + + return true; +} + +/* Reserve a new descriptor, invalidating the oldest if necessary. */ +static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + unsigned long prev_state_val; + unsigned long id_prev_wrap; + struct prb_desc *desc; + unsigned long head_id; + unsigned long id; + + head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */ + + do { + desc = to_desc(desc_ring, head_id); + + id = DESC_ID(head_id + 1); + id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id); + + /* + * Guarantee the head ID is read before reading the tail ID. + * Since the tail ID is updated before the head ID, this + * guarantees that @id_prev_wrap is never ahead of the tail + * ID. This pairs with desc_reserve:D. + * + * Memory barrier involvement: + * + * If desc_reserve:A reads from desc_reserve:D, then + * desc_reserve:C reads from desc_push_tail:B. + * + * Relies on: + * + * MB from desc_push_tail:B to desc_reserve:D + * matching + * RMB from desc_reserve:A to desc_reserve:C + * + * Note: desc_push_tail:B and desc_reserve:D can be different + * CPUs. However, the desc_reserve:D CPU (which performs + * the full memory barrier) must have previously seen + * desc_push_tail:B. + */ + smp_rmb(); /* LMM(desc_reserve:B) */ + + if (id_prev_wrap == atomic_long_read(&desc_ring->tail_id + )) { /* LMM(desc_reserve:C) */ + /* + * Make space for the new descriptor by + * advancing the tail. + */ + if (!desc_push_tail(rb, id_prev_wrap)) + return false; + } + + /* + * 1. Guarantee the tail ID is read before validating the + * recycled descriptor state. A read memory barrier is + * sufficient for this. This pairs with desc_push_tail:B. + * + * Memory barrier involvement: + * + * If desc_reserve:C reads from desc_push_tail:B, then + * desc_reserve:E reads from desc_make_reusable:A. + * + * Relies on: + * + * MB from desc_make_reusable:A to desc_push_tail:B + * matching + * RMB from desc_reserve:C to desc_reserve:E + * + * Note: desc_make_reusable:A and desc_push_tail:B can be + * different CPUs. However, the desc_push_tail:B CPU + * (which performs the full memory barrier) must have + * previously seen desc_make_reusable:A. + * + * 2. Guarantee the tail ID is stored before storing the head + * ID. This pairs with desc_reserve:B. + * + * 3. Guarantee any data ring tail changes are stored before + * recycling the descriptor. Data ring tail changes can + * happen via desc_push_tail()->data_push_tail(). A full + * memory barrier is needed since another CPU may have + * pushed the data ring tails. This pairs with + * data_push_tail:B. + * + * 4. Guarantee a new tail ID is stored before recycling the + * descriptor. A full memory barrier is needed since + * another CPU may have pushed the tail ID. This pairs + * with desc_push_tail:C and this also pairs with + * prb_first_seq:C. + * + * 5. Guarantee the head ID is stored before trying to + * finalize the previous descriptor. This pairs with + * _prb_commit:B. + */ + } while (!atomic_long_try_cmpxchg(&desc_ring->head_id, &head_id, + id)); /* LMM(desc_reserve:D) */ + + desc = to_desc(desc_ring, id); + + /* + * If the descriptor has been recycled, verify the old state val. + * See "ABA Issues" about why this verification is performed. + */ + prev_state_val = atomic_long_read(&desc->state_var); /* LMM(desc_reserve:E) */ + if (prev_state_val && + get_desc_state(id_prev_wrap, prev_state_val) != desc_reusable) { + WARN_ON_ONCE(1); + return false; + } + + /* + * Assign the descriptor a new ID and set its state to reserved. + * See "ABA Issues" about why cmpxchg() instead of set() is used. + * + * Guarantee the new descriptor ID and state is stored before making + * any other changes. A write memory barrier is sufficient for this. + * This pairs with desc_read:D. + */ + if (!atomic_long_try_cmpxchg(&desc->state_var, &prev_state_val, + DESC_SV(id, desc_reserved))) { /* LMM(desc_reserve:F) */ + WARN_ON_ONCE(1); + return false; + } + + /* Now data in @desc can be modified: LMM(desc_reserve:G) */ + + *id_out = id; + return true; +} + +/* Determine the end of a data block. */ +static unsigned long get_next_lpos(struct prb_data_ring *data_ring, + unsigned long lpos, unsigned int size) +{ + unsigned long begin_lpos; + unsigned long next_lpos; + + begin_lpos = lpos; + next_lpos = lpos + size; + + /* First check if the data block does not wrap. */ + if (DATA_WRAPS(data_ring, begin_lpos) == DATA_WRAPS(data_ring, next_lpos)) + return next_lpos; + + /* Wrapping data blocks store their data at the beginning. */ + return (DATA_THIS_WRAP_START_LPOS(data_ring, next_lpos) + size); +} + +/* + * Allocate a new data block, invalidating the oldest data block(s) + * if necessary. This function also associates the data block with + * a specified descriptor. + */ +static char *data_alloc(struct printk_ringbuffer *rb, + struct prb_data_ring *data_ring, unsigned int size, + struct prb_data_blk_lpos *blk_lpos, unsigned long id) +{ + struct prb_data_block *blk; + unsigned long begin_lpos; + unsigned long next_lpos; + + if (size == 0) { + /* Specify a data-less block. */ + blk_lpos->begin = NO_LPOS; + blk_lpos->next = NO_LPOS; + return NULL; + } + + size = to_blk_size(size); + + begin_lpos = atomic_long_read(&data_ring->head_lpos); + + do { + next_lpos = get_next_lpos(data_ring, begin_lpos, size); + + if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring))) { + /* Failed to allocate, specify a data-less block. */ + blk_lpos->begin = FAILED_LPOS; + blk_lpos->next = FAILED_LPOS; + return NULL; + } + + /* + * 1. Guarantee any descriptor states that have transitioned + * to reusable are stored before modifying the newly + * allocated data area. A full memory barrier is needed + * since other CPUs may have made the descriptor states + * reusable. See data_push_tail:A about why the reusable + * states are visible. This pairs with desc_read:D. + * + * 2. Guarantee any updated tail lpos is stored before + * modifying the newly allocated data area. Another CPU may + * be in data_make_reusable() and is reading a block ID + * from this area. data_make_reusable() can handle reading + * a garbage block ID value, but then it must be able to + * load a new tail lpos. A full memory barrier is needed + * since other CPUs may have updated the tail lpos. This + * pairs with data_push_tail:B. + */ + } while (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &begin_lpos, + next_lpos)); /* LMM(data_alloc:A) */ + + blk = to_block(data_ring, begin_lpos); + blk->id = id; /* LMM(data_alloc:B) */ + + if (DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos)) { + /* Wrapping data blocks store their data at the beginning. */ + blk = to_block(data_ring, 0); + + /* + * Store the ID on the wrapped block for consistency. + * The printk_ringbuffer does not actually use it. + */ + blk->id = id; + } + + blk_lpos->begin = begin_lpos; + blk_lpos->next = next_lpos; + + return &blk->data[0]; +} + +/* + * Try to resize an existing data block associated with the descriptor + * specified by @id. If the resized data block should become wrapped, it + * copies the old data to the new data block. If @size yields a data block + * with the same or less size, the data block is left as is. + * + * Fail if this is not the last allocated data block or if there is not + * enough space or it is not possible make enough space. + * + * Return a pointer to the beginning of the entire data buffer or NULL on + * failure. + */ +static char *data_realloc(struct printk_ringbuffer *rb, + struct prb_data_ring *data_ring, unsigned int size, + struct prb_data_blk_lpos *blk_lpos, unsigned long id) +{ + struct prb_data_block *blk; + unsigned long head_lpos; + unsigned long next_lpos; + bool wrapped; + + /* Reallocation only works if @blk_lpos is the newest data block. */ + head_lpos = atomic_long_read(&data_ring->head_lpos); + if (head_lpos != blk_lpos->next) + return NULL; + + /* Keep track if @blk_lpos was a wrapping data block. */ + wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next)); + + size = to_blk_size(size); + + next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size); + + /* If the data block does not increase, there is nothing to do. */ + if (head_lpos - next_lpos < DATA_SIZE(data_ring)) { + if (wrapped) + blk = to_block(data_ring, 0); + else + blk = to_block(data_ring, blk_lpos->begin); + return &blk->data[0]; + } + + if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring))) + return NULL; + + /* The memory barrier involvement is the same as data_alloc:A. */ + if (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &head_lpos, + next_lpos)) { /* LMM(data_realloc:A) */ + return NULL; + } + + blk = to_block(data_ring, blk_lpos->begin); + + if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, next_lpos)) { + struct prb_data_block *old_blk = blk; + + /* Wrapping data blocks store their data at the beginning. */ + blk = to_block(data_ring, 0); + + /* + * Store the ID on the wrapped block for consistency. + * The printk_ringbuffer does not actually use it. + */ + blk->id = id; + + if (!wrapped) { + /* + * Since the allocated space is now in the newly + * created wrapping data block, copy the content + * from the old data block. + */ + memcpy(&blk->data[0], &old_blk->data[0], + (blk_lpos->next - blk_lpos->begin) - sizeof(blk->id)); + } + } + + blk_lpos->next = next_lpos; + + return &blk->data[0]; +} + +/* Return the number of bytes used by a data block. */ +static unsigned int space_used(struct prb_data_ring *data_ring, + struct prb_data_blk_lpos *blk_lpos) +{ + /* Data-less blocks take no space. */ + if (BLK_DATALESS(blk_lpos)) + return 0; + + if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) { + /* Data block does not wrap. */ + return (DATA_INDEX(data_ring, blk_lpos->next) - + DATA_INDEX(data_ring, blk_lpos->begin)); + } + + /* + * For wrapping data blocks, the trailing (wasted) space is + * also counted. + */ + return (DATA_INDEX(data_ring, blk_lpos->next) + + DATA_SIZE(data_ring) - DATA_INDEX(data_ring, blk_lpos->begin)); +} + +/* + * Given @blk_lpos, return a pointer to the writer data from the data block + * and calculate the size of the data part. A NULL pointer is returned if + * @blk_lpos specifies values that could never be legal. + * + * This function (used by readers) performs strict validation on the lpos + * values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is + * triggered if an internal error is detected. + */ +static const char *get_data(struct prb_data_ring *data_ring, + struct prb_data_blk_lpos *blk_lpos, + unsigned int *data_size) +{ + struct prb_data_block *db; + + /* Data-less data block description. */ + if (BLK_DATALESS(blk_lpos)) { + if (blk_lpos->begin == NO_LPOS && blk_lpos->next == NO_LPOS) { + *data_size = 0; + return ""; + } + return NULL; + } + + /* Regular data block: @begin less than @next and in same wrap. */ + if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) && + blk_lpos->begin < blk_lpos->next) { + db = to_block(data_ring, blk_lpos->begin); + *data_size = blk_lpos->next - blk_lpos->begin; + + /* Wrapping data block: @begin is one wrap behind @next. */ + } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) == + DATA_WRAPS(data_ring, blk_lpos->next)) { + db = to_block(data_ring, 0); + *data_size = DATA_INDEX(data_ring, blk_lpos->next); + + /* Illegal block description. */ + } else { + WARN_ON_ONCE(1); + return NULL; + } + + /* A valid data block will always be aligned to the ID size. */ + if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) || + WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) { + return NULL; + } + + /* A valid data block will always have at least an ID. */ + if (WARN_ON_ONCE(*data_size < sizeof(db->id))) + return NULL; + + /* Subtract block ID space from size to reflect data size. */ + *data_size -= sizeof(db->id); + + return &db->data[0]; +} + +/* + * Attempt to transition the newest descriptor from committed back to reserved + * so that the record can be modified by a writer again. This is only possible + * if the descriptor is not yet finalized and the provided @caller_id matches. + */ +static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, + u32 caller_id, unsigned long *id_out) +{ + unsigned long prev_state_val; + enum desc_state d_state; + struct prb_desc desc; + struct prb_desc *d; + unsigned long id; + u32 cid; + + id = atomic_long_read(&desc_ring->head_id); + + /* + * To reduce unnecessarily reopening, first check if the descriptor + * state and caller ID are correct. + */ + d_state = desc_read(desc_ring, id, &desc, NULL, &cid); + if (d_state != desc_committed || cid != caller_id) + return NULL; + + d = to_desc(desc_ring, id); + + prev_state_val = DESC_SV(id, desc_committed); + + /* + * Guarantee the reserved state is stored before reading any + * record data. A full memory barrier is needed because @state_var + * modification is followed by reading. This pairs with _prb_commit:B. + * + * Memory barrier involvement: + * + * If desc_reopen_last:A reads from _prb_commit:B, then + * prb_reserve_in_last:A reads from _prb_commit:A. + * + * Relies on: + * + * WMB from _prb_commit:A to _prb_commit:B + * matching + * MB If desc_reopen_last:A to prb_reserve_in_last:A + */ + if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, + DESC_SV(id, desc_reserved))) { /* LMM(desc_reopen_last:A) */ + return NULL; + } + + *id_out = id; + return d; +} + +/** + * prb_reserve_in_last() - Re-reserve and extend the space in the ringbuffer + * used by the newest record. + * + * @e: The entry structure to setup. + * @rb: The ringbuffer to re-reserve and extend data in. + * @r: The record structure to allocate buffers for. + * @caller_id: The caller ID of the caller (reserving writer). + * @max_size: Fail if the extended size would be greater than this. + * + * This is the public function available to writers to re-reserve and extend + * data. + * + * The writer specifies the text size to extend (not the new total size) by + * setting the @text_buf_size field of @r. To ensure proper initialization + * of @r, prb_rec_init_wr() should be used. + * + * This function will fail if @caller_id does not match the caller ID of the + * newest record. In that case the caller must reserve new data using + * prb_reserve(). + * + * Context: Any context. Disables local interrupts on success. + * Return: true if text data could be extended, otherwise false. + * + * On success: + * + * - @r->text_buf points to the beginning of the entire text buffer. + * + * - @r->text_buf_size is set to the new total size of the buffer. + * + * - @r->info is not touched so that @r->info->text_len could be used + * to append the text. + * + * - prb_record_text_space() can be used on @e to query the new + * actually used space. + * + * Important: All @r->info fields will already be set with the current values + * for the record. I.e. @r->info->text_len will be less than + * @text_buf_size. Writers can use @r->info->text_len to know + * where concatenation begins and writers should update + * @r->info->text_len after concatenating. + */ +bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, + struct printk_record *r, u32 caller_id, unsigned int max_size) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct printk_info *info; + unsigned int data_size; + struct prb_desc *d; + unsigned long id; + + local_irq_save(e->irqflags); + + /* Transition the newest descriptor back to the reserved state. */ + d = desc_reopen_last(desc_ring, caller_id, &id); + if (!d) { + local_irq_restore(e->irqflags); + goto fail_reopen; + } + + /* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */ + + info = to_info(desc_ring, id); + + /* + * Set the @e fields here so that prb_commit() can be used if + * anything fails from now on. + */ + e->rb = rb; + e->id = id; + + /* + * desc_reopen_last() checked the caller_id, but there was no + * exclusive access at that point. The descriptor may have + * changed since then. + */ + if (caller_id != info->caller_id) + goto fail; + + if (BLK_DATALESS(&d->text_blk_lpos)) { + if (WARN_ON_ONCE(info->text_len != 0)) { + pr_warn_once("wrong text_len value (%hu, expecting 0)\n", + info->text_len); + info->text_len = 0; + } + + if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) + goto fail; + + if (r->text_buf_size > max_size) + goto fail; + + r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size, + &d->text_blk_lpos, id); + } else { + if (!get_data(&rb->text_data_ring, &d->text_blk_lpos, &data_size)) + goto fail; + + /* + * Increase the buffer size to include the original size. If + * the meta data (@text_len) is not sane, use the full data + * block size. + */ + if (WARN_ON_ONCE(info->text_len > data_size)) { + pr_warn_once("wrong text_len value (%hu, expecting <=%u)\n", + info->text_len, data_size); + info->text_len = data_size; + } + r->text_buf_size += info->text_len; + + if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) + goto fail; + + if (r->text_buf_size > max_size) + goto fail; + + r->text_buf = data_realloc(rb, &rb->text_data_ring, r->text_buf_size, + &d->text_blk_lpos, id); + } + if (r->text_buf_size && !r->text_buf) + goto fail; + + r->info = info; + + e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); + + return true; +fail: + prb_commit(e); + /* prb_commit() re-enabled interrupts. */ +fail_reopen: + /* Make it clear to the caller that the re-reserve failed. */ + memset(r, 0, sizeof(*r)); + return false; +} + +/* + * Attempt to finalize a specified descriptor. If this fails, the descriptor + * is either already final or it will finalize itself when the writer commits. + */ +static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id) +{ + unsigned long prev_state_val = DESC_SV(id, desc_committed); + struct prb_desc *d = to_desc(desc_ring, id); + + atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val, + DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */ +} + +/** + * prb_reserve() - Reserve space in the ringbuffer. + * + * @e: The entry structure to setup. + * @rb: The ringbuffer to reserve data in. + * @r: The record structure to allocate buffers for. + * + * This is the public function available to writers to reserve data. + * + * The writer specifies the text size to reserve by setting the + * @text_buf_size field of @r. To ensure proper initialization of @r, + * prb_rec_init_wr() should be used. + * + * Context: Any context. Disables local interrupts on success. + * Return: true if at least text data could be allocated, otherwise false. + * + * On success, the fields @info and @text_buf of @r will be set by this + * function and should be filled in by the writer before committing. Also + * on success, prb_record_text_space() can be used on @e to query the actual + * space used for the text data block. + * + * Important: @info->text_len needs to be set correctly by the writer in + * order for data to be readable and/or extended. Its value + * is initialized to 0. + */ +bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, + struct printk_record *r) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct printk_info *info; + struct prb_desc *d; + unsigned long id; + u64 seq; + + if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) + goto fail; + + /* + * Descriptors in the reserved state act as blockers to all further + * reservations once the desc_ring has fully wrapped. Disable + * interrupts during the reserve/commit window in order to minimize + * the likelihood of this happening. + */ + local_irq_save(e->irqflags); + + if (!desc_reserve(rb, &id)) { + /* Descriptor reservation failures are tracked. */ + atomic_long_inc(&rb->fail); + local_irq_restore(e->irqflags); + goto fail; + } + + d = to_desc(desc_ring, id); + info = to_info(desc_ring, id); + + /* + * All @info fields (except @seq) are cleared and must be filled in + * by the writer. Save @seq before clearing because it is used to + * determine the new sequence number. + */ + seq = info->seq; + memset(info, 0, sizeof(*info)); + + /* + * Set the @e fields here so that prb_commit() can be used if + * text data allocation fails. + */ + e->rb = rb; + e->id = id; + + /* + * Initialize the sequence number if it has "never been set". + * Otherwise just increment it by a full wrap. + * + * @seq is considered "never been set" if it has a value of 0, + * _except_ for @infos[0], which was specially setup by the ringbuffer + * initializer and therefore is always considered as set. + * + * See the "Bootstrap" comment block in printk_ringbuffer.h for + * details about how the initializer bootstraps the descriptors. + */ + if (seq == 0 && DESC_INDEX(desc_ring, id) != 0) + info->seq = DESC_INDEX(desc_ring, id); + else + info->seq = seq + DESCS_COUNT(desc_ring); + + /* + * New data is about to be reserved. Once that happens, previous + * descriptors are no longer able to be extended. Finalize the + * previous descriptor now so that it can be made available to + * readers. (For seq==0 there is no previous descriptor.) + */ + if (info->seq > 0) + desc_make_final(desc_ring, DESC_ID(id - 1)); + + r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size, + &d->text_blk_lpos, id); + /* If text data allocation fails, a data-less record is committed. */ + if (r->text_buf_size && !r->text_buf) { + prb_commit(e); + /* prb_commit() re-enabled interrupts. */ + goto fail; + } + + r->info = info; + + /* Record full text space used by record. */ + e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); + + return true; +fail: + /* Make it clear to the caller that the reserve failed. */ + memset(r, 0, sizeof(*r)); + return false; +} + +/* Commit the data (possibly finalizing it) and restore interrupts. */ +static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val) +{ + struct prb_desc_ring *desc_ring = &e->rb->desc_ring; + struct prb_desc *d = to_desc(desc_ring, e->id); + unsigned long prev_state_val = DESC_SV(e->id, desc_reserved); + + /* Now the writer has finished all writing: LMM(_prb_commit:A) */ + + /* + * Set the descriptor as committed. See "ABA Issues" about why + * cmpxchg() instead of set() is used. + * + * 1 Guarantee all record data is stored before the descriptor state + * is stored as committed. A write memory barrier is sufficient + * for this. This pairs with desc_read:B and desc_reopen_last:A. + * + * 2. Guarantee the descriptor state is stored as committed before + * re-checking the head ID in order to possibly finalize this + * descriptor. This pairs with desc_reserve:D. + * + * Memory barrier involvement: + * + * If prb_commit:A reads from desc_reserve:D, then + * desc_make_final:A reads from _prb_commit:B. + * + * Relies on: + * + * MB _prb_commit:B to prb_commit:A + * matching + * MB desc_reserve:D to desc_make_final:A + */ + if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, + DESC_SV(e->id, state_val))) { /* LMM(_prb_commit:B) */ + WARN_ON_ONCE(1); + } + + /* Restore interrupts, the reserve/commit window is finished. */ + local_irq_restore(e->irqflags); +} + +/** + * prb_commit() - Commit (previously reserved) data to the ringbuffer. + * + * @e: The entry containing the reserved data information. + * + * This is the public function available to writers to commit data. + * + * Note that the data is not yet available to readers until it is finalized. + * Finalizing happens automatically when space for the next record is + * reserved. + * + * See prb_final_commit() for a version of this function that finalizes + * immediately. + * + * Context: Any context. Enables local interrupts. + */ +void prb_commit(struct prb_reserved_entry *e) +{ + struct prb_desc_ring *desc_ring = &e->rb->desc_ring; + unsigned long head_id; + + _prb_commit(e, desc_committed); + + /* + * If this descriptor is no longer the head (i.e. a new record has + * been allocated), extending the data for this record is no longer + * allowed and therefore it must be finalized. + */ + head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */ + if (head_id != e->id) + desc_make_final(desc_ring, e->id); +} + +/** + * prb_final_commit() - Commit and finalize (previously reserved) data to + * the ringbuffer. + * + * @e: The entry containing the reserved data information. + * + * This is the public function available to writers to commit+finalize data. + * + * By finalizing, the data is made immediately available to readers. + * + * This function should only be used if there are no intentions of extending + * this data using prb_reserve_in_last(). + * + * Context: Any context. Enables local interrupts. + */ +void prb_final_commit(struct prb_reserved_entry *e) +{ + _prb_commit(e, desc_finalized); +} + +/* + * Count the number of lines in provided text. All text has at least 1 line + * (even if @text_size is 0). Each '\n' processed is counted as an additional + * line. + */ +static unsigned int count_lines(const char *text, unsigned int text_size) +{ + unsigned int next_size = text_size; + unsigned int line_count = 1; + const char *next = text; + + while (next_size) { + next = memchr(next, '\n', next_size); + if (!next) + break; + line_count++; + next++; + next_size = text_size - (next - text); + } + + return line_count; +} + +/* + * Given @blk_lpos, copy an expected @len of data into the provided buffer. + * If @line_count is provided, count the number of lines in the data. + * + * This function (used by readers) performs strict validation on the data + * size to possibly detect bugs in the writer code. A WARN_ON_ONCE() is + * triggered if an internal error is detected. + */ +static bool copy_data(struct prb_data_ring *data_ring, + struct prb_data_blk_lpos *blk_lpos, u16 len, char *buf, + unsigned int buf_size, unsigned int *line_count) +{ + unsigned int data_size; + const char *data; + + /* Caller might not want any data. */ + if ((!buf || !buf_size) && !line_count) + return true; + + data = get_data(data_ring, blk_lpos, &data_size); + if (!data) + return false; + + /* + * Actual cannot be less than expected. It can be more than expected + * because of the trailing alignment padding. + * + * Note that invalid @len values can occur because the caller loads + * the value during an allowed data race. + */ + if (data_size < (unsigned int)len) + return false; + + /* Caller interested in the line count? */ + if (line_count) + *line_count = count_lines(data, data_size); + + /* Caller interested in the data content? */ + if (!buf || !buf_size) + return true; + + data_size = min_t(u16, buf_size, len); + + memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */ + return true; +} + +/* + * This is an extended version of desc_read(). It gets a copy of a specified + * descriptor. However, it also verifies that the record is finalized and has + * the sequence number @seq. On success, 0 is returned. + * + * Error return values: + * -EINVAL: A finalized record with sequence number @seq does not exist. + * -ENOENT: A finalized record with sequence number @seq exists, but its data + * is not available. This is a valid record, so readers should + * continue with the next record. + */ +static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring, + unsigned long id, u64 seq, + struct prb_desc *desc_out) +{ + struct prb_data_blk_lpos *blk_lpos = &desc_out->text_blk_lpos; + enum desc_state d_state; + u64 s; + + d_state = desc_read(desc_ring, id, desc_out, &s, NULL); + + /* + * An unexpected @id (desc_miss) or @seq mismatch means the record + * does not exist. A descriptor in the reserved or committed state + * means the record does not yet exist for the reader. + */ + if (d_state == desc_miss || + d_state == desc_reserved || + d_state == desc_committed || + s != seq) { + return -EINVAL; + } + + /* + * A descriptor in the reusable state may no longer have its data + * available; report it as existing but with lost data. Or the record + * may actually be a record with lost data. + */ + if (d_state == desc_reusable || + (blk_lpos->begin == FAILED_LPOS && blk_lpos->next == FAILED_LPOS)) { + return -ENOENT; + } + + return 0; +} + +/* + * Copy the ringbuffer data from the record with @seq to the provided + * @r buffer. On success, 0 is returned. + * + * See desc_read_finalized_seq() for error return values. + */ +static int prb_read(struct printk_ringbuffer *rb, u64 seq, + struct printk_record *r, unsigned int *line_count) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct printk_info *info = to_info(desc_ring, seq); + struct prb_desc *rdesc = to_desc(desc_ring, seq); + atomic_long_t *state_var = &rdesc->state_var; + struct prb_desc desc; + unsigned long id; + int err; + + /* Extract the ID, used to specify the descriptor to read. */ + id = DESC_ID(atomic_long_read(state_var)); + + /* Get a local copy of the correct descriptor (if available). */ + err = desc_read_finalized_seq(desc_ring, id, seq, &desc); + + /* + * If @r is NULL, the caller is only interested in the availability + * of the record. + */ + if (err || !r) + return err; + + /* If requested, copy meta data. */ + if (r->info) + memcpy(r->info, info, sizeof(*(r->info))); + + /* Copy text data. If it fails, this is a data-less record. */ + if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, info->text_len, + r->text_buf, r->text_buf_size, line_count)) { + return -ENOENT; + } + + /* Ensure the record is still finalized and has the same @seq. */ + return desc_read_finalized_seq(desc_ring, id, seq, &desc); +} + +/* Get the sequence number of the tail descriptor. */ +static u64 prb_first_seq(struct printk_ringbuffer *rb) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + enum desc_state d_state; + struct prb_desc desc; + unsigned long id; + u64 seq; + + for (;;) { + id = atomic_long_read(&rb->desc_ring.tail_id); /* LMM(prb_first_seq:A) */ + + d_state = desc_read(desc_ring, id, &desc, &seq, NULL); /* LMM(prb_first_seq:B) */ + + /* + * This loop will not be infinite because the tail is + * _always_ in the finalized or reusable state. + */ + if (d_state == desc_finalized || d_state == desc_reusable) + break; + + /* + * Guarantee the last state load from desc_read() is before + * reloading @tail_id in order to see a new tail in the case + * that the descriptor has been recycled. This pairs with + * desc_reserve:D. + * + * Memory barrier involvement: + * + * If prb_first_seq:B reads from desc_reserve:F, then + * prb_first_seq:A reads from desc_push_tail:B. + * + * Relies on: + * + * MB from desc_push_tail:B to desc_reserve:F + * matching + * RMB prb_first_seq:B to prb_first_seq:A + */ + smp_rmb(); /* LMM(prb_first_seq:C) */ + } + + return seq; +} + +/* + * Non-blocking read of a record. Updates @seq to the last finalized record + * (which may have no data available). + * + * See the description of prb_read_valid() and prb_read_valid_info() + * for details. + */ +static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, + struct printk_record *r, unsigned int *line_count) +{ + u64 tail_seq; + int err; + + while ((err = prb_read(rb, *seq, r, line_count))) { + tail_seq = prb_first_seq(rb); + + if (*seq < tail_seq) { + /* + * Behind the tail. Catch up and try again. This + * can happen for -ENOENT and -EINVAL cases. + */ + *seq = tail_seq; + + } else if (err == -ENOENT) { + /* Record exists, but no data available. Skip. */ + (*seq)++; + + } else { + /* Non-existent/non-finalized record. Must stop. */ + return false; + } + } + + return true; +} + +/** + * prb_read_valid() - Non-blocking read of a requested record or (if gone) + * the next available record. + * + * @rb: The ringbuffer to read from. + * @seq: The sequence number of the record to read. + * @r: A record data buffer to store the read record to. + * + * This is the public function available to readers to read a record. + * + * The reader provides the @info and @text_buf buffers of @r to be + * filled in. Any of the buffer pointers can be set to NULL if the reader + * is not interested in that data. To ensure proper initialization of @r, + * prb_rec_init_rd() should be used. + * + * Context: Any context. + * Return: true if a record was read, otherwise false. + * + * On success, the reader must check r->info.seq to see which record was + * actually read. This allows the reader to detect dropped records. + * + * Failure means @seq refers to a not yet written record. + */ +bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, + struct printk_record *r) +{ + return _prb_read_valid(rb, &seq, r, NULL); +} + +/** + * prb_read_valid_info() - Non-blocking read of meta data for a requested + * record or (if gone) the next available record. + * + * @rb: The ringbuffer to read from. + * @seq: The sequence number of the record to read. + * @info: A buffer to store the read record meta data to. + * @line_count: A buffer to store the number of lines in the record text. + * + * This is the public function available to readers to read only the + * meta data of a record. + * + * The reader provides the @info, @line_count buffers to be filled in. + * Either of the buffer pointers can be set to NULL if the reader is not + * interested in that data. + * + * Context: Any context. + * Return: true if a record's meta data was read, otherwise false. + * + * On success, the reader must check info->seq to see which record meta data + * was actually read. This allows the reader to detect dropped records. + * + * Failure means @seq refers to a not yet written record. + */ +bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, + struct printk_info *info, unsigned int *line_count) +{ + struct printk_record r; + + prb_rec_init_rd(&r, info, NULL, 0); + + return _prb_read_valid(rb, &seq, &r, line_count); +} + +/** + * prb_first_valid_seq() - Get the sequence number of the oldest available + * record. + * + * @rb: The ringbuffer to get the sequence number from. + * + * This is the public function available to readers to see what the + * first/oldest valid sequence number is. + * + * This provides readers a starting point to begin iterating the ringbuffer. + * + * Context: Any context. + * Return: The sequence number of the first/oldest record or, if the + * ringbuffer is empty, 0 is returned. + */ +u64 prb_first_valid_seq(struct printk_ringbuffer *rb) +{ + u64 seq = 0; + + if (!_prb_read_valid(rb, &seq, NULL, NULL)) + return 0; + + return seq; +} + +/** + * prb_next_seq() - Get the sequence number after the last available record. + * + * @rb: The ringbuffer to get the sequence number from. + * + * This is the public function available to readers to see what the next + * newest sequence number available to readers will be. + * + * This provides readers a sequence number to jump to if all currently + * available records should be skipped. + * + * Context: Any context. + * Return: The sequence number of the next newest (not yet available) record + * for readers. + */ +u64 prb_next_seq(struct printk_ringbuffer *rb) +{ + u64 seq = 0; + + /* Search forward from the oldest descriptor. */ + while (_prb_read_valid(rb, &seq, NULL, NULL)) + seq++; + + return seq; +} + +/** + * prb_init() - Initialize a ringbuffer to use provided external buffers. + * + * @rb: The ringbuffer to initialize. + * @text_buf: The data buffer for text data. + * @textbits: The size of @text_buf as a power-of-2 value. + * @descs: The descriptor buffer for ringbuffer records. + * @descbits: The count of @descs items as a power-of-2 value. + * @infos: The printk_info buffer for ringbuffer records. + * + * This is the public function available to writers to setup a ringbuffer + * during runtime using provided buffers. + * + * This must match the initialization of DEFINE_PRINTKRB(). + * + * Context: Any context. + */ +void prb_init(struct printk_ringbuffer *rb, + char *text_buf, unsigned int textbits, + struct prb_desc *descs, unsigned int descbits, + struct printk_info *infos) +{ + memset(descs, 0, _DESCS_COUNT(descbits) * sizeof(descs[0])); + memset(infos, 0, _DESCS_COUNT(descbits) * sizeof(infos[0])); + + rb->desc_ring.count_bits = descbits; + rb->desc_ring.descs = descs; + rb->desc_ring.infos = infos; + atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits)); + atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits)); + + rb->text_data_ring.size_bits = textbits; + rb->text_data_ring.data = text_buf; + atomic_long_set(&rb->text_data_ring.head_lpos, BLK0_LPOS(textbits)); + atomic_long_set(&rb->text_data_ring.tail_lpos, BLK0_LPOS(textbits)); + + atomic_long_set(&rb->fail, 0); + + atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits)); + descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = FAILED_LPOS; + descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = FAILED_LPOS; + + infos[0].seq = -(u64)_DESCS_COUNT(descbits); + infos[_DESCS_COUNT(descbits) - 1].seq = 0; +} + +/** + * prb_record_text_space() - Query the full actual used ringbuffer space for + * the text data of a reserved entry. + * + * @e: The successfully reserved entry to query. + * + * This is the public function available to writers to see how much actual + * space is used in the ringbuffer to store the text data of the specified + * entry. + * + * This function is only valid if @e has been successfully reserved using + * prb_reserve(). + * + * Context: Any context. + * Return: The size in bytes used by the text data of the associated record. + */ +unsigned int prb_record_text_space(struct prb_reserved_entry *e) +{ + return e->text_space; +} diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h new file mode 100644 index 000000000000..5dc9d022db07 --- /dev/null +++ b/kernel/printk/printk_ringbuffer.h @@ -0,0 +1,382 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _KERNEL_PRINTK_RINGBUFFER_H +#define _KERNEL_PRINTK_RINGBUFFER_H + +#include <linux/atomic.h> +#include <linux/dev_printk.h> + +/* + * Meta information about each stored message. + * + * All fields are set by the printk code except for @seq, which is + * set by the ringbuffer code. + */ +struct printk_info { + u64 seq; /* sequence number */ + u64 ts_nsec; /* timestamp in nanoseconds */ + u16 text_len; /* length of text message */ + u8 facility; /* syslog facility */ + u8 flags:5; /* internal record flags */ + u8 level:3; /* syslog level */ + u32 caller_id; /* thread id or processor id */ + + struct dev_printk_info dev_info; +}; + +/* + * A structure providing the buffers, used by writers and readers. + * + * Writers: + * Using prb_rec_init_wr(), a writer sets @text_buf_size before calling + * prb_reserve(). On success, prb_reserve() sets @info and @text_buf to + * buffers reserved for that writer. + * + * Readers: + * Using prb_rec_init_rd(), a reader sets all fields before calling + * prb_read_valid(). Note that the reader provides the @info and @text_buf, + * buffers. On success, the struct pointed to by @info will be filled and + * the char array pointed to by @text_buf will be filled with text data. + */ +struct printk_record { + struct printk_info *info; + char *text_buf; + unsigned int text_buf_size; +}; + +/* Specifies the logical position and span of a data block. */ +struct prb_data_blk_lpos { + unsigned long begin; + unsigned long next; +}; + +/* + * A descriptor: the complete meta-data for a record. + * + * @state_var: A bitwise combination of descriptor ID and descriptor state. + */ +struct prb_desc { + atomic_long_t state_var; + struct prb_data_blk_lpos text_blk_lpos; +}; + +/* A ringbuffer of "ID + data" elements. */ +struct prb_data_ring { + unsigned int size_bits; + char *data; + atomic_long_t head_lpos; + atomic_long_t tail_lpos; +}; + +/* A ringbuffer of "struct prb_desc" elements. */ +struct prb_desc_ring { + unsigned int count_bits; + struct prb_desc *descs; + struct printk_info *infos; + atomic_long_t head_id; + atomic_long_t tail_id; +}; + +/* + * The high level structure representing the printk ringbuffer. + * + * @fail: Count of failed prb_reserve() calls where not even a data-less + * record was created. + */ +struct printk_ringbuffer { + struct prb_desc_ring desc_ring; + struct prb_data_ring text_data_ring; + atomic_long_t fail; +}; + +/* + * Used by writers as a reserve/commit handle. + * + * @rb: Ringbuffer where the entry is reserved. + * @irqflags: Saved irq flags to restore on entry commit. + * @id: ID of the reserved descriptor. + * @text_space: Total occupied buffer space in the text data ring, including + * ID, alignment padding, and wrapping data blocks. + * + * This structure is an opaque handle for writers. Its contents are only + * to be used by the ringbuffer implementation. + */ +struct prb_reserved_entry { + struct printk_ringbuffer *rb; + unsigned long irqflags; + unsigned long id; + unsigned int text_space; +}; + +/* The possible responses of a descriptor state-query. */ +enum desc_state { + desc_miss = -1, /* ID mismatch (pseudo state) */ + desc_reserved = 0x0, /* reserved, in use by writer */ + desc_committed = 0x1, /* committed by writer, could get reopened */ + desc_finalized = 0x2, /* committed, no further modification allowed */ + desc_reusable = 0x3, /* free, not yet used by any writer */ +}; + +#define _DATA_SIZE(sz_bits) (1UL << (sz_bits)) +#define _DESCS_COUNT(ct_bits) (1U << (ct_bits)) +#define DESC_SV_BITS (sizeof(unsigned long) * 8) +#define DESC_FLAGS_SHIFT (DESC_SV_BITS - 2) +#define DESC_FLAGS_MASK (3UL << DESC_FLAGS_SHIFT) +#define DESC_STATE(sv) (3UL & (sv >> DESC_FLAGS_SHIFT)) +#define DESC_SV(id, state) (((unsigned long)state << DESC_FLAGS_SHIFT) | id) +#define DESC_ID_MASK (~DESC_FLAGS_MASK) +#define DESC_ID(sv) ((sv) & DESC_ID_MASK) +#define FAILED_LPOS 0x1 +#define NO_LPOS 0x3 + +#define FAILED_BLK_LPOS \ +{ \ + .begin = FAILED_LPOS, \ + .next = FAILED_LPOS, \ +} + +/* + * Descriptor Bootstrap + * + * The descriptor array is minimally initialized to allow immediate usage + * by readers and writers. The requirements that the descriptor array + * initialization must satisfy: + * + * Req1 + * The tail must point to an existing (committed or reusable) descriptor. + * This is required by the implementation of prb_first_seq(). + * + * Req2 + * Readers must see that the ringbuffer is initially empty. + * + * Req3 + * The first record reserved by a writer is assigned sequence number 0. + * + * To satisfy Req1, the tail initially points to a descriptor that is + * minimally initialized (having no data block, i.e. data-less with the + * data block's lpos @begin and @next values set to FAILED_LPOS). + * + * To satisfy Req2, the initial tail descriptor is initialized to the + * reusable state. Readers recognize reusable descriptors as existing + * records, but skip over them. + * + * To satisfy Req3, the last descriptor in the array is used as the initial + * head (and tail) descriptor. This allows the first record reserved by a + * writer (head + 1) to be the first descriptor in the array. (Only the first + * descriptor in the array could have a valid sequence number of 0.) + * + * The first time a descriptor is reserved, it is assigned a sequence number + * with the value of the array index. A "first time reserved" descriptor can + * be recognized because it has a sequence number of 0 but does not have an + * index of 0. (Only the first descriptor in the array could have a valid + * sequence number of 0.) After the first reservation, all future reservations + * (recycling) simply involve incrementing the sequence number by the array + * count. + * + * Hack #1 + * Only the first descriptor in the array is allowed to have the sequence + * number 0. In this case it is not possible to recognize if it is being + * reserved the first time (set to index value) or has been reserved + * previously (increment by the array count). This is handled by _always_ + * incrementing the sequence number by the array count when reserving the + * first descriptor in the array. In order to satisfy Req3, the sequence + * number of the first descriptor in the array is initialized to minus + * the array count. Then, upon the first reservation, it is incremented + * to 0, thus satisfying Req3. + * + * Hack #2 + * prb_first_seq() can be called at any time by readers to retrieve the + * sequence number of the tail descriptor. However, due to Req2 and Req3, + * initially there are no records to report the sequence number of + * (sequence numbers are u64 and there is nothing less than 0). To handle + * this, the sequence number of the initial tail descriptor is initialized + * to 0. Technically this is incorrect, because there is no record with + * sequence number 0 (yet) and the tail descriptor is not the first + * descriptor in the array. But it allows prb_read_valid() to correctly + * report the existence of a record for _any_ given sequence number at all + * times. Bootstrapping is complete when the tail is pushed the first + * time, thus finally pointing to the first descriptor reserved by a + * writer, which has the assigned sequence number 0. + */ + +/* + * Initiating Logical Value Overflows + * + * Both logical position (lpos) and ID values can be mapped to array indexes + * but may experience overflows during the lifetime of the system. To ensure + * that printk_ringbuffer can handle the overflows for these types, initial + * values are chosen that map to the correct initial array indexes, but will + * result in overflows soon. + * + * BLK0_LPOS + * The initial @head_lpos and @tail_lpos for data rings. It is at index + * 0 and the lpos value is such that it will overflow on the first wrap. + * + * DESC0_ID + * The initial @head_id and @tail_id for the desc ring. It is at the last + * index of the descriptor array (see Req3 above) and the ID value is such + * that it will overflow on the second wrap. + */ +#define BLK0_LPOS(sz_bits) (-(_DATA_SIZE(sz_bits))) +#define DESC0_ID(ct_bits) DESC_ID(-(_DESCS_COUNT(ct_bits) + 1)) +#define DESC0_SV(ct_bits) DESC_SV(DESC0_ID(ct_bits), desc_reusable) + +/* + * Define a ringbuffer with an external text data buffer. The same as + * DEFINE_PRINTKRB() but requires specifying an external buffer for the + * text data. + * + * Note: The specified external buffer must be of the size: + * 2 ^ (descbits + avgtextbits) + */ +#define _DEFINE_PRINTKRB(name, descbits, avgtextbits, text_buf) \ +static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \ + /* the initial head and tail */ \ + [_DESCS_COUNT(descbits) - 1] = { \ + /* reusable */ \ + .state_var = ATOMIC_INIT(DESC0_SV(descbits)), \ + /* no associated data block */ \ + .text_blk_lpos = FAILED_BLK_LPOS, \ + }, \ +}; \ +static struct printk_info _##name##_infos[_DESCS_COUNT(descbits)] = { \ + /* this will be the first record reserved by a writer */ \ + [0] = { \ + /* will be incremented to 0 on the first reservation */ \ + .seq = -(u64)_DESCS_COUNT(descbits), \ + }, \ + /* the initial head and tail */ \ + [_DESCS_COUNT(descbits) - 1] = { \ + /* reports the first seq value during the bootstrap phase */ \ + .seq = 0, \ + }, \ +}; \ +static struct printk_ringbuffer name = { \ + .desc_ring = { \ + .count_bits = descbits, \ + .descs = &_##name##_descs[0], \ + .infos = &_##name##_infos[0], \ + .head_id = ATOMIC_INIT(DESC0_ID(descbits)), \ + .tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \ + }, \ + .text_data_ring = { \ + .size_bits = (avgtextbits) + (descbits), \ + .data = text_buf, \ + .head_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ + .tail_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ + }, \ + .fail = ATOMIC_LONG_INIT(0), \ +} + +/** + * DEFINE_PRINTKRB() - Define a ringbuffer. + * + * @name: The name of the ringbuffer variable. + * @descbits: The number of descriptors as a power-of-2 value. + * @avgtextbits: The average text data size per record as a power-of-2 value. + * + * This is a macro for defining a ringbuffer and all internal structures + * such that it is ready for immediate use. See _DEFINE_PRINTKRB() for a + * variant where the text data buffer can be specified externally. + */ +#define DEFINE_PRINTKRB(name, descbits, avgtextbits) \ +static char _##name##_text[1U << ((avgtextbits) + (descbits))] \ + __aligned(__alignof__(unsigned long)); \ +_DEFINE_PRINTKRB(name, descbits, avgtextbits, &_##name##_text[0]) + +/* Writer Interface */ + +/** + * prb_rec_init_wd() - Initialize a buffer for writing records. + * + * @r: The record to initialize. + * @text_buf_size: The needed text buffer size. + */ +static inline void prb_rec_init_wr(struct printk_record *r, + unsigned int text_buf_size) +{ + r->info = NULL; + r->text_buf = NULL; + r->text_buf_size = text_buf_size; +} + +bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, + struct printk_record *r); +bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, + struct printk_record *r, u32 caller_id, unsigned int max_size); +void prb_commit(struct prb_reserved_entry *e); +void prb_final_commit(struct prb_reserved_entry *e); + +void prb_init(struct printk_ringbuffer *rb, + char *text_buf, unsigned int text_buf_size, + struct prb_desc *descs, unsigned int descs_count_bits, + struct printk_info *infos); +unsigned int prb_record_text_space(struct prb_reserved_entry *e); + +/* Reader Interface */ + +/** + * prb_rec_init_rd() - Initialize a buffer for reading records. + * + * @r: The record to initialize. + * @info: A buffer to store record meta-data. + * @text_buf: A buffer to store text data. + * @text_buf_size: The size of @text_buf. + * + * Initialize all the fields that a reader is interested in. All arguments + * (except @r) are optional. Only record data for arguments that are + * non-NULL or non-zero will be read. + */ +static inline void prb_rec_init_rd(struct printk_record *r, + struct printk_info *info, + char *text_buf, unsigned int text_buf_size) +{ + r->info = info; + r->text_buf = text_buf; + r->text_buf_size = text_buf_size; +} + +/** + * prb_for_each_record() - Iterate over the records of a ringbuffer. + * + * @from: The sequence number to begin with. + * @rb: The ringbuffer to iterate over. + * @s: A u64 to store the sequence number on each iteration. + * @r: A printk_record to store the record on each iteration. + * + * This is a macro for conveniently iterating over a ringbuffer. + * Note that @s may not be the sequence number of the record on each + * iteration. For the sequence number, @r->info->seq should be checked. + * + * Context: Any context. + */ +#define prb_for_each_record(from, rb, s, r) \ +for ((s) = from; prb_read_valid(rb, s, r); (s) = (r)->info->seq + 1) + +/** + * prb_for_each_info() - Iterate over the meta data of a ringbuffer. + * + * @from: The sequence number to begin with. + * @rb: The ringbuffer to iterate over. + * @s: A u64 to store the sequence number on each iteration. + * @i: A printk_info to store the record meta data on each iteration. + * @lc: An unsigned int to store the text line count of each record. + * + * This is a macro for conveniently iterating over a ringbuffer. + * Note that @s may not be the sequence number of the record on each + * iteration. For the sequence number, @r->info->seq should be checked. + * + * Context: Any context. + */ +#define prb_for_each_info(from, rb, s, i, lc) \ +for ((s) = from; prb_read_valid_info(rb, s, i, lc); (s) = (i)->seq + 1) + +bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, + struct printk_record *r); +bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, + struct printk_info *info, unsigned int *line_count); + +u64 prb_first_valid_seq(struct printk_ringbuffer *rb); +u64 prb_next_seq(struct printk_ringbuffer *rb); + +#endif /* _KERNEL_PRINTK_RINGBUFFER_H */ diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 50aeae770434..a0e6f746de6c 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -22,7 +22,7 @@ * is later flushed into the main ring buffer via IRQ work. * * The alternative implementation is chosen transparently - * by examinig current printk() context mask stored in @printk_context + * by examining current printk() context mask stored in @printk_context * per-CPU variable. * * The implementation allows to flush the strings also from another CPU. @@ -375,7 +375,7 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args) raw_spin_trylock(&logbuf_lock)) { int len; - len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args); raw_spin_unlock(&logbuf_lock); defer_console_output(); return len; diff --git a/kernel/range.c b/kernel/range.c index d84de6766472..56435f96da73 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -2,8 +2,9 @@ /* * Range add and subtract */ -#include <linux/kernel.h> #include <linux/init.h> +#include <linux/minmax.h> +#include <linux/printk.h> #include <linux/sort.h> #include <linux/string.h> #include <linux/range.h> diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 0ebe15a84985..b71e21f73c40 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -135,10 +135,12 @@ config RCU_FANOUT config RCU_FANOUT_LEAF int "Tree-based hierarchical RCU leaf-level fanout value" - range 2 64 if 64BIT - range 2 32 if !64BIT + range 2 64 if 64BIT && !RCU_STRICT_GRACE_PERIOD + range 2 32 if !64BIT && !RCU_STRICT_GRACE_PERIOD + range 2 3 if RCU_STRICT_GRACE_PERIOD depends on TREE_RCU && RCU_EXPERT - default 16 + default 16 if !RCU_STRICT_GRACE_PERIOD + default 2 if RCU_STRICT_GRACE_PERIOD help This option controls the leaf-level fanout of hierarchical implementations of RCU, and allows trading off cache misses diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 3cf6132a4bb9..1942c1f1bb65 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -23,7 +23,7 @@ config TORTURE_TEST tristate default n -config RCU_PERF_TEST +config RCU_SCALE_TEST tristate "performance tests for RCU" depends on DEBUG_KERNEL select TORTURE_TEST @@ -114,4 +114,19 @@ config RCU_EQS_DEBUG Say N here if you need ultimate kernel/user switch latencies Say Y if you are unsure +config RCU_STRICT_GRACE_PERIOD + bool "Provide debug RCU implementation with short grace periods" + depends on DEBUG_KERNEL && RCU_EXPERT + default n + select PREEMPT_COUNT if PREEMPT=n + help + Select this option to build an RCU variant that is strict about + grace periods, making them as short as it can. This limits + scalability, destroys real-time response, degrades battery + lifetime and kills performance. Don't try this on large + machines, as in systems with more than about 10 or 20 CPUs. + But in conjunction with tools like KASAN, it can be helpful + when looking for certain types of RCU usage bugs, for example, + too-short RCU read-side critical sections. + endmenu # "RCU Debugging" diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 95f5117ef8da..0cfb009a99b9 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -11,7 +11,7 @@ obj-y += update.o sync.o obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o -obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o +obj-$(CONFIG_RCU_SCALE_TEST) += rcuscale.o obj-$(CONFIG_RCU_REF_SCALE_TEST) += refscale.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index cf66a3ccd757..e01cba5e4b52 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -167,7 +167,7 @@ static inline unsigned long rcu_seq_diff(unsigned long new, unsigned long old) # define STATE_RCU_HEAD_READY 0 # define STATE_RCU_HEAD_QUEUED 1 -extern struct debug_obj_descr rcuhead_debug_descr; +extern const struct debug_obj_descr rcuhead_debug_descr; static inline int debug_rcu_head_queue(struct rcu_head *head) { diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 9a0f66133b4b..2d2a6b6b9dfb 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -475,8 +475,16 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) * Also advance to the oldest segment of callbacks whose * ->gp_seq[] completion is at or after that passed in via "seq", * skipping any empty segments. + * + * Note that segment "i" (and any lower-numbered segments + * containing older callbacks) will be unaffected, and their + * grace-period numbers remain unchanged. For example, if i == + * WAIT_TAIL, then neither WAIT_TAIL nor DONE_TAIL will be touched. + * Instead, the CBs in NEXT_TAIL will be merged with those in + * NEXT_READY_TAIL and the grace-period number of NEXT_READY_TAIL + * would be updated. NEXT_TAIL would then be empty. */ - if (++i >= RCU_NEXT_TAIL) + if (rcu_segcblist_restempty(rsclp, i) || ++i >= RCU_NEXT_TAIL) return false; /* diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuscale.c index 21448d3374e2..2819b95479af 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuscale.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * Read-Copy Update module-based performance-test facility + * Read-Copy Update module-based scalability-test facility * * Copyright (C) IBM Corporation, 2015 * @@ -44,13 +44,13 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); -#define PERF_FLAG "-perf:" -#define PERFOUT_STRING(s) \ - pr_alert("%s" PERF_FLAG " %s\n", perf_type, s) -#define VERBOSE_PERFOUT_STRING(s) \ - do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) -#define VERBOSE_PERFOUT_ERRSTRING(s) \ - do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) +#define SCALE_FLAG "-scale:" +#define SCALEOUT_STRING(s) \ + pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s) +#define VERBOSE_SCALEOUT_STRING(s) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s); } while (0) +#define VERBOSE_SCALEOUT_ERRSTRING(s) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! %s\n", scale_type, s); } while (0) /* * The intended use cases for the nreaders and nwriters module parameters @@ -61,25 +61,25 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); * nr_cpus for a mixed reader/writer test. * * 2. Specify the nr_cpus kernel boot parameter, but set - * rcuperf.nreaders to zero. This will set nwriters to the + * rcuscale.nreaders to zero. This will set nwriters to the * value specified by nr_cpus for an update-only test. * * 3. Specify the nr_cpus kernel boot parameter, but set - * rcuperf.nwriters to zero. This will set nreaders to the + * rcuscale.nwriters to zero. This will set nreaders to the * value specified by nr_cpus for a read-only test. * * Various other use cases may of course be specified. * * Note that this test's readers are intended only as a test load for - * the writers. The reader performance statistics will be overly + * the writers. The reader scalability statistics will be overly * pessimistic due to the per-critical-section interrupt disabling, * test-end checks, and the pair of calls through pointers. */ #ifdef MODULE -# define RCUPERF_SHUTDOWN 0 +# define RCUSCALE_SHUTDOWN 0 #else -# define RCUPERF_SHUTDOWN 1 +# define RCUSCALE_SHUTDOWN 1 #endif torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); @@ -88,16 +88,16 @@ torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); -torture_param(bool, shutdown, RCUPERF_SHUTDOWN, - "Shutdown at end of performance tests."); +torture_param(bool, shutdown, RCUSCALE_SHUTDOWN, + "Shutdown at end of scalability tests."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); -torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?"); +torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() scale test?"); torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate."); -static char *perf_type = "rcu"; -module_param(perf_type, charp, 0444); -MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, srcu, ...)"); +static char *scale_type = "rcu"; +module_param(scale_type, charp, 0444); +MODULE_PARM_DESC(scale_type, "Type of RCU to scalability-test (rcu, srcu, ...)"); static int nrealreaders; static int nrealwriters; @@ -107,12 +107,12 @@ static struct task_struct *shutdown_task; static u64 **writer_durations; static int *writer_n_durations; -static atomic_t n_rcu_perf_reader_started; -static atomic_t n_rcu_perf_writer_started; -static atomic_t n_rcu_perf_writer_finished; +static atomic_t n_rcu_scale_reader_started; +static atomic_t n_rcu_scale_writer_started; +static atomic_t n_rcu_scale_writer_finished; static wait_queue_head_t shutdown_wq; -static u64 t_rcu_perf_writer_started; -static u64 t_rcu_perf_writer_finished; +static u64 t_rcu_scale_writer_started; +static u64 t_rcu_scale_writer_finished; static unsigned long b_rcu_gp_test_started; static unsigned long b_rcu_gp_test_finished; static DEFINE_PER_CPU(atomic_t, n_async_inflight); @@ -124,7 +124,7 @@ static DEFINE_PER_CPU(atomic_t, n_async_inflight); * Operations vector for selecting different types of tests. */ -struct rcu_perf_ops { +struct rcu_scale_ops { int ptype; void (*init)(void); void (*cleanup)(void); @@ -140,19 +140,19 @@ struct rcu_perf_ops { const char *name; }; -static struct rcu_perf_ops *cur_ops; +static struct rcu_scale_ops *cur_ops; /* - * Definitions for rcu perf testing. + * Definitions for rcu scalability testing. */ -static int rcu_perf_read_lock(void) __acquires(RCU) +static int rcu_scale_read_lock(void) __acquires(RCU) { rcu_read_lock(); return 0; } -static void rcu_perf_read_unlock(int idx) __releases(RCU) +static void rcu_scale_read_unlock(int idx) __releases(RCU) { rcu_read_unlock(); } @@ -162,15 +162,15 @@ static unsigned long __maybe_unused rcu_no_completed(void) return 0; } -static void rcu_sync_perf_init(void) +static void rcu_sync_scale_init(void) { } -static struct rcu_perf_ops rcu_ops = { +static struct rcu_scale_ops rcu_ops = { .ptype = RCU_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = rcu_perf_read_lock, - .readunlock = rcu_perf_read_unlock, + .init = rcu_sync_scale_init, + .readlock = rcu_scale_read_lock, + .readunlock = rcu_scale_read_unlock, .get_gp_seq = rcu_get_gp_seq, .gp_diff = rcu_seq_diff, .exp_completed = rcu_exp_batches_completed, @@ -182,23 +182,23 @@ static struct rcu_perf_ops rcu_ops = { }; /* - * Definitions for srcu perf testing. + * Definitions for srcu scalability testing. */ -DEFINE_STATIC_SRCU(srcu_ctl_perf); -static struct srcu_struct *srcu_ctlp = &srcu_ctl_perf; +DEFINE_STATIC_SRCU(srcu_ctl_scale); +static struct srcu_struct *srcu_ctlp = &srcu_ctl_scale; -static int srcu_perf_read_lock(void) __acquires(srcu_ctlp) +static int srcu_scale_read_lock(void) __acquires(srcu_ctlp) { return srcu_read_lock(srcu_ctlp); } -static void srcu_perf_read_unlock(int idx) __releases(srcu_ctlp) +static void srcu_scale_read_unlock(int idx) __releases(srcu_ctlp) { srcu_read_unlock(srcu_ctlp, idx); } -static unsigned long srcu_perf_completed(void) +static unsigned long srcu_scale_completed(void) { return srcu_batches_completed(srcu_ctlp); } @@ -213,78 +213,78 @@ static void srcu_rcu_barrier(void) srcu_barrier(srcu_ctlp); } -static void srcu_perf_synchronize(void) +static void srcu_scale_synchronize(void) { synchronize_srcu(srcu_ctlp); } -static void srcu_perf_synchronize_expedited(void) +static void srcu_scale_synchronize_expedited(void) { synchronize_srcu_expedited(srcu_ctlp); } -static struct rcu_perf_ops srcu_ops = { +static struct rcu_scale_ops srcu_ops = { .ptype = SRCU_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = srcu_perf_read_lock, - .readunlock = srcu_perf_read_unlock, - .get_gp_seq = srcu_perf_completed, + .init = rcu_sync_scale_init, + .readlock = srcu_scale_read_lock, + .readunlock = srcu_scale_read_unlock, + .get_gp_seq = srcu_scale_completed, .gp_diff = rcu_seq_diff, - .exp_completed = srcu_perf_completed, + .exp_completed = srcu_scale_completed, .async = srcu_call_rcu, .gp_barrier = srcu_rcu_barrier, - .sync = srcu_perf_synchronize, - .exp_sync = srcu_perf_synchronize_expedited, + .sync = srcu_scale_synchronize, + .exp_sync = srcu_scale_synchronize_expedited, .name = "srcu" }; static struct srcu_struct srcud; -static void srcu_sync_perf_init(void) +static void srcu_sync_scale_init(void) { srcu_ctlp = &srcud; init_srcu_struct(srcu_ctlp); } -static void srcu_sync_perf_cleanup(void) +static void srcu_sync_scale_cleanup(void) { cleanup_srcu_struct(srcu_ctlp); } -static struct rcu_perf_ops srcud_ops = { +static struct rcu_scale_ops srcud_ops = { .ptype = SRCU_FLAVOR, - .init = srcu_sync_perf_init, - .cleanup = srcu_sync_perf_cleanup, - .readlock = srcu_perf_read_lock, - .readunlock = srcu_perf_read_unlock, - .get_gp_seq = srcu_perf_completed, + .init = srcu_sync_scale_init, + .cleanup = srcu_sync_scale_cleanup, + .readlock = srcu_scale_read_lock, + .readunlock = srcu_scale_read_unlock, + .get_gp_seq = srcu_scale_completed, .gp_diff = rcu_seq_diff, - .exp_completed = srcu_perf_completed, + .exp_completed = srcu_scale_completed, .async = srcu_call_rcu, .gp_barrier = srcu_rcu_barrier, - .sync = srcu_perf_synchronize, - .exp_sync = srcu_perf_synchronize_expedited, + .sync = srcu_scale_synchronize, + .exp_sync = srcu_scale_synchronize_expedited, .name = "srcud" }; /* - * Definitions for RCU-tasks perf testing. + * Definitions for RCU-tasks scalability testing. */ -static int tasks_perf_read_lock(void) +static int tasks_scale_read_lock(void) { return 0; } -static void tasks_perf_read_unlock(int idx) +static void tasks_scale_read_unlock(int idx) { } -static struct rcu_perf_ops tasks_ops = { +static struct rcu_scale_ops tasks_ops = { .ptype = RCU_TASKS_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = tasks_perf_read_lock, - .readunlock = tasks_perf_read_unlock, + .init = rcu_sync_scale_init, + .readlock = tasks_scale_read_lock, + .readunlock = tasks_scale_read_unlock, .get_gp_seq = rcu_no_completed, .gp_diff = rcu_seq_diff, .async = call_rcu_tasks, @@ -294,7 +294,7 @@ static struct rcu_perf_ops tasks_ops = { .name = "tasks" }; -static unsigned long rcuperf_seq_diff(unsigned long new, unsigned long old) +static unsigned long rcuscale_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) return new - old; @@ -302,60 +302,60 @@ static unsigned long rcuperf_seq_diff(unsigned long new, unsigned long old) } /* - * If performance tests complete, wait for shutdown to commence. + * If scalability tests complete, wait for shutdown to commence. */ -static void rcu_perf_wait_shutdown(void) +static void rcu_scale_wait_shutdown(void) { cond_resched_tasks_rcu_qs(); - if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters) + if (atomic_read(&n_rcu_scale_writer_finished) < nrealwriters) return; while (!torture_must_stop()) schedule_timeout_uninterruptible(1); } /* - * RCU perf reader kthread. Repeatedly does empty RCU read-side critical - * section, minimizing update-side interference. However, the point of - * this test is not to evaluate reader performance, but instead to serve - * as a test load for update-side performance testing. + * RCU scalability reader kthread. Repeatedly does empty RCU read-side + * critical section, minimizing update-side interference. However, the + * point of this test is not to evaluate reader scalability, but instead + * to serve as a test load for update-side scalability testing. */ static int -rcu_perf_reader(void *arg) +rcu_scale_reader(void *arg) { unsigned long flags; int idx; long me = (long)arg; - VERBOSE_PERFOUT_STRING("rcu_perf_reader task started"); + VERBOSE_SCALEOUT_STRING("rcu_scale_reader task started"); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); set_user_nice(current, MAX_NICE); - atomic_inc(&n_rcu_perf_reader_started); + atomic_inc(&n_rcu_scale_reader_started); do { local_irq_save(flags); idx = cur_ops->readlock(); cur_ops->readunlock(idx); local_irq_restore(flags); - rcu_perf_wait_shutdown(); + rcu_scale_wait_shutdown(); } while (!torture_must_stop()); - torture_kthread_stopping("rcu_perf_reader"); + torture_kthread_stopping("rcu_scale_reader"); return 0; } /* - * Callback function for asynchronous grace periods from rcu_perf_writer(). + * Callback function for asynchronous grace periods from rcu_scale_writer(). */ -static void rcu_perf_async_cb(struct rcu_head *rhp) +static void rcu_scale_async_cb(struct rcu_head *rhp) { atomic_dec(this_cpu_ptr(&n_async_inflight)); kfree(rhp); } /* - * RCU perf writer kthread. Repeatedly does a grace period. + * RCU scale writer kthread. Repeatedly does a grace period. */ static int -rcu_perf_writer(void *arg) +rcu_scale_writer(void *arg) { int i = 0; int i_max; @@ -366,7 +366,7 @@ rcu_perf_writer(void *arg) u64 *wdp; u64 *wdpp = writer_durations[me]; - VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); + VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started"); WARN_ON(!wdpp); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); sched_set_fifo_low(current); @@ -383,8 +383,8 @@ rcu_perf_writer(void *arg) schedule_timeout_uninterruptible(1); t = ktime_get_mono_fast_ns(); - if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { - t_rcu_perf_writer_started = t; + if (atomic_inc_return(&n_rcu_scale_writer_started) >= nrealwriters) { + t_rcu_scale_writer_started = t; if (gp_exp) { b_rcu_gp_test_started = cur_ops->exp_completed() / 2; @@ -404,7 +404,7 @@ retry: rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { atomic_inc(this_cpu_ptr(&n_async_inflight)); - cur_ops->async(rhp, rcu_perf_async_cb); + cur_ops->async(rhp, rcu_scale_async_cb); rhp = NULL; } else if (!kthread_should_stop()) { cur_ops->gp_barrier(); @@ -421,19 +421,19 @@ retry: *wdp = t - *wdp; i_max = i; if (!started && - atomic_read(&n_rcu_perf_writer_started) >= nrealwriters) + atomic_read(&n_rcu_scale_writer_started) >= nrealwriters) started = true; if (!done && i >= MIN_MEAS) { done = true; sched_set_normal(current, 0); - pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n", - perf_type, PERF_FLAG, me, MIN_MEAS); - if (atomic_inc_return(&n_rcu_perf_writer_finished) >= + pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n", + scale_type, SCALE_FLAG, me, MIN_MEAS); + if (atomic_inc_return(&n_rcu_scale_writer_finished) >= nrealwriters) { schedule_timeout_interruptible(10); rcu_ftrace_dump(DUMP_ALL); - PERFOUT_STRING("Test complete"); - t_rcu_perf_writer_finished = t; + SCALEOUT_STRING("Test complete"); + t_rcu_scale_writer_finished = t; if (gp_exp) { b_rcu_gp_test_finished = cur_ops->exp_completed() / 2; @@ -448,30 +448,30 @@ retry: } } if (done && !alldone && - atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters) + atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters) alldone = true; if (started && !alldone && i < MAX_MEAS - 1) i++; - rcu_perf_wait_shutdown(); + rcu_scale_wait_shutdown(); } while (!torture_must_stop()); if (gp_async) { cur_ops->gp_barrier(); } writer_n_durations[me] = i_max; - torture_kthread_stopping("rcu_perf_writer"); + torture_kthread_stopping("rcu_scale_writer"); return 0; } static void -rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag) +rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag) { - pr_alert("%s" PERF_FLAG + pr_alert("%s" SCALE_FLAG "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n", - perf_type, tag, nrealreaders, nrealwriters, verbose, shutdown); + scale_type, tag, nrealreaders, nrealwriters, verbose, shutdown); } static void -rcu_perf_cleanup(void) +rcu_scale_cleanup(void) { int i; int j; @@ -484,11 +484,11 @@ rcu_perf_cleanup(void) * during the mid-boot phase, so have to wait till the end. */ if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) - VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + VERBOSE_SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); if (rcu_gp_is_normal() && gp_exp) - VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + VERBOSE_SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); if (gp_exp && gp_async) - VERBOSE_PERFOUT_ERRSTRING("No expedited async GPs, so went with async!"); + VERBOSE_SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); if (torture_cleanup_begin()) return; @@ -499,30 +499,30 @@ rcu_perf_cleanup(void) if (reader_tasks) { for (i = 0; i < nrealreaders; i++) - torture_stop_kthread(rcu_perf_reader, + torture_stop_kthread(rcu_scale_reader, reader_tasks[i]); kfree(reader_tasks); } if (writer_tasks) { for (i = 0; i < nrealwriters; i++) { - torture_stop_kthread(rcu_perf_writer, + torture_stop_kthread(rcu_scale_writer, writer_tasks[i]); if (!writer_n_durations) continue; j = writer_n_durations[i]; pr_alert("%s%s writer %d gps: %d\n", - perf_type, PERF_FLAG, i, j); + scale_type, SCALE_FLAG, i, j); ngps += j; } pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", - perf_type, PERF_FLAG, - t_rcu_perf_writer_started, t_rcu_perf_writer_finished, - t_rcu_perf_writer_finished - - t_rcu_perf_writer_started, + scale_type, SCALE_FLAG, + t_rcu_scale_writer_started, t_rcu_scale_writer_finished, + t_rcu_scale_writer_finished - + t_rcu_scale_writer_started, ngps, - rcuperf_seq_diff(b_rcu_gp_test_finished, - b_rcu_gp_test_started)); + rcuscale_seq_diff(b_rcu_gp_test_finished, + b_rcu_gp_test_started)); for (i = 0; i < nrealwriters; i++) { if (!writer_durations) break; @@ -534,7 +534,7 @@ rcu_perf_cleanup(void) for (j = 0; j <= writer_n_durations[i]; j++) { wdp = &wdpp[j]; pr_alert("%s%s %4d writer-duration: %5d %llu\n", - perf_type, PERF_FLAG, + scale_type, SCALE_FLAG, i, j, *wdp); if (j % 100 == 0) schedule_timeout_uninterruptible(1); @@ -573,22 +573,22 @@ static int compute_real(int n) } /* - * RCU perf shutdown kthread. Just waits to be awakened, then shuts + * RCU scalability shutdown kthread. Just waits to be awakened, then shuts * down system. */ static int -rcu_perf_shutdown(void *arg) +rcu_scale_shutdown(void *arg) { wait_event(shutdown_wq, - atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters); + atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); smp_mb(); /* Wake before output. */ - rcu_perf_cleanup(); + rcu_scale_cleanup(); kernel_power_off(); return -EINVAL; } /* - * kfree_rcu() performance tests: Start a kfree_rcu() loop on all CPUs for number + * kfree_rcu() scalability tests: Start a kfree_rcu() loop on all CPUs for number * of iterations and measure total time and number of GP for all iterations to complete. */ @@ -598,8 +598,8 @@ torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num alloc static struct task_struct **kfree_reader_tasks; static int kfree_nrealthreads; -static atomic_t n_kfree_perf_thread_started; -static atomic_t n_kfree_perf_thread_ended; +static atomic_t n_kfree_scale_thread_started; +static atomic_t n_kfree_scale_thread_ended; struct kfree_obj { char kfree_obj[8]; @@ -607,7 +607,7 @@ struct kfree_obj { }; static int -kfree_perf_thread(void *arg) +kfree_scale_thread(void *arg) { int i, loop = 0; long me = (long)arg; @@ -615,13 +615,13 @@ kfree_perf_thread(void *arg) u64 start_time, end_time; long long mem_begin, mem_during = 0; - VERBOSE_PERFOUT_STRING("kfree_perf_thread task started"); + VERBOSE_SCALEOUT_STRING("kfree_scale_thread task started"); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); set_user_nice(current, MAX_NICE); start_time = ktime_get_mono_fast_ns(); - if (atomic_inc_return(&n_kfree_perf_thread_started) >= kfree_nrealthreads) { + if (atomic_inc_return(&n_kfree_scale_thread_started) >= kfree_nrealthreads) { if (gp_exp) b_rcu_gp_test_started = cur_ops->exp_completed() / 2; else @@ -646,7 +646,7 @@ kfree_perf_thread(void *arg) cond_resched(); } while (!torture_must_stop() && ++loop < kfree_loops); - if (atomic_inc_return(&n_kfree_perf_thread_ended) >= kfree_nrealthreads) { + if (atomic_inc_return(&n_kfree_scale_thread_ended) >= kfree_nrealthreads) { end_time = ktime_get_mono_fast_ns(); if (gp_exp) @@ -656,7 +656,7 @@ kfree_perf_thread(void *arg) pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n", (unsigned long long)(end_time - start_time), kfree_loops, - rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), + rcuscale_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), (mem_begin - mem_during) >> (20 - PAGE_SHIFT)); if (shutdown) { @@ -665,12 +665,12 @@ kfree_perf_thread(void *arg) } } - torture_kthread_stopping("kfree_perf_thread"); + torture_kthread_stopping("kfree_scale_thread"); return 0; } static void -kfree_perf_cleanup(void) +kfree_scale_cleanup(void) { int i; @@ -679,7 +679,7 @@ kfree_perf_cleanup(void) if (kfree_reader_tasks) { for (i = 0; i < kfree_nrealthreads; i++) - torture_stop_kthread(kfree_perf_thread, + torture_stop_kthread(kfree_scale_thread, kfree_reader_tasks[i]); kfree(kfree_reader_tasks); } @@ -691,20 +691,20 @@ kfree_perf_cleanup(void) * shutdown kthread. Just waits to be awakened, then shuts down system. */ static int -kfree_perf_shutdown(void *arg) +kfree_scale_shutdown(void *arg) { wait_event(shutdown_wq, - atomic_read(&n_kfree_perf_thread_ended) >= kfree_nrealthreads); + atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads); smp_mb(); /* Wake before output. */ - kfree_perf_cleanup(); + kfree_scale_cleanup(); kernel_power_off(); return -EINVAL; } static int __init -kfree_perf_init(void) +kfree_scale_init(void) { long i; int firsterr = 0; @@ -713,7 +713,7 @@ kfree_perf_init(void) /* Start up the kthreads. */ if (shutdown) { init_waitqueue_head(&shutdown_wq); - firsterr = torture_create_kthread(kfree_perf_shutdown, NULL, + firsterr = torture_create_kthread(kfree_scale_shutdown, NULL, shutdown_task); if (firsterr) goto unwind; @@ -730,13 +730,13 @@ kfree_perf_init(void) } for (i = 0; i < kfree_nrealthreads; i++) { - firsterr = torture_create_kthread(kfree_perf_thread, (void *)i, + firsterr = torture_create_kthread(kfree_scale_thread, (void *)i, kfree_reader_tasks[i]); if (firsterr) goto unwind; } - while (atomic_read(&n_kfree_perf_thread_started) < kfree_nrealthreads) + while (atomic_read(&n_kfree_scale_thread_started) < kfree_nrealthreads) schedule_timeout_uninterruptible(1); torture_init_end(); @@ -744,35 +744,35 @@ kfree_perf_init(void) unwind: torture_init_end(); - kfree_perf_cleanup(); + kfree_scale_cleanup(); return firsterr; } static int __init -rcu_perf_init(void) +rcu_scale_init(void) { long i; int firsterr = 0; - static struct rcu_perf_ops *perf_ops[] = { + static struct rcu_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, }; - if (!torture_init_begin(perf_type, verbose)) + if (!torture_init_begin(scale_type, verbose)) return -EBUSY; - /* Process args and tell the world that the perf'er is on the job. */ - for (i = 0; i < ARRAY_SIZE(perf_ops); i++) { - cur_ops = perf_ops[i]; - if (strcmp(perf_type, cur_ops->name) == 0) + /* Process args and announce that the scalability'er is on the job. */ + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) { + cur_ops = scale_ops[i]; + if (strcmp(scale_type, cur_ops->name) == 0) break; } - if (i == ARRAY_SIZE(perf_ops)) { - pr_alert("rcu-perf: invalid perf type: \"%s\"\n", perf_type); - pr_alert("rcu-perf types:"); - for (i = 0; i < ARRAY_SIZE(perf_ops); i++) - pr_cont(" %s", perf_ops[i]->name); + if (i == ARRAY_SIZE(scale_ops)) { + pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type); + pr_alert("rcu-scale types:"); + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) + pr_cont(" %s", scale_ops[i]->name); pr_cont("\n"); - WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST)); + WARN_ON(!IS_MODULE(CONFIG_RCU_SCALE_TEST)); firsterr = -EINVAL; cur_ops = NULL; goto unwind; @@ -781,20 +781,20 @@ rcu_perf_init(void) cur_ops->init(); if (kfree_rcu_test) - return kfree_perf_init(); + return kfree_scale_init(); nrealwriters = compute_real(nwriters); nrealreaders = compute_real(nreaders); - atomic_set(&n_rcu_perf_reader_started, 0); - atomic_set(&n_rcu_perf_writer_started, 0); - atomic_set(&n_rcu_perf_writer_finished, 0); - rcu_perf_print_module_parms(cur_ops, "Start of test"); + atomic_set(&n_rcu_scale_reader_started, 0); + atomic_set(&n_rcu_scale_writer_started, 0); + atomic_set(&n_rcu_scale_writer_finished, 0); + rcu_scale_print_module_parms(cur_ops, "Start of test"); /* Start up the kthreads. */ if (shutdown) { init_waitqueue_head(&shutdown_wq); - firsterr = torture_create_kthread(rcu_perf_shutdown, NULL, + firsterr = torture_create_kthread(rcu_scale_shutdown, NULL, shutdown_task); if (firsterr) goto unwind; @@ -803,17 +803,17 @@ rcu_perf_init(void) reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { - VERBOSE_PERFOUT_ERRSTRING("out of memory"); + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } for (i = 0; i < nrealreaders; i++) { - firsterr = torture_create_kthread(rcu_perf_reader, (void *)i, + firsterr = torture_create_kthread(rcu_scale_reader, (void *)i, reader_tasks[i]); if (firsterr) goto unwind; } - while (atomic_read(&n_rcu_perf_reader_started) < nrealreaders) + while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders) schedule_timeout_uninterruptible(1); writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), GFP_KERNEL); @@ -823,7 +823,7 @@ rcu_perf_init(void) kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL); if (!writer_tasks || !writer_durations || !writer_n_durations) { - VERBOSE_PERFOUT_ERRSTRING("out of memory"); + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } @@ -835,7 +835,7 @@ rcu_perf_init(void) firsterr = -ENOMEM; goto unwind; } - firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, + firsterr = torture_create_kthread(rcu_scale_writer, (void *)i, writer_tasks[i]); if (firsterr) goto unwind; @@ -845,9 +845,9 @@ rcu_perf_init(void) unwind: torture_init_end(); - rcu_perf_cleanup(); + rcu_scale_cleanup(); return firsterr; } -module_init(rcu_perf_init); -module_exit(rcu_perf_cleanup); +module_init(rcu_scale_init); +module_exit(rcu_scale_cleanup); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f453bf8d2f1e..916ea4f66e4b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -52,19 +52,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - /* Bits for ->extendables field, extendables param, and related definitions. */ #define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ #define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1) @@ -100,6 +87,7 @@ torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); +torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing"); torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads"); @@ -185,6 +173,7 @@ static long n_barrier_successes; /* did rcu_barrier test succeed? */ static unsigned long n_read_exits; static struct list_head rcu_torture_removed; static unsigned long shutdown_jiffies; +static unsigned long start_gp_seq; static int rcu_torture_writer_state; #define RTWS_FIXED_DELAY 0 @@ -1413,6 +1402,9 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) preempt_enable(); rcutorture_one_extend(&readstate, 0, trsp, rtrsp); WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK); + // This next splat is expected behavior if leakpointer, especially + // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. + WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1); /* If error or close call, record the sequence of reader protections. */ if ((pipe_count > 1 || completed > 1) && !xchg(&err_segs_recorded, 1)) { @@ -1808,6 +1800,7 @@ struct rcu_fwd { unsigned long rcu_launder_gp_seq_start; }; +static DEFINE_MUTEX(rcu_fwd_mutex); static struct rcu_fwd *rcu_fwds; static bool rcu_fwd_emergency_stop; @@ -2074,8 +2067,14 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) static int rcutorture_oom_notify(struct notifier_block *self, unsigned long notused, void *nfreed) { - struct rcu_fwd *rfp = rcu_fwds; + struct rcu_fwd *rfp; + mutex_lock(&rcu_fwd_mutex); + rfp = rcu_fwds; + if (!rfp) { + mutex_unlock(&rcu_fwd_mutex); + return NOTIFY_OK; + } WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); rcu_torture_fwd_cb_hist(rfp); @@ -2093,6 +2092,7 @@ static int rcutorture_oom_notify(struct notifier_block *self, smp_mb(); /* Frees before return to avoid redoing OOM. */ (*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */ pr_info("%s returning after OOM processing.\n", __func__); + mutex_unlock(&rcu_fwd_mutex); return NOTIFY_OK; } @@ -2114,13 +2114,11 @@ static int rcu_torture_fwd_prog(void *args) do { schedule_timeout_interruptible(fwd_progress_holdoff * HZ); WRITE_ONCE(rcu_fwd_emergency_stop, false); - register_oom_notifier(&rcutorture_oom_nb); if (!IS_ENABLED(CONFIG_TINY_RCU) || rcu_inkernel_boot_has_ended()) rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); if (rcu_inkernel_boot_has_ended()) rcu_torture_fwd_prog_cr(rfp); - unregister_oom_notifier(&rcutorture_oom_nb); /* Avoid slow periods, better to test when busy. */ stutter_wait("rcu_torture_fwd_prog"); @@ -2160,9 +2158,26 @@ static int __init rcu_torture_fwd_prog_init(void) return -ENOMEM; spin_lock_init(&rfp->rcu_fwd_lock); rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; + mutex_lock(&rcu_fwd_mutex); + rcu_fwds = rfp; + mutex_unlock(&rcu_fwd_mutex); + register_oom_notifier(&rcutorture_oom_nb); return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); } +static void rcu_torture_fwd_prog_cleanup(void) +{ + struct rcu_fwd *rfp; + + torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); + rfp = rcu_fwds; + mutex_lock(&rcu_fwd_mutex); + rcu_fwds = NULL; + mutex_unlock(&rcu_fwd_mutex); + unregister_oom_notifier(&rcutorture_oom_nb); + kfree(rfp); +} + /* Callback function for RCU barrier testing. */ static void rcu_torture_barrier_cbf(struct rcu_head *rcu) { @@ -2460,7 +2475,7 @@ rcu_torture_cleanup(void) show_rcu_gp_kthreads(); rcu_torture_read_exit_cleanup(); rcu_torture_barrier_cleanup(); - torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); + rcu_torture_fwd_prog_cleanup(); torture_stop_kthread(rcu_torture_stall, stall_task); torture_stop_kthread(rcu_torture_writer, writer_task); @@ -2482,8 +2497,9 @@ rcu_torture_cleanup(void) rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); - pr_alert("%s: End-test grace-period state: g%lu f%#x\n", - cur_ops->name, gp_seq, flags); + pr_alert("%s: End-test grace-period state: g%ld f%#x total-gps=%ld\n", + cur_ops->name, (long)gp_seq, flags, + rcutorture_seq_diff(gp_seq, start_gp_seq)); torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); if (rcu_torture_can_boost()) @@ -2607,6 +2623,8 @@ rcu_torture_init(void) long i; int cpu; int firsterr = 0; + int flags = 0; + unsigned long gp_seq = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops, &tasks_ops, &tasks_rude_ops, @@ -2649,6 +2667,11 @@ rcu_torture_init(void) nrealreaders = 1; } rcu_torture_print_module_parms(cur_ops, "Start of test"); + rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); + srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); + start_gp_seq = gp_seq; + pr_alert("%s: Start-test grace-period state: g%ld f%#x\n", + cur_ops->name, (long)gp_seq, flags); /* Set up the freelist. */ diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index d9291f883b54..952595c678b3 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -546,9 +546,11 @@ static int main_func(void *arg) // Print the average of all experiments SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); - buf[0] = 0; - strcat(buf, "\n"); - strcat(buf, "Runs\tTime(ns)\n"); + if (!errexit) { + buf[0] = 0; + strcat(buf, "\n"); + strcat(buf, "Runs\tTime(ns)\n"); + } for (exp = 0; exp < nruns; exp++) { u64 avg; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index c100acf332ed..c13348ee80a5 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -29,19 +29,6 @@ #include "rcu.h" #include "rcu_segcblist.h" -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - /* Holdoff in nanoseconds for auto-expediting. */ #define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000) static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF; diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 835e2df8590a..d5d9f2d03e8a 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -28,6 +28,8 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. * @gp_state: Grace period's most recent state transition (debugging). + * @gp_sleep: Per-grace-period sleep to prevent CPU-bound looping. + * @init_fract: Initial backoff sleep interval. * @gp_jiffies: Time of last @gp_state transition. * @gp_start: Most recent grace-period start in jiffies. * @n_gps: Number of grace periods completed since boot. @@ -48,6 +50,8 @@ struct rcu_tasks { struct wait_queue_head cbs_wq; raw_spinlock_t cbs_lock; int gp_state; + int gp_sleep; + int init_fract; unsigned long gp_jiffies; unsigned long gp_start; unsigned long n_gps; @@ -81,7 +85,7 @@ static struct rcu_tasks rt_name = \ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); /* Avoid IPIing CPUs early in the grace period. */ -#define RCU_TASK_IPI_DELAY (HZ / 2) +#define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0) static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY; module_param(rcu_task_ipi_delay, int, 0644); @@ -231,7 +235,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) cond_resched(); } /* Paranoid sleep to keep this from entering a tight loop */ - schedule_timeout_idle(HZ/10); + schedule_timeout_idle(rtp->gp_sleep); set_tasks_gp_state(rtp, RTGS_WAIT_CBS); } @@ -329,8 +333,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) */ lastreport = jiffies; - /* Start off with HZ/10 wait and slowly back off to 1 HZ wait. */ - fract = 10; + // Start off with initial wait and slowly back off to 1 HZ wait. + fract = rtp->init_fract; + if (fract > HZ) + fract = HZ; for (;;) { bool firstreport; @@ -553,6 +559,8 @@ EXPORT_SYMBOL_GPL(rcu_barrier_tasks); static int __init rcu_spawn_tasks_kthread(void) { + rcu_tasks.gp_sleep = HZ / 10; + rcu_tasks.init_fract = 10; rcu_tasks.pregp_func = rcu_tasks_pregp_step; rcu_tasks.pertask_func = rcu_tasks_pertask; rcu_tasks.postscan_func = rcu_tasks_postscan; @@ -590,7 +598,7 @@ void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) } #else /* #ifdef CONFIG_TASKS_RCU */ -static void show_rcu_tasks_classic_gp_kthread(void) { } +static inline void show_rcu_tasks_classic_gp_kthread(void) { } void exit_tasks_rcu_start(void) { } void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } #endif /* #else #ifdef CONFIG_TASKS_RCU */ @@ -685,6 +693,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); static int __init rcu_spawn_tasks_rude_kthread(void) { + rcu_tasks_rude.gp_sleep = HZ / 10; rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); return 0; } @@ -745,9 +754,9 @@ static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); // The number of detections of task quiescent state relying on // heavyweight readers executing explicit memory barriers. -unsigned long n_heavy_reader_attempts; -unsigned long n_heavy_reader_updates; -unsigned long n_heavy_reader_ofl_updates; +static unsigned long n_heavy_reader_attempts; +static unsigned long n_heavy_reader_updates; +static unsigned long n_heavy_reader_ofl_updates; void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, @@ -821,6 +830,12 @@ static void trc_read_check_handler(void *t_in) WRITE_ONCE(t->trc_reader_checked, true); goto reset_ipi; } + // If we are racing with an rcu_read_unlock_trace(), try again later. + if (unlikely(t->trc_reader_nesting < 0)) { + if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) + wake_up(&trc_wait); + goto reset_ipi; + } WRITE_ONCE(t->trc_reader_checked, true); // Get here if the task is in a read-side critical section. Set @@ -911,7 +926,8 @@ static void trc_wait_for_one_reader(struct task_struct *t, // If currently running, send an IPI, either way, add to list. trc_add_holdout(t, bhp); - if (task_curr(t) && time_after(jiffies, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) { + if (task_curr(t) && + time_after(jiffies + 1, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) { // The task is currently running, so try IPIing it. cpu = task_cpu(t); @@ -1072,15 +1088,17 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) if (ret) break; // Count reached zero. // Stall warning time, so make a list of the offenders. + rcu_read_lock(); for_each_process_thread(g, t) if (READ_ONCE(t->trc_reader_special.b.need_qs)) trc_add_holdout(t, &holdouts); + rcu_read_unlock(); firstreport = true; - list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) - if (READ_ONCE(t->trc_reader_special.b.need_qs)) { + list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) { + if (READ_ONCE(t->trc_reader_special.b.need_qs)) show_stalled_task_trace(t, &firstreport); - trc_del_holdout(t); - } + trc_del_holdout(t); // Release task_struct reference. + } if (firstreport) pr_err("INFO: rcu_tasks_trace detected stalls? (Counter/taskslist mismatch?)\n"); show_stalled_ipi_trace(); @@ -1163,6 +1181,17 @@ EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); static int __init rcu_spawn_tasks_trace_kthread(void) { + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) { + rcu_tasks_trace.gp_sleep = HZ / 10; + rcu_tasks_trace.init_fract = 10; + } else { + rcu_tasks_trace.gp_sleep = HZ / 200; + if (rcu_tasks_trace.gp_sleep <= 0) + rcu_tasks_trace.gp_sleep = 1; + rcu_tasks_trace.init_fract = HZ / 5; + if (rcu_tasks_trace.init_fract <= 0) + rcu_tasks_trace.init_fract = 1; + } rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step; rcu_tasks_trace.pertask_func = rcu_tasks_trace_pertask; rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8ce77d9ac716..2a52f42f64b6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -70,19 +70,6 @@ #endif #define MODULE_PARAM_PREFIX "rcutree." -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - /* Data structures. */ /* @@ -178,6 +165,12 @@ module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +// Add delay to rcu_read_unlock() for strict grace periods. +static int rcu_unlock_delay; +#ifdef CONFIG_RCU_STRICT_GRACE_PERIOD +module_param(rcu_unlock_delay, int, 0444); +#endif + /* * This rcu parameter is runtime-read-only. It reflects * a minimum allowed number of objects which can be cached @@ -416,7 +409,7 @@ bool rcu_eqs_special_set(int cpu) * * The caller must have disabled interrupts and must not be idle. */ -void rcu_momentary_dyntick_idle(void) +notrace void rcu_momentary_dyntick_idle(void) { int special; @@ -468,24 +461,25 @@ static int rcu_is_cpu_rrupt_from_idle(void) return __this_cpu_read(rcu_data.dynticks_nesting) == 0; } -#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */ -#define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */ +#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10) + // Maximum callbacks per rcu_do_batch ... +#define DEFAULT_MAX_RCU_BLIMIT 10000 // ... even during callback flood. static long blimit = DEFAULT_RCU_BLIMIT; -#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ +#define DEFAULT_RCU_QHIMARK 10000 // If this many pending, ignore blimit. static long qhimark = DEFAULT_RCU_QHIMARK; -#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */ +#define DEFAULT_RCU_QLOMARK 100 // Once only this many pending, use blimit. static long qlowmark = DEFAULT_RCU_QLOMARK; #define DEFAULT_RCU_QOVLD_MULT 2 #define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK) -static long qovld = DEFAULT_RCU_QOVLD; /* If this many pending, hammer QS. */ -static long qovld_calc = -1; /* No pre-initialization lock acquisitions! */ +static long qovld = DEFAULT_RCU_QOVLD; // If this many pending, hammer QS. +static long qovld_calc = -1; // No pre-initialization lock acquisitions! module_param(blimit, long, 0444); module_param(qhimark, long, 0444); module_param(qlowmark, long, 0444); module_param(qovld, long, 0444); -static ulong jiffies_till_first_fqs = ULONG_MAX; +static ulong jiffies_till_first_fqs = IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 0 : ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; static bool rcu_kick_kthreads; static int rcu_divisor = 7; @@ -673,6 +667,7 @@ void rcu_idle_enter(void) lockdep_assert_irqs_disabled(); rcu_eqs_enter(false); } +EXPORT_SYMBOL_GPL(rcu_idle_enter); #ifdef CONFIG_NO_HZ_FULL /** @@ -886,6 +881,7 @@ void rcu_idle_exit(void) rcu_eqs_exit(false); local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(rcu_idle_exit); #ifdef CONFIG_NO_HZ_FULL /** @@ -1090,11 +1086,6 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) } } -noinstr bool __rcu_is_watching(void) -{ - return !rcu_dynticks_curr_cpu_in_eqs(); -} - /** * rcu_is_watching - see if RCU thinks that the current CPU is not idle * @@ -1227,13 +1218,28 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 1; } - /* If waiting too long on an offline CPU, complain. */ - if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp)) && - time_after(jiffies, rcu_state.gp_start + HZ)) { + /* + * Complain if a CPU that is considered to be offline from RCU's + * perspective has not yet reported a quiescent state. After all, + * the offline CPU should have reported a quiescent state during + * the CPU-offline process, or, failing that, by rcu_gp_init() + * if it ran concurrently with either the CPU going offline or the + * last task on a leaf rcu_node structure exiting its RCU read-side + * critical section while all CPUs corresponding to that structure + * are offline. This added warning detects bugs in any of these + * code paths. + * + * The rcu_node structure's ->lock is held here, which excludes + * the relevant portions the CPU-hotplug code, the grace-period + * initialization code, and the rcu_read_unlock() code paths. + * + * For more detail, please refer to the "Hotplug CPU" section + * of RCU's Requirements documentation. + */ + if (WARN_ON_ONCE(!(rdp->grpmask & rcu_rnp_online_cpus(rnp)))) { bool onl; struct rcu_node *rnp1; - WARN_ON(1); /* Offline CPUs are supposed to report QS! */ pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", __func__, rnp->grplo, rnp->grphi, rnp->level, (long)rnp->gp_seq, (long)rnp->completedqs); @@ -1496,9 +1502,10 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) - trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccWaitCB")); + trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccWaitCB")); else - trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccReadyCB")); + trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB")); + return ret; } @@ -1574,6 +1581,19 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, } /* + * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, attempt to generate a + * quiescent state. This is intended to be invoked when the CPU notices + * a new grace period. + */ +static void rcu_strict_gp_check_qs(void) +{ + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { + rcu_read_lock(); + rcu_read_unlock(); + } +} + +/* * Update CPU-local rcu_data state to record the beginnings and ends of * grace periods. The caller must hold the ->lock of the leaf rcu_node * structure corresponding to the current CPU, and must have irqs disabled. @@ -1643,6 +1663,7 @@ static void note_gp_changes(struct rcu_data *rdp) } needwake = __note_gp_changes(rnp, rdp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + rcu_strict_gp_check_qs(); if (needwake) rcu_gp_kthread_wake(); } @@ -1681,6 +1702,15 @@ static void rcu_gp_torture_wait(void) } /* + * Handler for on_each_cpu() to invoke the target CPU's RCU core + * processing. + */ +static void rcu_strict_gp_boundary(void *unused) +{ + invoke_rcu_core(); +} + +/* * Initialize a new grace period. Return false if no grace period required. */ static bool rcu_gp_init(void) @@ -1718,10 +1748,13 @@ static bool rcu_gp_init(void) raw_spin_unlock_irq_rcu_node(rnp); /* - * Apply per-leaf buffered online and offline operations to the - * rcu_node tree. Note that this new grace period need not wait - * for subsequent online CPUs, and that quiescent-state forcing - * will handle subsequent offline CPUs. + * Apply per-leaf buffered online and offline operations to + * the rcu_node tree. Note that this new grace period need not + * wait for subsequent online CPUs, and that RCU hooks in the CPU + * offlining path, when combined with checks in this function, + * will handle CPUs that are currently going offline or that will + * go offline later. Please also refer to "Hotplug CPU" section + * of RCU's Requirements documentation. */ rcu_state.gp_state = RCU_GP_ONOFF; rcu_for_each_leaf_node(rnp) { @@ -1808,6 +1841,10 @@ static bool rcu_gp_init(void) WRITE_ONCE(rcu_state.gp_activity, jiffies); } + // If strict, make all CPUs aware of new grace period. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + on_each_cpu(rcu_strict_gp_boundary, NULL, 0); + return true; } @@ -1896,7 +1933,7 @@ static void rcu_gp_fqs_loop(void) break; /* If time for quiescent-state forcing, do it. */ if (!time_after(rcu_state.jiffies_force_qs, jiffies) || - (gf & RCU_GP_FLAG_FQS)) { + (gf & (RCU_GP_FLAG_FQS | RCU_GP_FLAG_OVLD))) { trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsstart")); rcu_gp_fqs(first_gp_fqs); @@ -2024,6 +2061,10 @@ static void rcu_gp_cleanup(void) rcu_state.gp_flags & RCU_GP_FLAG_INIT); } raw_spin_unlock_irq_rcu_node(rnp); + + // If strict, make all CPUs aware of the end of the old grace period. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + on_each_cpu(rcu_strict_gp_boundary, NULL, 0); } /* @@ -2202,7 +2243,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) * structure. This must be called from the specified CPU. */ static void -rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) +rcu_report_qs_rdp(struct rcu_data *rdp) { unsigned long flags; unsigned long mask; @@ -2211,6 +2252,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) rcu_segcblist_is_offloaded(&rdp->cblist); struct rcu_node *rnp; + WARN_ON_ONCE(rdp->cpu != smp_processor_id()); rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq || @@ -2227,8 +2269,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) return; } mask = rdp->grpmask; - if (rdp->cpu == smp_processor_id()) - rdp->core_needs_qs = false; + rdp->core_needs_qs = false; if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { @@ -2277,7 +2318,7 @@ rcu_check_quiescent_state(struct rcu_data *rdp) * Tell RCU we are done (but rcu_report_qs_rdp() will be the * judge of that). */ - rcu_report_qs_rdp(rdp->cpu, rdp); + rcu_report_qs_rdp(rdp); } /* @@ -2374,6 +2415,7 @@ int rcutree_dead_cpu(unsigned int cpu) */ static void rcu_do_batch(struct rcu_data *rdp) { + int div; unsigned long flags; const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && rcu_segcblist_is_offloaded(&rdp->cblist); @@ -2402,9 +2444,15 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_nocb_lock(rdp); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); pending = rcu_segcblist_n_cbs(&rdp->cblist); - bl = max(rdp->blimit, pending >> rcu_divisor); - if (unlikely(bl > 100)) - tlimit = local_clock() + rcu_resched_ns; + div = READ_ONCE(rcu_divisor); + div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; + bl = max(rdp->blimit, pending >> div); + if (unlikely(bl > 100)) { + long rrn = READ_ONCE(rcu_resched_ns); + + rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn; + tlimit = local_clock() + rrn; + } trace_rcu_batch_start(rcu_state.name, rcu_segcblist_n_cbs(&rdp->cblist), bl); rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); @@ -2545,8 +2593,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) raw_spin_lock_irqsave_rcu_node(rnp, flags); rcu_state.cbovldnext |= !!rnp->cbovldmask; if (rnp->qsmask == 0) { - if (!IS_ENABLED(CONFIG_PREEMPT_RCU) || - rcu_preempt_blocked_readers_cgp(rnp)) { + if (rcu_preempt_blocked_readers_cgp(rnp)) { /* * No point in scanning bits because they * are all zero. But we might need to @@ -2614,6 +2661,14 @@ void rcu_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); +// Workqueue handler for an RCU reader for kernels enforcing struct RCU +// grace periods. +static void strict_work_handler(struct work_struct *work) +{ + rcu_read_lock(); + rcu_read_unlock(); +} + /* Perform RCU core processing work for the current CPU. */ static __latent_entropy void rcu_core(void) { @@ -2658,6 +2713,10 @@ static __latent_entropy void rcu_core(void) /* Do any needed deferred wakeups of rcuo kthreads. */ do_nocb_deferred_wakeup(rdp); trace_rcu_utilization(TPS("End RCU core")); + + // If strict GPs, schedule an RCU reader in a clean environment. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work); } static void rcu_core_si(struct softirq_action *h) @@ -3020,6 +3079,12 @@ struct kfree_rcu_cpu_work { * @monitor_todo: Tracks whether a @monitor_work delayed work is pending * @initialized: The @rcu_work fields have been initialized * @count: Number of objects for which GP not started + * @bkvcache: + * A simple cache list that contains objects for reuse purpose. + * In order to save some per-cpu space the list is singular. + * Even though it is lockless an access has to be protected by the + * per-cpu lock. + * @nr_bkv_objs: number of allocated objects at @bkvcache. * * This is a per-CPU structure. The reason that it is not included in * the rcu_data structure is to permit this code to be extracted from @@ -3035,14 +3100,6 @@ struct kfree_rcu_cpu { bool monitor_todo; bool initialized; int count; - - /* - * A simple cache list that contains objects for - * reuse purpose. In order to save some per-cpu - * space the list is singular. Even though it is - * lockless an access has to be protected by the - * per-cpu lock. - */ struct llist_head bkvcache; int nr_bkv_objs; }; @@ -3443,7 +3500,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) unsigned long count = 0; /* Snapshot count of all CPUs */ - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); count += READ_ONCE(krcp->count); @@ -3458,7 +3515,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) int cpu, freed = 0; unsigned long flags; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { int count; struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); @@ -3491,7 +3548,7 @@ void __init kfree_rcu_scheduler_running(void) int cpu; unsigned long flags; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); raw_spin_lock_irqsave(&krcp->lock, flags); @@ -3855,6 +3912,7 @@ rcu_boot_init_percpu_data(int cpu) /* Set up local state, ensuring consistent view of global state. */ rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); + INIT_WORK(&rdp->strict_work, strict_work_handler); WARN_ON_ONCE(rdp->dynticks_nesting != 1); WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp))); rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; @@ -3973,8 +4031,6 @@ int rcutree_offline_cpu(unsigned int cpu) return 0; } -static DEFINE_PER_CPU(int, rcu_cpu_started); - /* * Mark the specified CPU as being online so that subsequent grace periods * (both expedited and normal) will wait on it. Note that this means that @@ -3994,12 +4050,11 @@ void rcu_cpu_starting(unsigned int cpu) struct rcu_node *rnp; bool newcpu; - if (per_cpu(rcu_cpu_started, cpu)) + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rdp->cpu_started) return; + rdp->cpu_started = true; - per_cpu(rcu_cpu_started, cpu) = 1; - - rdp = per_cpu_ptr(&rcu_data, cpu); rnp = rdp->mynode; mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -4059,7 +4114,7 @@ void rcu_report_dead(unsigned int cpu) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); raw_spin_unlock(&rcu_state.ofl_lock); - per_cpu(rcu_cpu_started, cpu) = 0; + rdp->cpu_started = false; } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index c96ae351688b..e4f66b8f7c47 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -156,6 +156,7 @@ struct rcu_data { bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */ + bool cpu_started; /* RCU watching this onlining CPU. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ unsigned long ticks_this_gp; /* The number of scheduling-clock */ @@ -164,6 +165,7 @@ struct rcu_data { /* period it is aware of. */ struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */ bool defer_qs_iw_pending; /* Scheduler attention pending? */ + struct work_struct strict_work; /* Schedule readers for strict GPs. */ /* 2) batch handling */ struct rcu_segcblist cblist; /* Segmented callback list, with */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1888c0eb1216..8760b6ead770 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -732,11 +732,9 @@ static void rcu_exp_need_qs(void) /* Invoked on each online non-idle CPU for expedited quiescent state. */ static void rcu_exp_handler(void *unused) { - struct rcu_data *rdp; - struct rcu_node *rnp; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rdp->mynode; - rdp = this_cpu_ptr(&rcu_data); - rnp = rdp->mynode; if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) return; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 982fc5be5269..fd8a52e9a887 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -36,6 +36,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); if (IS_ENABLED(CONFIG_PROVE_RCU)) pr_info("\tRCU lockdep checking is enabled.\n"); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + pr_info("\tRCU strict (and thus non-scalable) grace periods enabled.\n"); if (RCU_NUM_LVLS >= 4) pr_info("\tFour(or more)-level hierarchy is enabled.\n"); if (RCU_FANOUT_LEAF != 16) @@ -374,6 +376,8 @@ void __rcu_read_lock(void) rcu_preempt_read_enter(); if (IS_ENABLED(CONFIG_PROVE_LOCKING)) WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && rcu_state.gp_kthread) + WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, true); barrier(); /* critical section after entry code. */ } EXPORT_SYMBOL_GPL(__rcu_read_lock); @@ -455,8 +459,14 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) return; } t->rcu_read_unlock_special.s = 0; - if (special.b.need_qs) - rcu_qs(); + if (special.b.need_qs) { + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { + rcu_report_qs_rdp(rdp); + udelay(rcu_unlock_delay); + } else { + rcu_qs(); + } + } /* * Respond to a request by an expedited grace period for a @@ -769,6 +779,24 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) #else /* #ifdef CONFIG_PREEMPT_RCU */ /* + * If strict grace periods are enabled, and if the calling + * __rcu_read_unlock() marks the beginning of a quiescent state, immediately + * report that quiescent state and, if requested, spin for a bit. + */ +void rcu_read_unlock_strict(void) +{ + struct rcu_data *rdp; + + if (!IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) || + irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) + return; + rdp = this_cpu_ptr(&rcu_data); + rcu_report_qs_rdp(rdp); + udelay(rcu_unlock_delay); +} +EXPORT_SYMBOL_GPL(rcu_read_unlock_strict); + +/* * Tell them what RCU they are running. */ static void __init rcu_bootup_announce(void) @@ -1926,6 +1954,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) * nearest grace period (if any) to wait for next. The CB kthreads * and the global grace-period kthread are awakened if needed. */ + WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp); for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); rcu_nocb_lock_irqsave(rdp, flags); @@ -2411,13 +2440,12 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) return; waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock); - wastimer = timer_pending(&rdp->nocb_timer); + wastimer = timer_pending(&rdp->nocb_bypass_timer); wassleep = swait_active(&rdp->nocb_gp_wq); - if (!rdp->nocb_defer_wakeup && !rdp->nocb_gp_sleep && - !waslocked && !wastimer && !wassleep) + if (!rdp->nocb_gp_sleep && !waslocked && !wastimer && !wassleep) return; /* Nothing untowards. */ - pr_info(" !!! %c%c%c%c %c\n", + pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c%c %c\n", "lL"[waslocked], "dD"[!!rdp->nocb_defer_wakeup], "tT"[wastimer], diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index b5d3b4794db4..0fde39b8daab 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -158,7 +158,7 @@ static void rcu_stall_kick_kthreads(void) { unsigned long j; - if (!rcu_kick_kthreads) + if (!READ_ONCE(rcu_kick_kthreads)) return; j = READ_ONCE(rcu_state.jiffies_kick_kthreads); if (time_after(jiffies, j) && rcu_state.gp_kthread && @@ -580,7 +580,7 @@ static void check_cpu_stall(struct rcu_data *rdp) unsigned long js; struct rcu_node *rnp; - if ((rcu_stall_is_suppressed() && !rcu_kick_kthreads) || + if ((rcu_stall_is_suppressed() && !READ_ONCE(rcu_kick_kthreads)) || !rcu_gp_in_progress()) return; rcu_stall_kick_kthreads(); @@ -623,7 +623,7 @@ static void check_cpu_stall(struct rcu_data *rdp) /* We haven't checked in, so go dump stack. */ print_cpu_stall(gps); - if (rcu_cpu_stall_ftrace_dump) + if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); } else if (rcu_gp_in_progress() && @@ -632,7 +632,7 @@ static void check_cpu_stall(struct rcu_data *rdp) /* They had a few time units to dump stack, so complain. */ print_other_cpu_stall(gs2, gps); - if (rcu_cpu_stall_ftrace_dump) + if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); } } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 2de49b5d8dd2..39334d2d2b37 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -53,19 +53,6 @@ #endif #define MODULE_PARAM_PREFIX "rcupdate." -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - #ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); @@ -469,7 +456,7 @@ void destroy_rcu_head_on_stack(struct rcu_head *head) } EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); -struct debug_obj_descr rcuhead_debug_descr = { +const struct debug_obj_descr rcuhead_debug_descr = { .name = "rcu_head", .is_static_object = rcuhead_is_static_object, }; diff --git a/kernel/relay.c b/kernel/relay.c index 72fe443ea78f..b08d936d5fa7 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -197,6 +197,7 @@ free_buf: static void relay_destroy_channel(struct kref *kref) { struct rchan *chan = container_of(kref, struct rchan, kref); + free_percpu(chan->buf); kfree(chan); } @@ -1001,7 +1002,7 @@ static int relay_file_read_avail(struct rchan_buf *buf) size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t produced = buf->subbufs_produced; - size_t consumed = buf->subbufs_consumed; + size_t consumed; relay_file_read_consume(buf, 0, 0); diff --git a/kernel/resource.c b/kernel/resource.c index 841737bbda9e..3ae2f56cc79d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -382,10 +382,13 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, if (p) { /* copy data */ - res->start = max(start, p->start); - res->end = min(end, p->end); - res->flags = p->flags; - res->desc = p->desc; + *res = (struct resource) { + .start = max(start, p->start), + .end = min(end, p->end), + .flags = p->flags, + .desc = p->desc, + .parent = p->parent, + }; } read_unlock(&resource_lock); @@ -1237,7 +1240,6 @@ EXPORT_SYMBOL(__release_region); #ifdef CONFIG_MEMORY_HOTREMOVE /** * release_mem_region_adjustable - release a previously reserved memory region - * @parent: parent resource descriptor * @start: resource start address * @size: resource region size * @@ -1255,21 +1257,28 @@ EXPORT_SYMBOL(__release_region); * assumes that all children remain in the lower address entry for * simplicity. Enhance this logic when necessary. */ -int release_mem_region_adjustable(struct resource *parent, - resource_size_t start, resource_size_t size) +void release_mem_region_adjustable(resource_size_t start, resource_size_t size) { + struct resource *parent = &iomem_resource; + struct resource *new_res = NULL; + bool alloc_nofail = false; struct resource **p; struct resource *res; - struct resource *new_res; resource_size_t end; - int ret = -EINVAL; end = start + size - 1; - if ((start < parent->start) || (end > parent->end)) - return ret; + if (WARN_ON_ONCE((start < parent->start) || (end > parent->end))) + return; - /* The alloc_resource() result gets checked later */ - new_res = alloc_resource(GFP_KERNEL); + /* + * We free up quite a lot of memory on memory hotunplug (esp., memap), + * just before releasing the region. This is highly unlikely to + * fail - let's play save and make it never fail as the caller cannot + * perform any error handling (e.g., trying to re-add memory will fail + * similarly). + */ +retry: + new_res = alloc_resource(GFP_KERNEL | (alloc_nofail ? __GFP_NOFAIL : 0)); p = &parent->child; write_lock(&resource_lock); @@ -1295,7 +1304,6 @@ int release_mem_region_adjustable(struct resource *parent, * so if we are dealing with them, let us just back off here. */ if (!(res->flags & IORESOURCE_SYSRAM)) { - ret = 0; break; } @@ -1312,20 +1320,23 @@ int release_mem_region_adjustable(struct resource *parent, /* free the whole entry */ *p = res->sibling; free_resource(res); - ret = 0; } else if (res->start == start && res->end != end) { /* adjust the start */ - ret = __adjust_resource(res, end + 1, - res->end - end); + WARN_ON_ONCE(__adjust_resource(res, end + 1, + res->end - end)); } else if (res->start != start && res->end == end) { /* adjust the end */ - ret = __adjust_resource(res, res->start, - start - res->start); + WARN_ON_ONCE(__adjust_resource(res, res->start, + start - res->start)); } else { - /* split into two entries */ + /* split into two entries - we need a new resource */ if (!new_res) { - ret = -ENOMEM; - break; + new_res = alloc_resource(GFP_ATOMIC); + if (!new_res) { + alloc_nofail = true; + write_unlock(&resource_lock); + goto retry; + } } new_res->name = res->name; new_res->start = end + 1; @@ -1336,9 +1347,8 @@ int release_mem_region_adjustable(struct resource *parent, new_res->sibling = res->sibling; new_res->child = NULL; - ret = __adjust_resource(res, res->start, - start - res->start); - if (ret) + if (WARN_ON_ONCE(__adjust_resource(res, res->start, + start - res->start))) break; res->sibling = new_res; new_res = NULL; @@ -1349,10 +1359,69 @@ int release_mem_region_adjustable(struct resource *parent, write_unlock(&resource_lock); free_resource(new_res); - return ret; } #endif /* CONFIG_MEMORY_HOTREMOVE */ +#ifdef CONFIG_MEMORY_HOTPLUG +static bool system_ram_resources_mergeable(struct resource *r1, + struct resource *r2) +{ + /* We assume either r1 or r2 is IORESOURCE_SYSRAM_MERGEABLE. */ + return r1->flags == r2->flags && r1->end + 1 == r2->start && + r1->name == r2->name && r1->desc == r2->desc && + !r1->child && !r2->child; +} + +/* + * merge_system_ram_resource - mark the System RAM resource mergeable and try to + * merge it with adjacent, mergeable resources + * @res: resource descriptor + * + * This interface is intended for memory hotplug, whereby lots of contiguous + * system ram resources are added (e.g., via add_memory*()) by a driver, and + * the actual resource boundaries are not of interest (e.g., it might be + * relevant for DIMMs). Only resources that are marked mergeable, that have the + * same parent, and that don't have any children are considered. All mergeable + * resources must be immutable during the request. + * + * Note: + * - The caller has to make sure that no pointers to resources that are + * marked mergeable are used anymore after this call - the resource might + * be freed and the pointer might be stale! + * - release_mem_region_adjustable() will split on demand on memory hotunplug + */ +void merge_system_ram_resource(struct resource *res) +{ + const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + struct resource *cur; + + if (WARN_ON_ONCE((res->flags & flags) != flags)) + return; + + write_lock(&resource_lock); + res->flags |= IORESOURCE_SYSRAM_MERGEABLE; + + /* Try to merge with next item in the list. */ + cur = res->sibling; + if (cur && system_ram_resources_mergeable(res, cur)) { + res->end = cur->end; + res->sibling = cur->sibling; + free_resource(cur); + } + + /* Try to merge with previous item in the list. */ + cur = res->parent->child; + while (cur && cur->sibling != res) + cur = cur->sibling; + if (cur && system_ram_resources_mergeable(cur, res)) { + cur->end = res->end; + cur->sibling = res->sibling; + free_resource(res); + } + write_unlock(&resource_lock); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + /* * Managed region resource */ diff --git a/kernel/scftorture.c b/kernel/scftorture.c new file mode 100644 index 000000000000..554a521ee235 --- /dev/null +++ b/kernel/scftorture.c @@ -0,0 +1,575 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Torture test for smp_call_function() and friends. +// +// Copyright (C) Facebook, 2020. +// +// Author: Paul E. McKenney <paulmck@kernel.org> + +#define pr_fmt(fmt) fmt + +#include <linux/atomic.h> +#include <linux/bitops.h> +#include <linux/completion.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/kthread.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/notifier.h> +#include <linux/percpu.h> +#include <linux/rcupdate.h> +#include <linux/rcupdate_trace.h> +#include <linux/reboot.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/stat.h> +#include <linux/srcu.h> +#include <linux/slab.h> +#include <linux/torture.h> +#include <linux/types.h> + +#define SCFTORT_STRING "scftorture" +#define SCFTORT_FLAG SCFTORT_STRING ": " + +#define SCFTORTOUT(s, x...) \ + pr_alert(SCFTORT_FLAG s, ## x) + +#define VERBOSE_SCFTORTOUT(s, x...) \ + do { if (verbose) pr_alert(SCFTORT_FLAG s, ## x); } while (0) + +#define VERBOSE_SCFTORTOUT_ERRSTRING(s, x...) \ + do { if (verbose) pr_alert(SCFTORT_FLAG "!!! " s, ## x); } while (0) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>"); + +// Wait until there are multiple CPUs before starting test. +torture_param(int, holdoff, IS_BUILTIN(CONFIG_SCF_TORTURE_TEST) ? 10 : 0, + "Holdoff time before test start (s)"); +torture_param(int, longwait, 0, "Include ridiculously long waits? (seconds)"); +torture_param(int, nthreads, -1, "# threads, defaults to -1 for all CPUs."); +torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); +torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, shutdown_secs, 0, "Shutdown time (ms), <= zero to disable."); +torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s."); +torture_param(int, stutter_cpus, 5, "Number of jiffies to change CPUs under test, 0=disable"); +torture_param(bool, use_cpus_read_lock, 0, "Use cpus_read_lock() to exclude CPU hotplug."); +torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); +torture_param(int, weight_single, -1, "Testing weight for single-CPU no-wait operations."); +torture_param(int, weight_single_wait, -1, "Testing weight for single-CPU operations."); +torture_param(int, weight_many, -1, "Testing weight for multi-CPU no-wait operations."); +torture_param(int, weight_many_wait, -1, "Testing weight for multi-CPU operations."); +torture_param(int, weight_all, -1, "Testing weight for all-CPU no-wait operations."); +torture_param(int, weight_all_wait, -1, "Testing weight for all-CPU operations."); + +char *torture_type = ""; + +#ifdef MODULE +# define SCFTORT_SHUTDOWN 0 +#else +# define SCFTORT_SHUTDOWN 1 +#endif + +torture_param(bool, shutdown, SCFTORT_SHUTDOWN, "Shutdown at end of torture test."); + +struct scf_statistics { + struct task_struct *task; + int cpu; + long long n_single; + long long n_single_ofl; + long long n_single_wait; + long long n_single_wait_ofl; + long long n_many; + long long n_many_wait; + long long n_all; + long long n_all_wait; +}; + +static struct scf_statistics *scf_stats_p; +static struct task_struct *scf_torture_stats_task; +static DEFINE_PER_CPU(long long, scf_invoked_count); + +// Data for random primitive selection +#define SCF_PRIM_SINGLE 0 +#define SCF_PRIM_MANY 1 +#define SCF_PRIM_ALL 2 +#define SCF_NPRIMS (2 * 3) // Need wait and no-wait versions of each. + +static char *scf_prim_name[] = { + "smp_call_function_single", + "smp_call_function_many", + "smp_call_function", +}; + +struct scf_selector { + unsigned long scfs_weight; + int scfs_prim; + bool scfs_wait; +}; +static struct scf_selector scf_sel_array[SCF_NPRIMS]; +static int scf_sel_array_len; +static unsigned long scf_sel_totweight; + +// Communicate between caller and handler. +struct scf_check { + bool scfc_in; + bool scfc_out; + int scfc_cpu; // -1 for not _single(). + bool scfc_wait; +}; + +// Use to wait for all threads to start. +static atomic_t n_started; +static atomic_t n_errs; +static atomic_t n_mb_in_errs; +static atomic_t n_mb_out_errs; +static atomic_t n_alloc_errs; +static bool scfdone; +static char *bangstr = ""; + +static DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); + +// Print torture statistics. Caller must ensure serialization. +static void scf_torture_stats_print(void) +{ + int cpu; + int i; + long long invoked_count = 0; + bool isdone = READ_ONCE(scfdone); + struct scf_statistics scfs = {}; + + for_each_possible_cpu(cpu) + invoked_count += data_race(per_cpu(scf_invoked_count, cpu)); + for (i = 0; i < nthreads; i++) { + scfs.n_single += scf_stats_p[i].n_single; + scfs.n_single_ofl += scf_stats_p[i].n_single_ofl; + scfs.n_single_wait += scf_stats_p[i].n_single_wait; + scfs.n_single_wait_ofl += scf_stats_p[i].n_single_wait_ofl; + scfs.n_many += scf_stats_p[i].n_many; + scfs.n_many_wait += scf_stats_p[i].n_many_wait; + scfs.n_all += scf_stats_p[i].n_all; + scfs.n_all_wait += scf_stats_p[i].n_all_wait; + } + if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || + atomic_read(&n_mb_out_errs) || atomic_read(&n_alloc_errs)) + bangstr = "!!! "; + pr_alert("%s %sscf_invoked_count %s: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ", + SCFTORT_FLAG, bangstr, isdone ? "VER" : "ver", invoked_count, + scfs.n_single, scfs.n_single_wait, scfs.n_single_ofl, scfs.n_single_wait_ofl, + scfs.n_many, scfs.n_many_wait, scfs.n_all, scfs.n_all_wait); + torture_onoff_stats(); + pr_cont("ste: %d stnmie: %d stnmoe: %d staf: %d\n", atomic_read(&n_errs), + atomic_read(&n_mb_in_errs), atomic_read(&n_mb_out_errs), + atomic_read(&n_alloc_errs)); +} + +// Periodically prints torture statistics, if periodic statistics printing +// was specified via the stat_interval module parameter. +static int +scf_torture_stats(void *arg) +{ + VERBOSE_TOROUT_STRING("scf_torture_stats task started"); + do { + schedule_timeout_interruptible(stat_interval * HZ); + scf_torture_stats_print(); + torture_shutdown_absorb("scf_torture_stats"); + } while (!torture_must_stop()); + torture_kthread_stopping("scf_torture_stats"); + return 0; +} + +// Add a primitive to the scf_sel_array[]. +static void scf_sel_add(unsigned long weight, int prim, bool wait) +{ + struct scf_selector *scfsp = &scf_sel_array[scf_sel_array_len]; + + // If no weight, if array would overflow, if computing three-place + // percentages would overflow, or if the scf_prim_name[] array would + // overflow, don't bother. In the last three two cases, complain. + if (!weight || + WARN_ON_ONCE(scf_sel_array_len >= ARRAY_SIZE(scf_sel_array)) || + WARN_ON_ONCE(0 - 100000 * weight <= 100000 * scf_sel_totweight) || + WARN_ON_ONCE(prim >= ARRAY_SIZE(scf_prim_name))) + return; + scf_sel_totweight += weight; + scfsp->scfs_weight = scf_sel_totweight; + scfsp->scfs_prim = prim; + scfsp->scfs_wait = wait; + scf_sel_array_len++; +} + +// Dump out weighting percentages for scf_prim_name[] array. +static void scf_sel_dump(void) +{ + int i; + unsigned long oldw = 0; + struct scf_selector *scfsp; + unsigned long w; + + for (i = 0; i < scf_sel_array_len; i++) { + scfsp = &scf_sel_array[i]; + w = (scfsp->scfs_weight - oldw) * 100000 / scf_sel_totweight; + pr_info("%s: %3lu.%03lu %s(%s)\n", __func__, w / 1000, w % 1000, + scf_prim_name[scfsp->scfs_prim], + scfsp->scfs_wait ? "wait" : "nowait"); + oldw = scfsp->scfs_weight; + } +} + +// Randomly pick a primitive and wait/nowait, based on weightings. +static struct scf_selector *scf_sel_rand(struct torture_random_state *trsp) +{ + int i; + unsigned long w = torture_random(trsp) % (scf_sel_totweight + 1); + + for (i = 0; i < scf_sel_array_len; i++) + if (scf_sel_array[i].scfs_weight >= w) + return &scf_sel_array[i]; + WARN_ON_ONCE(1); + return &scf_sel_array[0]; +} + +// Update statistics and occasionally burn up mass quantities of CPU time, +// if told to do so via scftorture.longwait. Otherwise, occasionally burn +// a little bit. +static void scf_handler(void *scfc_in) +{ + int i; + int j; + unsigned long r = torture_random(this_cpu_ptr(&scf_torture_rand)); + struct scf_check *scfcp = scfc_in; + + if (likely(scfcp)) { + WRITE_ONCE(scfcp->scfc_out, false); // For multiple receivers. + if (WARN_ON_ONCE(unlikely(!READ_ONCE(scfcp->scfc_in)))) + atomic_inc(&n_mb_in_errs); + } + this_cpu_inc(scf_invoked_count); + if (longwait <= 0) { + if (!(r & 0xffc0)) + udelay(r & 0x3f); + goto out; + } + if (r & 0xfff) + goto out; + r = (r >> 12); + if (longwait <= 0) { + udelay((r & 0xff) + 1); + goto out; + } + r = r % longwait + 1; + for (i = 0; i < r; i++) { + for (j = 0; j < 1000; j++) { + udelay(1000); + cpu_relax(); + } + } +out: + if (unlikely(!scfcp)) + return; + if (scfcp->scfc_wait) + WRITE_ONCE(scfcp->scfc_out, true); + else + kfree(scfcp); +} + +// As above, but check for correct CPU. +static void scf_handler_1(void *scfc_in) +{ + struct scf_check *scfcp = scfc_in; + + if (likely(scfcp) && WARN_ONCE(smp_processor_id() != scfcp->scfc_cpu, "%s: Wanted CPU %d got CPU %d\n", __func__, scfcp->scfc_cpu, smp_processor_id())) { + atomic_inc(&n_errs); + } + scf_handler(scfcp); +} + +// Randomly do an smp_call_function*() invocation. +static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_random_state *trsp) +{ + uintptr_t cpu; + int ret = 0; + struct scf_check *scfcp = NULL; + struct scf_selector *scfsp = scf_sel_rand(trsp); + + if (use_cpus_read_lock) + cpus_read_lock(); + else + preempt_disable(); + if (scfsp->scfs_prim == SCF_PRIM_SINGLE || scfsp->scfs_wait) { + scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC); + if (WARN_ON_ONCE(!scfcp)) { + atomic_inc(&n_alloc_errs); + } else { + scfcp->scfc_cpu = -1; + scfcp->scfc_wait = scfsp->scfs_wait; + scfcp->scfc_out = false; + } + } + switch (scfsp->scfs_prim) { + case SCF_PRIM_SINGLE: + cpu = torture_random(trsp) % nr_cpu_ids; + if (scfsp->scfs_wait) + scfp->n_single_wait++; + else + scfp->n_single++; + if (scfcp) { + scfcp->scfc_cpu = cpu; + barrier(); // Prevent race-reduction compiler optimizations. + scfcp->scfc_in = true; + } + ret = smp_call_function_single(cpu, scf_handler_1, (void *)scfcp, scfsp->scfs_wait); + if (ret) { + if (scfsp->scfs_wait) + scfp->n_single_wait_ofl++; + else + scfp->n_single_ofl++; + kfree(scfcp); + scfcp = NULL; + } + break; + case SCF_PRIM_MANY: + if (scfsp->scfs_wait) + scfp->n_many_wait++; + else + scfp->n_many++; + if (scfcp) { + barrier(); // Prevent race-reduction compiler optimizations. + scfcp->scfc_in = true; + } + smp_call_function_many(cpu_online_mask, scf_handler, scfcp, scfsp->scfs_wait); + break; + case SCF_PRIM_ALL: + if (scfsp->scfs_wait) + scfp->n_all_wait++; + else + scfp->n_all++; + if (scfcp) { + barrier(); // Prevent race-reduction compiler optimizations. + scfcp->scfc_in = true; + } + smp_call_function(scf_handler, scfcp, scfsp->scfs_wait); + break; + default: + WARN_ON_ONCE(1); + if (scfcp) + scfcp->scfc_out = true; + } + if (scfcp && scfsp->scfs_wait) { + if (WARN_ON_ONCE((num_online_cpus() > 1 || scfsp->scfs_prim == SCF_PRIM_SINGLE) && + !scfcp->scfc_out)) + atomic_inc(&n_mb_out_errs); // Leak rather than trash! + else + kfree(scfcp); + barrier(); // Prevent race-reduction compiler optimizations. + } + if (use_cpus_read_lock) + cpus_read_unlock(); + else + preempt_enable(); + if (!(torture_random(trsp) & 0xfff)) + schedule_timeout_uninterruptible(1); +} + +// SCF test kthread. Repeatedly does calls to members of the +// smp_call_function() family of functions. +static int scftorture_invoker(void *arg) +{ + int cpu; + DEFINE_TORTURE_RANDOM(rand); + struct scf_statistics *scfp = (struct scf_statistics *)arg; + bool was_offline = false; + + VERBOSE_SCFTORTOUT("scftorture_invoker %d: task started", scfp->cpu); + cpu = scfp->cpu % nr_cpu_ids; + set_cpus_allowed_ptr(current, cpumask_of(cpu)); + set_user_nice(current, MAX_NICE); + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); + + VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, smp_processor_id()); + + // Make sure that the CPU is affinitized appropriately during testing. + WARN_ON_ONCE(smp_processor_id() != scfp->cpu); + + if (!atomic_dec_return(&n_started)) + while (atomic_read_acquire(&n_started)) { + if (torture_must_stop()) { + VERBOSE_SCFTORTOUT("scftorture_invoker %d ended before starting", scfp->cpu); + goto end; + } + schedule_timeout_uninterruptible(1); + } + + VERBOSE_SCFTORTOUT("scftorture_invoker %d started", scfp->cpu); + + do { + scftorture_invoke_one(scfp, &rand); + while (cpu_is_offline(cpu) && !torture_must_stop()) { + schedule_timeout_interruptible(HZ / 5); + was_offline = true; + } + if (was_offline) { + set_cpus_allowed_ptr(current, cpumask_of(cpu)); + was_offline = false; + } + cond_resched(); + } while (!torture_must_stop()); + + VERBOSE_SCFTORTOUT("scftorture_invoker %d ended", scfp->cpu); +end: + torture_kthread_stopping("scftorture_invoker"); + return 0; +} + +static void +scftorture_print_module_parms(const char *tag) +{ + pr_alert(SCFTORT_FLAG + "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter_cpus=%d use_cpus_read_lock=%d, weight_single=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, + verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter_cpus, use_cpus_read_lock, weight_single, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait); +} + +static void scf_cleanup_handler(void *unused) +{ +} + +static void scf_torture_cleanup(void) +{ + int i; + + if (torture_cleanup_begin()) + return; + + WRITE_ONCE(scfdone, true); + if (nthreads) + for (i = 0; i < nthreads; i++) + torture_stop_kthread("scftorture_invoker", scf_stats_p[i].task); + else + goto end; + smp_call_function(scf_cleanup_handler, NULL, 0); + torture_stop_kthread(scf_torture_stats, scf_torture_stats_task); + scf_torture_stats_print(); // -After- the stats thread is stopped! + kfree(scf_stats_p); // -After- the last stats print has completed! + scf_stats_p = NULL; + + if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || atomic_read(&n_mb_out_errs)) + scftorture_print_module_parms("End of test: FAILURE"); + else if (torture_onoff_failures()) + scftorture_print_module_parms("End of test: LOCK_HOTPLUG"); + else + scftorture_print_module_parms("End of test: SUCCESS"); + +end: + torture_cleanup_end(); +} + +static int __init scf_torture_init(void) +{ + long i; + int firsterr = 0; + unsigned long weight_single1 = weight_single; + unsigned long weight_single_wait1 = weight_single_wait; + unsigned long weight_many1 = weight_many; + unsigned long weight_many_wait1 = weight_many_wait; + unsigned long weight_all1 = weight_all; + unsigned long weight_all_wait1 = weight_all_wait; + + if (!torture_init_begin(SCFTORT_STRING, verbose)) + return -EBUSY; + + scftorture_print_module_parms("Start of test"); + + if (weight_single == -1 && weight_single_wait == -1 && + weight_many == -1 && weight_many_wait == -1 && + weight_all == -1 && weight_all_wait == -1) { + weight_single1 = 2 * nr_cpu_ids; + weight_single_wait1 = 2 * nr_cpu_ids; + weight_many1 = 2; + weight_many_wait1 = 2; + weight_all1 = 1; + weight_all_wait1 = 1; + } else { + if (weight_single == -1) + weight_single1 = 0; + if (weight_single_wait == -1) + weight_single_wait1 = 0; + if (weight_many == -1) + weight_many1 = 0; + if (weight_many_wait == -1) + weight_many_wait1 = 0; + if (weight_all == -1) + weight_all1 = 0; + if (weight_all_wait == -1) + weight_all_wait1 = 0; + } + if (weight_single1 == 0 && weight_single_wait1 == 0 && + weight_many1 == 0 && weight_many_wait1 == 0 && + weight_all1 == 0 && weight_all_wait1 == 0) { + VERBOSE_SCFTORTOUT_ERRSTRING("all zero weights makes no sense"); + firsterr = -EINVAL; + goto unwind; + } + scf_sel_add(weight_single1, SCF_PRIM_SINGLE, false); + scf_sel_add(weight_single_wait1, SCF_PRIM_SINGLE, true); + scf_sel_add(weight_many1, SCF_PRIM_MANY, false); + scf_sel_add(weight_many_wait1, SCF_PRIM_MANY, true); + scf_sel_add(weight_all1, SCF_PRIM_ALL, false); + scf_sel_add(weight_all_wait1, SCF_PRIM_ALL, true); + scf_sel_dump(); + + if (onoff_interval > 0) { + firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval, NULL); + if (firsterr) + goto unwind; + } + if (shutdown_secs > 0) { + firsterr = torture_shutdown_init(shutdown_secs, scf_torture_cleanup); + if (firsterr) + goto unwind; + } + + // Worker tasks invoking smp_call_function(). + if (nthreads < 0) + nthreads = num_online_cpus(); + scf_stats_p = kcalloc(nthreads, sizeof(scf_stats_p[0]), GFP_KERNEL); + if (!scf_stats_p) { + VERBOSE_SCFTORTOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + + VERBOSE_SCFTORTOUT("Starting %d smp_call_function() threads\n", nthreads); + + atomic_set(&n_started, nthreads); + for (i = 0; i < nthreads; i++) { + scf_stats_p[i].cpu = i; + firsterr = torture_create_kthread(scftorture_invoker, (void *)&scf_stats_p[i], + scf_stats_p[i].task); + if (firsterr) + goto unwind; + } + if (stat_interval > 0) { + firsterr = torture_create_kthread(scf_torture_stats, NULL, scf_torture_stats_task); + if (firsterr) + goto unwind; + } + + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + scf_torture_cleanup(); + return firsterr; +} + +module_init(scf_torture_init); +module_exit(scf_torture_cleanup); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8471a0f7eb32..d2003a7d5ab5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -36,6 +36,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); @@ -43,7 +44,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) +#ifdef CONFIG_SCHED_DEBUG /* * Debugging: various feature bits * @@ -940,11 +941,6 @@ static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) return clamp_value / UCLAMP_BUCKET_DELTA; } -static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) -{ - return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); -} - static inline unsigned int uclamp_none(enum uclamp_id clamp_id) { if (clamp_id == UCLAMP_MIN) @@ -2320,7 +2316,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) state = possible; break; } - /* Fall-through */ + fallthrough; case possible: do_set_cpus_allowed(p, cpu_possible_mask); state = fail; @@ -4551,9 +4547,12 @@ void __noreturn do_task_dead(void) static inline void sched_submit_work(struct task_struct *tsk) { + unsigned int task_flags; + if (!tsk->state) return; + task_flags = tsk->flags; /* * If a worker went to sleep, notify and ask workqueue whether * it wants to wake up a task to maintain concurrency. @@ -4562,9 +4561,9 @@ static inline void sched_submit_work(struct task_struct *tsk) * in the possible wakeup of a kworker and because wq_worker_sleeping() * requires it. */ - if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { preempt_disable(); - if (tsk->flags & PF_WQ_WORKER) + if (task_flags & PF_WQ_WORKER) wq_worker_sleeping(tsk); else io_wq_worker_sleeping(tsk); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e39008242cf4..c03a5775d019 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -102,7 +102,8 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - if (sg_policy->next_freq == next_freq) + if (sg_policy->next_freq == next_freq && + !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS)) return false; sg_policy->next_freq = next_freq; @@ -114,22 +115,8 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - struct cpufreq_policy *policy = sg_policy->policy; - int cpu; - - if (!sugov_update_next_freq(sg_policy, time, next_freq)) - return; - - next_freq = cpufreq_driver_fast_switch(policy, next_freq); - if (!next_freq) - return; - - policy->cur = next_freq; - - if (trace_cpu_frequency_enabled()) { - for_each_cpu(cpu, policy->cpus) - trace_cpu_frequency(next_freq, cpu); - } + if (sugov_update_next_freq(sg_policy, time, next_freq)) + cpufreq_driver_fast_switch(sg_policy->policy, next_freq); } static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time, @@ -175,7 +162,8 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, freq = map_util_freq(util, freq, max); - if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) + if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update && + !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS)) return sg_policy->next_freq; sg_policy->need_freq_update = false; @@ -455,6 +443,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned long util, max; unsigned int next_f; bool busy; + unsigned int cached_freq = sg_policy->cached_raw_freq; sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -478,8 +467,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (busy && next_f < sg_policy->next_freq) { next_f = sg_policy->next_freq; - /* Reset cached freq as next_freq has changed */ - sg_policy->cached_raw_freq = 0; + /* Restore cached freq as next_freq has changed */ + sg_policy->cached_raw_freq = cached_freq; } /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 3862a28cd05d..f232305dcefe 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1525,14 +1525,38 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) */ if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { pi_se = &pi_task->dl; + /* + * Because of delays in the detection of the overrun of a + * thread's runtime, it might be the case that a thread + * goes to sleep in a rt mutex with negative runtime. As + * a consequence, the thread will be throttled. + * + * While waiting for the mutex, this thread can also be + * boosted via PI, resulting in a thread that is throttled + * and boosted at the same time. + * + * In this case, the boost overrides the throttle. + */ + if (p->dl.dl_throttled) { + /* + * The replenish timer needs to be canceled. No + * problem if it fires concurrently: boosted threads + * are ignored in dl_task_timer(). + */ + hrtimer_try_to_cancel(&p->dl.dl_timer); + p->dl.dl_throttled = 0; + } } else if (!dl_prio(p->normal_prio)) { /* - * Special case in which we have a !SCHED_DEADLINE task - * that is going to be deboosted, but exceeds its - * runtime while doing so. No point in replenishing - * it, as it's going to return back to its original - * scheduling class after this. + * Special case in which we have a !SCHED_DEADLINE task that is going + * to be deboosted, but exceeds its runtime while doing so. No point in + * replenishing it, as it's going to return back to its original + * scheduling class after this. If it has been throttled, we need to + * clear the flag, otherwise the task may wake up as throttled after + * being boosted again with no means to replenish the runtime and clear + * the throttle. */ + p->dl.dl_throttled = 0; BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); return; } @@ -2480,7 +2504,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, } const struct sched_class dl_sched_class - __attribute__((section("__dl_sched_class"))) = { + __section("__dl_sched_class") = { .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 36c54265bb2b..0655524700d2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -245,6 +245,60 @@ set_table_entry(struct ctl_table *entry, entry->proc_handler = proc_handler; } +static int sd_ctl_doflags(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + unsigned long flags = *(unsigned long *)table->data; + size_t data_size = 0; + size_t len = 0; + char *tmp; + int idx; + + if (write) + return 0; + + for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { + char *name = sd_flag_debug[idx].name; + + /* Name plus whitespace */ + data_size += strlen(name) + 1; + } + + if (*ppos > data_size) { + *lenp = 0; + return 0; + } + + tmp = kcalloc(data_size + 1, sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { + char *name = sd_flag_debug[idx].name; + + len += snprintf(tmp + len, strlen(name) + 2, "%s ", name); + } + + tmp += *ppos; + len -= *ppos; + + if (len > *lenp) + len = *lenp; + if (len) + memcpy(buffer, tmp, len); + if (len < *lenp) { + ((char *)buffer)[len] = '\n'; + len++; + } + + *lenp = len; + *ppos += len; + + kfree(tmp); + + return 0; +} + static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { @@ -258,7 +312,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax); + set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, sd_ctl_doflags); set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); /* &table[8] is terminator */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1a68a0536add..290f9e38378c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -831,7 +831,7 @@ void init_entity_runnable_average(struct sched_entity *se) void post_init_entity_util_avg(struct task_struct *p) { } -static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +static void update_tg_load_avg(struct cfs_rq *cfs_rq) { } #endif /* CONFIG_SMP */ @@ -1504,6 +1504,7 @@ enum numa_type { /* Cached statistics for all CPUs within a node */ struct numa_stats { unsigned long load; + unsigned long runnable; unsigned long util; /* Total compute capacity of CPUs on a node */ unsigned long compute_capacity; @@ -1547,19 +1548,22 @@ struct task_numa_env { }; static unsigned long cpu_load(struct rq *rq); +static unsigned long cpu_runnable(struct rq *rq); static unsigned long cpu_util(int cpu); -static inline long adjust_numa_imbalance(int imbalance, int src_nr_running); +static inline long adjust_numa_imbalance(int imbalance, int nr_running); static inline enum numa_type numa_classify(unsigned int imbalance_pct, struct numa_stats *ns) { if ((ns->nr_running > ns->weight) && - ((ns->compute_capacity * 100) < (ns->util * imbalance_pct))) + (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) || + ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100)))) return node_overloaded; if ((ns->nr_running < ns->weight) || - ((ns->compute_capacity * 100) > (ns->util * imbalance_pct))) + (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) && + ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100)))) return node_has_spare; return node_fully_busy; @@ -1610,6 +1614,7 @@ static void update_numa_stats(struct task_numa_env *env, struct rq *rq = cpu_rq(cpu); ns->load += cpu_load(rq); + ns->runnable += cpu_runnable(rq); ns->util += cpu_util(cpu); ns->nr_running += rq->cfs.h_nr_running; ns->compute_capacity += capacity_of(cpu); @@ -1925,7 +1930,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, src_running = env->src_stats.nr_running - 1; dst_running = env->dst_stats.nr_running + 1; imbalance = max(0, dst_running - src_running); - imbalance = adjust_numa_imbalance(imbalance, src_running); + imbalance = adjust_numa_imbalance(imbalance, dst_running); /* Use idle CPU if there is no imbalance */ if (!imbalance) { @@ -2923,7 +2928,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) curr->node_stamp += period; if (!time_before(jiffies, curr->mm->numa_next_scan)) - task_work_add(curr, work, true); + task_work_add(curr, work, TWA_RESUME); } } @@ -3084,7 +3089,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, /* commit outstanding execution time */ if (cfs_rq->curr == se) update_curr(cfs_rq); - account_entity_dequeue(cfs_rq, se); + update_load_sub(&cfs_rq->load, se->load.weight); } dequeue_load_avg(cfs_rq, se); @@ -3100,7 +3105,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, enqueue_load_avg(cfs_rq, se); if (se->on_rq) - account_entity_enqueue(cfs_rq, se); + update_load_add(&cfs_rq->load, se->load.weight); } @@ -3288,7 +3293,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) /** * update_tg_load_avg - update the tg's load avg * @cfs_rq: the cfs_rq whose avg changed - * @force: update regardless of how small the difference * * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. * However, because tg->load_avg is a global value there are performance @@ -3300,7 +3304,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) * * Updating tg's load_avg is necessary before update_cfs_share(). */ -static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) { long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; @@ -3310,7 +3314,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) if (cfs_rq->tg == &root_task_group) return; - if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { + if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; } @@ -3612,7 +3616,7 @@ static inline bool skip_blocked_update(struct sched_entity *se) #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {} static inline int propagate_entity_load_avg(struct sched_entity *se) { @@ -3800,13 +3804,13 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s * IOW we're enqueueing a task on a new CPU. */ attach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, 0); + update_tg_load_avg(cfs_rq); } else if (decayed) { cfs_rq_util_change(cfs_rq, 0); if (flags & UPDATE_TG) - update_tg_load_avg(cfs_rq, 0); + update_tg_load_avg(cfs_rq); } } @@ -4461,17 +4465,17 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) se = second; } - /* - * Prefer last buddy, try to return the CPU to a preempted task. - */ - if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) - se = cfs_rq->last; - - /* - * Someone really wants this to run. If it's not unfair, run it. - */ - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) { + /* + * Someone really wants this to run. If it's not unfair, run it. + */ se = cfs_rq->next; + } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) { + /* + * Prefer last buddy, try to return the CPU to a preempted task. + */ + se = cfs_rq->last; + } clear_buddies(cfs_rq, se); @@ -6075,7 +6079,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int /* * Scan the local SMT mask for idle CPUs. */ -static int select_idle_smt(struct task_struct *p, int target) +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) { int cpu; @@ -6083,7 +6087,8 @@ static int select_idle_smt(struct task_struct *p, int target) return -1; for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + if (!cpumask_test_cpu(cpu, p->cpus_ptr) || + !cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; @@ -6099,7 +6104,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s return -1; } -static inline int select_idle_smt(struct task_struct *p, int target) +static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) { return -1; } @@ -6274,7 +6279,7 @@ symmetric: if ((unsigned)i < nr_cpumask_bits) return i; - i = select_idle_smt(p, target); + i = select_idle_smt(p, sd, target); if ((unsigned)i < nr_cpumask_bits) return i; @@ -6594,7 +6599,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) util = cpu_util_next(cpu, p, cpu); cpu_cap = capacity_of(cpu); - spare_cap = cpu_cap - util; + spare_cap = cpu_cap; + lsub_positive(&spare_cap, util); /* * Skip CPUs that cannot satisfy the capacity request. @@ -7402,6 +7408,10 @@ static int task_hot(struct task_struct *p, struct lb_env *env) if (unlikely(task_has_idle_policy(p))) return 0; + /* SMT siblings share cache */ + if (env->sd->flags & SD_SHARE_CPUCAPACITY) + return 0; + /* * Buddy candidates are cache hot: */ @@ -7669,8 +7679,8 @@ static int detach_tasks(struct lb_env *env) * scheduler fails to find a good waiting task to * migrate. */ - if (load/2 > env->imbalance && - env->sd->nr_balance_failed <= env->sd->cache_nice_tries) + + if ((load >> env->sd->nr_balance_failed) > env->imbalance) goto next; env->imbalance -= load; @@ -7887,7 +7897,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) struct sched_entity *se; if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) { - update_tg_load_avg(cfs_rq, 0); + update_tg_load_avg(cfs_rq); if (cfs_rq == &rq->cfs) decayed = true; @@ -8098,6 +8108,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) capacity = 1; cpu_rq(cpu)->cpu_capacity = capacity; + trace_sched_cpu_capacity_tp(cpu_rq(cpu)); + sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; sdg->sgc->max_capacity = capacity; @@ -8957,7 +8969,7 @@ next_group: } } -static inline long adjust_numa_imbalance(int imbalance, int src_nr_running) +static inline long adjust_numa_imbalance(int imbalance, int nr_running) { unsigned int imbalance_min; @@ -8966,7 +8978,7 @@ static inline long adjust_numa_imbalance(int imbalance, int src_nr_running) * tasks that remain local when the source domain is almost idle. */ imbalance_min = 2; - if (src_nr_running <= imbalance_min) + if (nr_running <= imbalance_min) return 0; return imbalance; @@ -9780,6 +9792,15 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) /* scale ms to jiffies */ interval = msecs_to_jiffies(interval); + + /* + * Reduce likelihood of busy balancing at higher domains racing with + * balancing at lower domains by preventing their balancing periods + * from being multiples of each other. + */ + if (cpu_busy) + interval -= 1; + interval = clamp(interval, 1UL, max_load_balance_interval); return interval; @@ -10786,7 +10807,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se) /* Catch up with the cfs_rq and remove our load when we leave */ update_load_avg(cfs_rq, se, 0); detach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq); propagate_entity_cfs_rq(se); } @@ -10805,7 +10826,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) /* Synchronize entity with its cfs_rq */ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq); propagate_entity_cfs_rq(se); } @@ -11138,7 +11159,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task * All the scheduling class methods: */ const struct sched_class fair_sched_class - __attribute__((section("__fair_sched_class"))) = { + __section("__fair_sched_class") = { .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, @@ -11302,6 +11323,18 @@ int sched_trace_rq_cpu(struct rq *rq) } EXPORT_SYMBOL_GPL(sched_trace_rq_cpu); +int sched_trace_rq_cpu_capacity(struct rq *rq) +{ + return rq ? +#ifdef CONFIG_SMP + rq->cpu_capacity +#else + SCHED_CAPACITY_SCALE +#endif + : -1; +} +EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity); + const struct cpumask *sched_trace_rd_span(struct root_domain *rd) { #ifdef CONFIG_SMP diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 7481cd96f391..68d369cba9e4 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -77,7 +77,7 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false) SCHED_FEAT(RT_PUSH_IPI, true) #endif -SCHED_FEAT(RT_RUNTIME_SHARE, true) +SCHED_FEAT(RT_RUNTIME_SHARE, false) SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 6bf34986f45c..24d0ee26377d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -54,17 +54,18 @@ __setup("hlt", cpu_idle_nopoll_setup); static noinline int __cpuidle cpu_idle_poll(void) { + trace_cpu_idle(0, smp_processor_id()); + stop_critical_timings(); rcu_idle_enter(); - trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); - stop_critical_timings(); while (!tif_need_resched() && - (cpu_idle_force_poll || tick_check_broadcast_expired())) + (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); - start_critical_timings(); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); + rcu_idle_exit(); + start_critical_timings(); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); return 1; } @@ -90,9 +91,14 @@ void __cpuidle default_idle_call(void) if (current_clr_polling_and_test()) { local_irq_enable(); } else { + + trace_cpu_idle(1, smp_processor_id()); stop_critical_timings(); + rcu_idle_enter(); arch_cpu_idle(); + rcu_idle_exit(); start_critical_timings(); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } } @@ -158,7 +164,6 @@ static void cpuidle_idle_call(void) if (cpuidle_not_available(drv, dev)) { tick_nohz_idle_stop_tick(); - rcu_idle_enter(); default_idle_call(); goto exit_idle; @@ -178,21 +183,17 @@ static void cpuidle_idle_call(void) u64 max_latency_ns; if (idle_should_enter_s2idle()) { - rcu_idle_enter(); entered_state = call_cpuidle_s2idle(drv, dev); if (entered_state > 0) goto exit_idle; - rcu_idle_exit(); - max_latency_ns = U64_MAX; } else { max_latency_ns = dev->forced_idle_latency_limit_ns; } tick_nohz_idle_stop_tick(); - rcu_idle_enter(); next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns); call_cpuidle(drv, dev, next_state); @@ -209,8 +210,6 @@ static void cpuidle_idle_call(void) else tick_nohz_idle_retain_tick(); - rcu_idle_enter(); - entered_state = call_cpuidle(drv, dev, next_state); /* * Give the governor an opportunity to reflect on the outcome @@ -226,8 +225,6 @@ exit_idle: */ if (WARN_ON_ONCE(irqs_disabled())) local_irq_enable(); - - rcu_idle_exit(); } /* @@ -461,7 +458,7 @@ static void update_curr_idle(struct rq *rq) * Simple, special scheduling class for the per-CPU idle tasks: */ const struct sched_class idle_sched_class - __attribute__((section("__idle_sched_class"))) = { + __section("__idle_sched_class") = { /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 168479a7d61b..e23e74d52db5 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -18,6 +18,14 @@ #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 #endif +#ifdef CONFIG_RSEQ +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK \ + (MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \ + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ_BITMASK) +#else +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK 0 +#endif + #define MEMBARRIER_CMD_BITMASK \ (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ @@ -30,6 +38,11 @@ static void ipi_mb(void *info) smp_mb(); /* IPIs should be serializing but paranoid. */ } +static void ipi_rseq(void *info) +{ + rseq_preempt(current); +} + static void ipi_sync_rq_state(void *info) { struct mm_struct *mm = (struct mm_struct *) info; @@ -129,19 +142,27 @@ static int membarrier_global_expedited(void) return 0; } -static int membarrier_private_expedited(int flags) +static int membarrier_private_expedited(int flags, int cpu_id) { - int cpu; cpumask_var_t tmpmask; struct mm_struct *mm = current->mm; + smp_call_func_t ipi_func = ipi_mb; - if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (flags == MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) return -EPERM; + } else if (flags == MEMBARRIER_FLAG_RSEQ) { + if (!IS_ENABLED(CONFIG_RSEQ)) + return -EINVAL; + if (!(atomic_read(&mm->membarrier_state) & + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY)) + return -EPERM; + ipi_func = ipi_rseq; } else { + WARN_ON_ONCE(flags); if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) return -EPERM; @@ -156,35 +177,59 @@ static int membarrier_private_expedited(int flags) */ smp_mb(); /* system call entry is not a mb. */ - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) return -ENOMEM; cpus_read_lock(); - rcu_read_lock(); - for_each_online_cpu(cpu) { + + if (cpu_id >= 0) { struct task_struct *p; - /* - * Skipping the current CPU is OK even through we can be - * migrated at any point. The current CPU, at the point - * where we read raw_smp_processor_id(), is ensured to - * be in program order with respect to the caller - * thread. Therefore, we can skip this CPU from the - * iteration. - */ - if (cpu == raw_smp_processor_id()) - continue; - p = rcu_dereference(cpu_rq(cpu)->curr); - if (p && p->mm == mm) - __cpumask_set_cpu(cpu, tmpmask); + if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id)) + goto out; + if (cpu_id == raw_smp_processor_id()) + goto out; + rcu_read_lock(); + p = rcu_dereference(cpu_rq(cpu_id)->curr); + if (!p || p->mm != mm) { + rcu_read_unlock(); + goto out; + } + rcu_read_unlock(); + } else { + int cpu; + + rcu_read_lock(); + for_each_online_cpu(cpu) { + struct task_struct *p; + + /* + * Skipping the current CPU is OK even through we can be + * migrated at any point. The current CPU, at the point + * where we read raw_smp_processor_id(), is ensured to + * be in program order with respect to the caller + * thread. Therefore, we can skip this CPU from the + * iteration. + */ + if (cpu == raw_smp_processor_id()) + continue; + p = rcu_dereference(cpu_rq(cpu)->curr); + if (p && p->mm == mm) + __cpumask_set_cpu(cpu, tmpmask); + } + rcu_read_unlock(); } - rcu_read_unlock(); preempt_disable(); - smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + if (cpu_id >= 0) + smp_call_function_single(cpu_id, ipi_func, NULL, 1); + else + smp_call_function_many(tmpmask, ipi_func, NULL, 1); preempt_enable(); - free_cpumask_var(tmpmask); +out: + if (cpu_id < 0) + free_cpumask_var(tmpmask); cpus_read_unlock(); /* @@ -283,11 +328,18 @@ static int membarrier_register_private_expedited(int flags) set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED, ret; - if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (flags == MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; + } else if (flags == MEMBARRIER_FLAG_RSEQ) { + if (!IS_ENABLED(CONFIG_RSEQ)) + return -EINVAL; + ready_state = + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY; + } else { + WARN_ON_ONCE(flags); } /* @@ -299,6 +351,8 @@ static int membarrier_register_private_expedited(int flags) return 0; if (flags & MEMBARRIER_FLAG_SYNC_CORE) set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE; + if (flags & MEMBARRIER_FLAG_RSEQ) + set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ; atomic_or(set_state, &mm->membarrier_state); ret = sync_runqueues_membarrier_state(mm); if (ret) @@ -310,8 +364,15 @@ static int membarrier_register_private_expedited(int flags) /** * sys_membarrier - issue memory barriers on a set of threads - * @cmd: Takes command values defined in enum membarrier_cmd. - * @flags: Currently needs to be 0. For future extensions. + * @cmd: Takes command values defined in enum membarrier_cmd. + * @flags: Currently needs to be 0 for all commands other than + * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter + * case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id + * contains the CPU on which to interrupt (= restart) + * the RSEQ critical section. + * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which + * RSEQ CS should be interrupted (@cmd must be + * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ). * * If this system call is not implemented, -ENOSYS is returned. If the * command specified does not exist, not available on the running @@ -337,10 +398,21 @@ static int membarrier_register_private_expedited(int flags) * smp_mb() X O O * sys_membarrier() O O O */ -SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) +SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id) { - if (unlikely(flags)) - return -EINVAL; + switch (cmd) { + case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: + if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU)) + return -EINVAL; + break; + default: + if (unlikely(flags)) + return -EINVAL; + } + + if (!(flags & MEMBARRIER_CMD_FLAG_CPU)) + cpu_id = -1; + switch (cmd) { case MEMBARRIER_CMD_QUERY: { @@ -362,13 +434,17 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: return membarrier_register_global_expedited(); case MEMBARRIER_CMD_PRIVATE_EXPEDITED: - return membarrier_private_expedited(0); + return membarrier_private_expedited(0, cpu_id); case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: return membarrier_register_private_expedited(0); case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE: - return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); + return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id); case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE: return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); + case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: + return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id); + case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ: + return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ); default: return -EINVAL; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f215eea6a966..49ec096a8aa1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2430,7 +2430,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) } const struct sched_class rt_sched_class - __attribute__((section("__rt_sched_class"))) = { + __section("__rt_sched_class") = { .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 28709f6b0975..df80bfcea92e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1471,7 +1471,7 @@ struct sched_group_capacity { int id; #endif - unsigned long cpumask[0]; /* Balance mask */ + unsigned long cpumask[]; /* Balance mask */ }; struct sched_group { @@ -1629,7 +1629,7 @@ enum { #undef SCHED_FEAT -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) +#ifdef CONFIG_SCHED_DEBUG /* * To support run-time toggling of sched features, all the translation units @@ -1637,6 +1637,7 @@ enum { */ extern const_debug unsigned int sysctl_sched_features; +#ifdef CONFIG_JUMP_LABEL #define SCHED_FEAT(name, enabled) \ static __always_inline bool static_branch_##name(struct static_key *key) \ { \ @@ -1649,7 +1650,13 @@ static __always_inline bool static_branch_##name(struct static_key *key) \ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) -#else /* !(SCHED_DEBUG && CONFIG_JUMP_LABEL) */ +#else /* !CONFIG_JUMP_LABEL */ + +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) + +#endif /* CONFIG_JUMP_LABEL */ + +#else /* !SCHED_DEBUG */ /* * Each translation unit has its own copy of sysctl_sched_features to allow @@ -1665,7 +1672,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features = #define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) -#endif /* SCHED_DEBUG && CONFIG_JUMP_LABEL */ +#endif /* SCHED_DEBUG */ extern struct static_key_false sched_numa_balancing; extern struct static_key_false sched_schedstats; diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 394bc8126a1e..ceb5b6b12561 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -110,7 +110,7 @@ static void update_curr_stop(struct rq *rq) * Simple, special scheduling class for the per-CPU stop tasks: */ const struct sched_class stop_sched_class - __attribute__((section("__stop_sched_class"))) = { + __section("__stop_sched_class") = { .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 007b0a6b0152..dd7770226086 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -25,10 +25,18 @@ static inline bool sched_debug(void) return sched_debug_enabled; } +#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name }, +const struct sd_flag_debug sd_flag_debug[] = { +#include <linux/sched/sd_flags.h> +}; +#undef SD_FLAG + static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) { struct sched_group *group = sd->groups; + unsigned long flags = sd->flags; + unsigned int idx; cpumask_clear(groupmask); @@ -43,6 +51,21 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); } + for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { + unsigned int flag = BIT(idx); + unsigned int meta_flags = sd_flag_debug[idx].meta_flags; + + if ((meta_flags & SDF_SHARED_CHILD) && sd->child && + !(sd->child->flags & flag)) + printk(KERN_ERR "ERROR: flag %s set here but not in child\n", + sd_flag_debug[idx].name); + + if ((meta_flags & SDF_SHARED_PARENT) && sd->parent && + !(sd->parent->flags & flag)) + printk(KERN_ERR "ERROR: flag %s set here but not in parent\n", + sd_flag_debug[idx].name); + } + printk(KERN_DEBUG "%*s groups:", level + 1, ""); do { if (!group) { @@ -137,22 +160,22 @@ static inline bool sched_debug(void) } #endif /* CONFIG_SCHED_DEBUG */ +/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */ +#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) | +static const unsigned int SD_DEGENERATE_GROUPS_MASK = +#include <linux/sched/sd_flags.h> +0; +#undef SD_FLAG + static int sd_degenerate(struct sched_domain *sd) { if (cpumask_weight(sched_domain_span(sd)) == 1) return 1; /* Following flags need at least 2 groups */ - if (sd->flags & (SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_SHARE_CPUCAPACITY | - SD_ASYM_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_SHARE_POWERDOMAIN)) { - if (sd->groups != sd->groups->next) - return 0; - } + if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) && + (sd->groups != sd->groups->next)) + return 0; /* Following flags don't use groups */ if (sd->flags & (SD_WAKE_AFFINE)) @@ -173,18 +196,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 0; /* Flags needing groups don't count if only 1 group in parent */ - if (parent->groups == parent->groups->next) { - pflags &= ~(SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_ASYM_CPUCAPACITY | - SD_SHARE_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING | - SD_SHARE_POWERDOMAIN); - if (nr_node_ids == 1) - pflags &= ~SD_SERIALIZE; - } + if (parent->groups == parent->groups->next) + pflags &= ~SD_DEGENERATE_GROUPS_MASK; + if (~cflags & pflags) return 0; @@ -1219,13 +1233,13 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, case sa_rootdomain: if (!atomic_read(&d->rd->refcount)) free_rootdomain(&d->rd->rcu); - /* Fall through */ + fallthrough; case sa_sd: free_percpu(d->sd); - /* Fall through */ + fallthrough; case sa_sd_storage: __sdt_free(cpu_map); - /* Fall through */ + fallthrough; case sa_none: break; } @@ -1292,7 +1306,6 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; * SD_SHARE_CPUCAPACITY - describes SMT topologies * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies - * SD_SHARE_POWERDOMAIN - describes shared power domain * * Odd one out, which beside describing the topology has a quirk also * prescribes the desired behaviour that goes along with it: @@ -1303,8 +1316,7 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; (SD_SHARE_CPUCAPACITY | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ - SD_ASYM_PACKING | \ - SD_SHARE_POWERDOMAIN) + SD_ASYM_PACKING) static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, @@ -1336,8 +1348,8 @@ sd_init(struct sched_domain_topology_level *tl, *sd = (struct sched_domain){ .min_interval = sd_weight, .max_interval = 2*sd_weight, - .busy_factor = 32, - .imbalance_pct = 125, + .busy_factor = 16, + .imbalance_pct = 117, .cache_nice_tries = 0, @@ -1989,11 +2001,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att /* Set up domains for CPUs specified by the cpu_map: */ for_each_cpu(i, cpu_map) { struct sched_domain_topology_level *tl; + int dflags = 0; sd = NULL; for_each_sd_topology(tl) { - int dflags = 0; - if (tl == tl_asym) { dflags |= SD_ASYM_CPUCAPACITY; has_asym = true; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 3ee59ce0a323..8ad7a293255a 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -196,6 +196,10 @@ struct seccomp_filter { */ static void populate_seccomp_data(struct seccomp_data *sd) { + /* + * Instead of using current_pt_reg(), we're already doing the work + * to safely fetch "current", so just use "task" everywhere below. + */ struct task_struct *task = current; struct pt_regs *regs = task_pt_regs(task); unsigned long args[6]; @@ -910,7 +914,7 @@ out: if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) return 0; - syscall_set_return_value(current, task_pt_regs(current), + syscall_set_return_value(current, current_pt_regs(), err, ret); return -1; } @@ -943,13 +947,13 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, /* Set low-order bits as an errno, capped at MAX_ERRNO. */ if (data > MAX_ERRNO) data = MAX_ERRNO; - syscall_set_return_value(current, task_pt_regs(current), + syscall_set_return_value(current, current_pt_regs(), -data, 0); goto skip; case SECCOMP_RET_TRAP: /* Show the handler the original registers. */ - syscall_rollback(current, task_pt_regs(current)); + syscall_rollback(current, current_pt_regs()); /* Let the filter pass back 16 bits of data. */ seccomp_send_sigsys(this_syscall, data); goto skip; @@ -962,7 +966,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, /* ENOSYS these calls if there is no tracer attached. */ if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { syscall_set_return_value(current, - task_pt_regs(current), + current_pt_regs(), -ENOSYS, 0); goto skip; } @@ -982,7 +986,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, if (fatal_signal_pending(current)) goto skip; /* Check if the tracer forced the syscall to be skipped. */ - this_syscall = syscall_get_nr(current, task_pt_regs(current)); + this_syscall = syscall_get_nr(current, current_pt_regs()); if (this_syscall < 0) goto skip; @@ -1020,20 +1024,20 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, default: seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ - if (action == SECCOMP_RET_KILL_PROCESS || + if (action != SECCOMP_RET_KILL_THREAD || get_nr_threads(current) == 1) { kernel_siginfo_t info; /* Show the original registers in the dump. */ - syscall_rollback(current, task_pt_regs(current)); + syscall_rollback(current, current_pt_regs()); /* Trigger a manual coredump since do_exit skips it. */ seccomp_init_siginfo(&info, this_syscall, data); do_coredump(&info); } - if (action == SECCOMP_RET_KILL_PROCESS) - do_group_exit(SIGSYS); - else + if (action == SECCOMP_RET_KILL_THREAD) do_exit(SIGSYS); + else + do_group_exit(SIGSYS); } unreachable(); @@ -1060,7 +1064,7 @@ int __secure_computing(const struct seccomp_data *sd) return 0; this_syscall = sd ? sd->nr : - syscall_get_nr(current, task_pt_regs(current)); + syscall_get_nr(current, current_pt_regs()); switch (mode) { case SECCOMP_MODE_STRICT: @@ -1109,13 +1113,18 @@ out: } #ifdef CONFIG_SECCOMP_FILTER -static int seccomp_notify_release(struct inode *inode, struct file *file) +static void seccomp_notify_free(struct seccomp_filter *filter) +{ + kfree(filter->notif); + filter->notif = NULL; +} + +static void seccomp_notify_detach(struct seccomp_filter *filter) { - struct seccomp_filter *filter = file->private_data; struct seccomp_knotif *knotif; if (!filter) - return 0; + return; mutex_lock(&filter->notify_lock); @@ -1139,9 +1148,15 @@ static int seccomp_notify_release(struct inode *inode, struct file *file) complete(&knotif->ready); } - kfree(filter->notif); - filter->notif = NULL; + seccomp_notify_free(filter); mutex_unlock(&filter->notify_lock); +} + +static int seccomp_notify_release(struct inode *inode, struct file *file) +{ + struct seccomp_filter *filter = file->private_data; + + seccomp_notify_detach(filter); __put_seccomp_filter(filter); return 0; } @@ -1461,13 +1476,7 @@ static const struct file_operations seccomp_notify_ops = { static struct file *init_listener(struct seccomp_filter *filter) { - struct file *ret = ERR_PTR(-EBUSY); - struct seccomp_filter *cur; - - for (cur = current->seccomp.filter; cur; cur = cur->prev) { - if (cur->notif) - goto out; - } + struct file *ret; ret = ERR_PTR(-ENOMEM); filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL); @@ -1488,11 +1497,36 @@ static struct file *init_listener(struct seccomp_filter *filter) out_notif: if (IS_ERR(ret)) - kfree(filter->notif); + seccomp_notify_free(filter); out: return ret; } +/* + * Does @new_child have a listener while an ancestor also has a listener? + * If so, we'll want to reject this filter. + * This only has to be tested for the current process, even in the TSYNC case, + * because TSYNC installs @child with the same parent on all threads. + * Note that @new_child is not hooked up to its parent at this point yet, so + * we use current->seccomp.filter. + */ +static bool has_duplicate_listener(struct seccomp_filter *new_child) +{ + struct seccomp_filter *cur; + + /* must be protected against concurrent TSYNC */ + lockdep_assert_held(¤t->sighand->siglock); + + if (!new_child->notif) + return false; + for (cur = current->seccomp.filter; cur; cur = cur->prev) { + if (cur->notif) + return true; + } + + return false; +} + /** * seccomp_set_mode_filter: internal function for setting seccomp filter * @flags: flags to change filter behavior @@ -1564,6 +1598,11 @@ static long seccomp_set_mode_filter(unsigned int flags, if (!seccomp_may_assign_mode(seccomp_mode)) goto out; + if (has_duplicate_listener(prepared)) { + ret = -EBUSY; + goto out; + } + ret = seccomp_attach_filter(flags, prepared); if (ret) goto out; @@ -1581,6 +1620,7 @@ out_put_fd: listener_f->private_data = NULL; fput(listener_f); put_unused_fd(listener); + seccomp_notify_detach(prepared); } else { fd_install(listener, listener_f); ret = listener; diff --git a/kernel/signal.c b/kernel/signal.c index 61b377e65c46..f67ea9a08ac0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -851,7 +851,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info, */ if (!sid || sid == task_session(current)) break; - /* fall through */ + fallthrough; default: return -EPERM; } diff --git a/kernel/smp.c b/kernel/smp.c index d0ae8eb6bf8b..4d17501433be 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -20,6 +20,9 @@ #include <linux/sched.h> #include <linux/sched/idle.h> #include <linux/hypervisor.h> +#include <linux/sched/clock.h> +#include <linux/nmi.h> +#include <linux/sched/debug.h> #include "smpboot.h" #include "sched/smp.h" @@ -96,6 +99,103 @@ void __init call_function_init(void) smpcfd_prepare_cpu(smp_processor_id()); } +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG + +static DEFINE_PER_CPU(call_single_data_t *, cur_csd); +static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); +static DEFINE_PER_CPU(void *, cur_csd_info); + +#define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC) +static atomic_t csd_bug_count = ATOMIC_INIT(0); + +/* Record current CSD work for current CPU, NULL to erase. */ +static void csd_lock_record(call_single_data_t *csd) +{ + if (!csd) { + smp_mb(); /* NULL cur_csd after unlock. */ + __this_cpu_write(cur_csd, NULL); + return; + } + __this_cpu_write(cur_csd_func, csd->func); + __this_cpu_write(cur_csd_info, csd->info); + smp_wmb(); /* func and info before csd. */ + __this_cpu_write(cur_csd, csd); + smp_mb(); /* Update cur_csd before function call. */ + /* Or before unlock, as the case may be. */ +} + +static __always_inline int csd_lock_wait_getcpu(call_single_data_t *csd) +{ + unsigned int csd_type; + + csd_type = CSD_TYPE(csd); + if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC) + return csd->dst; /* Other CSD_TYPE_ values might not have ->dst. */ + return -1; +} + +/* + * Complain if too much time spent waiting. Note that only + * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, + * so waiting on other types gets much less information. + */ +static __always_inline bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id) +{ + int cpu = -1; + int cpux; + bool firsttime; + u64 ts2, ts_delta; + call_single_data_t *cpu_cur_csd; + unsigned int flags = READ_ONCE(csd->flags); + + if (!(flags & CSD_FLAG_LOCK)) { + if (!unlikely(*bug_id)) + return true; + cpu = csd_lock_wait_getcpu(csd); + pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n", + *bug_id, raw_smp_processor_id(), cpu); + return true; + } + + ts2 = sched_clock(); + ts_delta = ts2 - *ts1; + if (likely(ts_delta <= CSD_LOCK_TIMEOUT)) + return false; + + firsttime = !*bug_id; + if (firsttime) + *bug_id = atomic_inc_return(&csd_bug_count); + cpu = csd_lock_wait_getcpu(csd); + if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu)) + cpux = 0; + else + cpux = cpu; + cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */ + pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n", + firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts2 - ts0, + cpu, csd->func, csd->info); + if (cpu_cur_csd && csd != cpu_cur_csd) { + pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n", + *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)), + READ_ONCE(per_cpu(cur_csd_info, cpux))); + } else { + pr_alert("\tcsd: CSD lock (#%d) %s.\n", + *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request"); + } + if (cpu >= 0) { + if (!trigger_single_cpu_backtrace(cpu)) + dump_cpu_task(cpu); + if (!cpu_cur_csd) { + pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu); + arch_send_call_function_single_ipi(cpu); + } + } + dump_stack(); + *ts1 = ts2; + + return false; +} + /* * csd_lock/csd_unlock used to serialize access to per-cpu csd resources * @@ -105,8 +205,28 @@ void __init call_function_init(void) */ static __always_inline void csd_lock_wait(call_single_data_t *csd) { + int bug_id = 0; + u64 ts0, ts1; + + ts1 = ts0 = sched_clock(); + for (;;) { + if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id)) + break; + cpu_relax(); + } + smp_acquire__after_ctrl_dep(); +} + +#else +static void csd_lock_record(call_single_data_t *csd) +{ +} + +static __always_inline void csd_lock_wait(call_single_data_t *csd) +{ smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)); } +#endif static __always_inline void csd_lock(call_single_data_t *csd) { @@ -166,9 +286,11 @@ static int generic_exec_single(int cpu, call_single_data_t *csd) * We can unlock early even for the synchronous on-stack case, * since we're doing this from the same CPU.. */ + csd_lock_record(csd); csd_unlock(csd); local_irq_save(flags); func(info); + csd_lock_record(NULL); local_irq_restore(flags); return 0; } @@ -268,8 +390,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) entry = &csd_next->llist; } + csd_lock_record(csd); func(info); csd_unlock(csd); + csd_lock_record(NULL); } else { prev = &csd->llist; } @@ -296,8 +420,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) smp_call_func_t func = csd->func; void *info = csd->info; + csd_lock_record(csd); csd_unlock(csd); func(info); + csd_lock_record(NULL); } else if (type == CSD_TYPE_IRQ_WORK) { irq_work_single(csd); } @@ -375,6 +501,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, csd->func = func; csd->info = info; +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG + csd->src = smp_processor_id(); + csd->dst = cpu; +#endif err = generic_exec_single(cpu, csd); @@ -540,6 +670,10 @@ static void smp_call_function_many_cond(const struct cpumask *mask, csd->flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG + csd->src = smp_processor_id(); + csd->dst = cpu; +#endif if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) __cpumask_set_cpu(cpu, cfd->cpumask_ipi); } @@ -741,7 +875,7 @@ EXPORT_SYMBOL(on_each_cpu_mask); * for all the required CPUs to finish. This may include the local * processor. * @cond_func: A callback function that is passed a cpu id and - * the the info parameter. The function is called + * the info parameter. The function is called * with preemption disabled. The function should * return a blooean value indicating whether to IPI * the specified CPU. diff --git a/kernel/softirq.c b/kernel/softirq.c index bf88d7f62433..09229ad82209 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -481,6 +481,7 @@ void raise_softirq(unsigned int nr) void __raise_softirq_irqoff(unsigned int nr) { + lockdep_assert_irqs_disabled(); trace_softirq_raise(nr); or_softirq_pending(1UL << nr); } diff --git a/kernel/stackleak.c b/kernel/stackleak.c index a8fc9ae1d03d..ce161a8e8d97 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -20,7 +20,7 @@ static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass); int stack_erasing_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; int state = !static_branch_unlikely(&stack_erasing_bypass); diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 946f44a9e86a..9f8117c7cfdd 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -78,8 +78,7 @@ struct stacktrace_cookie { unsigned int len; }; -static bool stack_trace_consume_entry(void *cookie, unsigned long addr, - bool reliable) +static bool stack_trace_consume_entry(void *cookie, unsigned long addr) { struct stacktrace_cookie *c = cookie; @@ -94,12 +93,11 @@ static bool stack_trace_consume_entry(void *cookie, unsigned long addr, return c->len < c->size; } -static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr, - bool reliable) +static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr) { if (in_sched_functions(addr)) return true; - return stack_trace_consume_entry(cookie, addr, reliable); + return stack_trace_consume_entry(cookie, addr); } /** diff --git a/kernel/static_call.c b/kernel/static_call.c new file mode 100644 index 000000000000..84565c2a41b8 --- /dev/null +++ b/kernel/static_call.c @@ -0,0 +1,482 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/init.h> +#include <linux/static_call.h> +#include <linux/bug.h> +#include <linux/smp.h> +#include <linux/sort.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/cpu.h> +#include <linux/processor.h> +#include <asm/sections.h> + +extern struct static_call_site __start_static_call_sites[], + __stop_static_call_sites[]; + +static bool static_call_initialized; + +/* mutex to protect key modules/sites */ +static DEFINE_MUTEX(static_call_mutex); + +static void static_call_lock(void) +{ + mutex_lock(&static_call_mutex); +} + +static void static_call_unlock(void) +{ + mutex_unlock(&static_call_mutex); +} + +static inline void *static_call_addr(struct static_call_site *site) +{ + return (void *)((long)site->addr + (long)&site->addr); +} + + +static inline struct static_call_key *static_call_key(const struct static_call_site *site) +{ + return (struct static_call_key *) + (((long)site->key + (long)&site->key) & ~STATIC_CALL_SITE_FLAGS); +} + +/* These assume the key is word-aligned. */ +static inline bool static_call_is_init(struct static_call_site *site) +{ + return ((long)site->key + (long)&site->key) & STATIC_CALL_SITE_INIT; +} + +static inline bool static_call_is_tail(struct static_call_site *site) +{ + return ((long)site->key + (long)&site->key) & STATIC_CALL_SITE_TAIL; +} + +static inline void static_call_set_init(struct static_call_site *site) +{ + site->key = ((long)static_call_key(site) | STATIC_CALL_SITE_INIT) - + (long)&site->key; +} + +static int static_call_site_cmp(const void *_a, const void *_b) +{ + const struct static_call_site *a = _a; + const struct static_call_site *b = _b; + const struct static_call_key *key_a = static_call_key(a); + const struct static_call_key *key_b = static_call_key(b); + + if (key_a < key_b) + return -1; + + if (key_a > key_b) + return 1; + + return 0; +} + +static void static_call_site_swap(void *_a, void *_b, int size) +{ + long delta = (unsigned long)_a - (unsigned long)_b; + struct static_call_site *a = _a; + struct static_call_site *b = _b; + struct static_call_site tmp = *a; + + a->addr = b->addr - delta; + a->key = b->key - delta; + + b->addr = tmp.addr + delta; + b->key = tmp.key + delta; +} + +static inline void static_call_sort_entries(struct static_call_site *start, + struct static_call_site *stop) +{ + sort(start, stop - start, sizeof(struct static_call_site), + static_call_site_cmp, static_call_site_swap); +} + +static inline bool static_call_key_has_mods(struct static_call_key *key) +{ + return !(key->type & 1); +} + +static inline struct static_call_mod *static_call_key_next(struct static_call_key *key) +{ + if (!static_call_key_has_mods(key)) + return NULL; + + return key->mods; +} + +static inline struct static_call_site *static_call_key_sites(struct static_call_key *key) +{ + if (static_call_key_has_mods(key)) + return NULL; + + return (struct static_call_site *)(key->type & ~1); +} + +void __static_call_update(struct static_call_key *key, void *tramp, void *func) +{ + struct static_call_site *site, *stop; + struct static_call_mod *site_mod, first; + + cpus_read_lock(); + static_call_lock(); + + if (key->func == func) + goto done; + + key->func = func; + + arch_static_call_transform(NULL, tramp, func, false); + + /* + * If uninitialized, we'll not update the callsites, but they still + * point to the trampoline and we just patched that. + */ + if (WARN_ON_ONCE(!static_call_initialized)) + goto done; + + first = (struct static_call_mod){ + .next = static_call_key_next(key), + .mod = NULL, + .sites = static_call_key_sites(key), + }; + + for (site_mod = &first; site_mod; site_mod = site_mod->next) { + struct module *mod = site_mod->mod; + + if (!site_mod->sites) { + /* + * This can happen if the static call key is defined in + * a module which doesn't use it. + * + * It also happens in the has_mods case, where the + * 'first' entry has no sites associated with it. + */ + continue; + } + + stop = __stop_static_call_sites; + +#ifdef CONFIG_MODULES + if (mod) { + stop = mod->static_call_sites + + mod->num_static_call_sites; + } +#endif + + for (site = site_mod->sites; + site < stop && static_call_key(site) == key; site++) { + void *site_addr = static_call_addr(site); + + if (static_call_is_init(site)) { + /* + * Don't write to call sites which were in + * initmem and have since been freed. + */ + if (!mod && system_state >= SYSTEM_RUNNING) + continue; + if (mod && !within_module_init((unsigned long)site_addr, mod)) + continue; + } + + if (!kernel_text_address((unsigned long)site_addr)) { + WARN_ONCE(1, "can't patch static call site at %pS", + site_addr); + continue; + } + + arch_static_call_transform(site_addr, NULL, func, + static_call_is_tail(site)); + } + } + +done: + static_call_unlock(); + cpus_read_unlock(); +} +EXPORT_SYMBOL_GPL(__static_call_update); + +static int __static_call_init(struct module *mod, + struct static_call_site *start, + struct static_call_site *stop) +{ + struct static_call_site *site; + struct static_call_key *key, *prev_key = NULL; + struct static_call_mod *site_mod; + + if (start == stop) + return 0; + + static_call_sort_entries(start, stop); + + for (site = start; site < stop; site++) { + void *site_addr = static_call_addr(site); + + if ((mod && within_module_init((unsigned long)site_addr, mod)) || + (!mod && init_section_contains(site_addr, 1))) + static_call_set_init(site); + + key = static_call_key(site); + if (key != prev_key) { + prev_key = key; + + /* + * For vmlinux (!mod) avoid the allocation by storing + * the sites pointer in the key itself. Also see + * __static_call_update()'s @first. + * + * This allows architectures (eg. x86) to call + * static_call_init() before memory allocation works. + */ + if (!mod) { + key->sites = site; + key->type |= 1; + goto do_transform; + } + + site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL); + if (!site_mod) + return -ENOMEM; + + /* + * When the key has a direct sites pointer, extract + * that into an explicit struct static_call_mod, so we + * can have a list of modules. + */ + if (static_call_key_sites(key)) { + site_mod->mod = NULL; + site_mod->next = NULL; + site_mod->sites = static_call_key_sites(key); + + key->mods = site_mod; + + site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL); + if (!site_mod) + return -ENOMEM; + } + + site_mod->mod = mod; + site_mod->sites = site; + site_mod->next = static_call_key_next(key); + key->mods = site_mod; + } + +do_transform: + arch_static_call_transform(site_addr, NULL, key->func, + static_call_is_tail(site)); + } + + return 0; +} + +static int addr_conflict(struct static_call_site *site, void *start, void *end) +{ + unsigned long addr = (unsigned long)static_call_addr(site); + + if (addr <= (unsigned long)end && + addr + CALL_INSN_SIZE > (unsigned long)start) + return 1; + + return 0; +} + +static int __static_call_text_reserved(struct static_call_site *iter_start, + struct static_call_site *iter_stop, + void *start, void *end) +{ + struct static_call_site *iter = iter_start; + + while (iter < iter_stop) { + if (addr_conflict(iter, start, end)) + return 1; + iter++; + } + + return 0; +} + +#ifdef CONFIG_MODULES + +static int __static_call_mod_text_reserved(void *start, void *end) +{ + struct module *mod; + int ret; + + preempt_disable(); + mod = __module_text_address((unsigned long)start); + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + if (!try_module_get(mod)) + mod = NULL; + preempt_enable(); + + if (!mod) + return 0; + + ret = __static_call_text_reserved(mod->static_call_sites, + mod->static_call_sites + mod->num_static_call_sites, + start, end); + + module_put(mod); + + return ret; +} + +static int static_call_add_module(struct module *mod) +{ + return __static_call_init(mod, mod->static_call_sites, + mod->static_call_sites + mod->num_static_call_sites); +} + +static void static_call_del_module(struct module *mod) +{ + struct static_call_site *start = mod->static_call_sites; + struct static_call_site *stop = mod->static_call_sites + + mod->num_static_call_sites; + struct static_call_key *key, *prev_key = NULL; + struct static_call_mod *site_mod, **prev; + struct static_call_site *site; + + for (site = start; site < stop; site++) { + key = static_call_key(site); + if (key == prev_key) + continue; + + prev_key = key; + + for (prev = &key->mods, site_mod = key->mods; + site_mod && site_mod->mod != mod; + prev = &site_mod->next, site_mod = site_mod->next) + ; + + if (!site_mod) + continue; + + *prev = site_mod->next; + kfree(site_mod); + } +} + +static int static_call_module_notify(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct module *mod = data; + int ret = 0; + + cpus_read_lock(); + static_call_lock(); + + switch (val) { + case MODULE_STATE_COMING: + ret = static_call_add_module(mod); + if (ret) { + WARN(1, "Failed to allocate memory for static calls"); + static_call_del_module(mod); + } + break; + case MODULE_STATE_GOING: + static_call_del_module(mod); + break; + } + + static_call_unlock(); + cpus_read_unlock(); + + return notifier_from_errno(ret); +} + +static struct notifier_block static_call_module_nb = { + .notifier_call = static_call_module_notify, +}; + +#else + +static inline int __static_call_mod_text_reserved(void *start, void *end) +{ + return 0; +} + +#endif /* CONFIG_MODULES */ + +int static_call_text_reserved(void *start, void *end) +{ + int ret = __static_call_text_reserved(__start_static_call_sites, + __stop_static_call_sites, start, end); + + if (ret) + return ret; + + return __static_call_mod_text_reserved(start, end); +} + +int __init static_call_init(void) +{ + int ret; + + if (static_call_initialized) + return 0; + + cpus_read_lock(); + static_call_lock(); + ret = __static_call_init(NULL, __start_static_call_sites, + __stop_static_call_sites); + static_call_unlock(); + cpus_read_unlock(); + + if (ret) { + pr_err("Failed to allocate memory for static_call!\n"); + BUG(); + } + + static_call_initialized = true; + +#ifdef CONFIG_MODULES + register_module_notifier(&static_call_module_nb); +#endif + return 0; +} +early_initcall(static_call_init); + +#ifdef CONFIG_STATIC_CALL_SELFTEST + +static int func_a(int x) +{ + return x+1; +} + +static int func_b(int x) +{ + return x+2; +} + +DEFINE_STATIC_CALL(sc_selftest, func_a); + +static struct static_call_data { + int (*func)(int); + int val; + int expect; +} static_call_data [] __initdata = { + { NULL, 2, 3 }, + { func_b, 2, 4 }, + { func_a, 2, 3 } +}; + +static int __init test_static_call_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(static_call_data); i++ ) { + struct static_call_data *scd = &static_call_data[i]; + + if (scd->func) + static_call_update(sc_selftest, scd->func); + + WARN_ON(static_call(sc_selftest)(scd->val) != scd->expect); + } + + return 0; +} +early_initcall(test_static_call_init); + +#endif /* CONFIG_STATIC_CALL_SELFTEST */ diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 865bb0228ab6..890b79cf0e7c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -178,7 +178,7 @@ static void ack_state(struct multi_stop_data *msdata) set_state(msdata, msdata->state + 1); } -void __weak stop_machine_yield(const struct cpumask *cpumask) +notrace void __weak stop_machine_yield(const struct cpumask *cpumask) { cpu_relax(); } diff --git a/kernel/sys.c b/kernel/sys.c index ca11af9d815d..a730c03ee607 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -373,7 +373,7 @@ long __sys_setregid(gid_t rgid, gid_t egid) if (rgid != (gid_t) -1) { if (gid_eq(old->gid, krgid) || gid_eq(old->egid, krgid) || - ns_capable(old->user_ns, CAP_SETGID)) + ns_capable_setid(old->user_ns, CAP_SETGID)) new->gid = krgid; else goto error; @@ -382,7 +382,7 @@ long __sys_setregid(gid_t rgid, gid_t egid) if (gid_eq(old->gid, kegid) || gid_eq(old->egid, kegid) || gid_eq(old->sgid, kegid) || - ns_capable(old->user_ns, CAP_SETGID)) + ns_capable_setid(old->user_ns, CAP_SETGID)) new->egid = kegid; else goto error; @@ -432,7 +432,7 @@ long __sys_setgid(gid_t gid) old = current_cred(); retval = -EPERM; - if (ns_capable(old->user_ns, CAP_SETGID)) + if (ns_capable_setid(old->user_ns, CAP_SETGID)) new->gid = new->egid = new->sgid = new->fsgid = kgid; else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) new->egid = new->fsgid = kgid; @@ -744,7 +744,7 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) old = current_cred(); retval = -EPERM; - if (!ns_capable(old->user_ns, CAP_SETGID)) { + if (!ns_capable_setid(old->user_ns, CAP_SETGID)) { if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) goto error; @@ -871,7 +871,7 @@ long __sys_setfsgid(gid_t gid) if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || - ns_capable(old->user_ns, CAP_SETGID)) { + ns_capable_setid(old->user_ns, CAP_SETGID)) { if (!gid_eq(kgid, old->fsgid)) { new->fsgid = kgid; if (security_task_fix_setgid(new,old,LSM_SETID_FS) == 0) @@ -1753,7 +1753,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) if (who == RUSAGE_CHILDREN) break; - /* fall through */ + fallthrough; case RUSAGE_SELF: thread_group_cputime_adjusted(p, &tgutime, &tgstime); @@ -2034,7 +2034,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data * VMAs already unmapped and kernel uses these members for statistics * output in procfs mostly, except * - * - @start_brk/@brk which are used in do_brk but kernel lookups + * - @start_brk/@brk which are used in do_brk_flags but kernel lookups * for VMAs when updating these memvers so anything wrong written * here cause kernel to swear at userspace program but won't lead * to any problem in kernel itself @@ -2238,12 +2238,12 @@ out: } #ifdef CONFIG_CHECKPOINT_RESTORE -static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) +static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr) { return put_user(me->clear_child_tid, tid_addr); } #else -static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) +static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr) { return -EINVAL; } @@ -2427,7 +2427,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = prctl_set_mm(arg2, arg3, arg4, arg5); break; case PR_GET_TID_ADDRESS: - error = prctl_get_tid_address(me, (int __user **)arg2); + error = prctl_get_tid_address(me, (int __user * __user *)arg2); break; case PR_SET_CHILD_SUBREAPER: me->signal->is_child_subreaper = !!arg2; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 4d59775ea79c..f27ac94d5fa7 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -280,6 +280,7 @@ COND_SYSCALL(mlockall); COND_SYSCALL(munlockall); COND_SYSCALL(mincore); COND_SYSCALL(madvise); +COND_SYSCALL(process_madvise); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); COND_SYSCALL_COMPAT(mbind); @@ -369,7 +370,6 @@ COND_SYSCALL_COMPAT(fanotify_mark); /* x86 */ COND_SYSCALL(vm86old); COND_SYSCALL(modify_ldt); -COND_SYSCALL_COMPAT(quotactl32); COND_SYSCALL(vm86); COND_SYSCALL(kexec_file_load); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 287862f91717..afad085960b8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -204,8 +204,7 @@ static int max_extfrag_threshold = 1000; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) static int bpf_stats_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct static_key *key = (struct static_key *)table->data; static int saved_val; @@ -2913,6 +2912,14 @@ static struct ctl_table vm_table[] = { .proc_handler = percpu_pagelist_fraction_sysctl_handler, .extra1 = SYSCTL_ZERO, }, + { + .procname = "page_lock_unfairness", + .data = &sysctl_page_lock_unfairness, + .maxlen = sizeof(sysctl_page_lock_unfairness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, #ifdef CONFIG_MMU { .procname = "max_map_count", diff --git a/kernel/task_work.c b/kernel/task_work.c index ae058893913c..15b087286bea 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -37,23 +37,28 @@ static void task_work_notify_signal(struct task_struct *task) * task_work_add - ask the @task to execute @work->func() * @task: the task which should run the callback * @work: the callback to run - * @notify: send the notification if true + * @notify: how to notify the targeted task * - * Queue @work for task_work_run() below and notify the @task if @notify. - * Fails if the @task is exiting/exited and thus it can't process this @work. - * Otherwise @work->func() will be called when the @task returns from kernel - * mode or exits. + * Queue @work for task_work_run() below and notify the @task if @notify + * is @TWA_RESUME or @TWA_SIGNAL. @TWA_SIGNAL works like signals, in that the + * it will interrupt the targeted task and run the task_work. @TWA_RESUME + * work is run only when the task exits the kernel and returns to user mode, + * or before entering guest mode. Fails if the @task is exiting/exited and thus + * it can't process this @work. Otherwise @work->func() will be called when the + * @task goes through one of the aforementioned transitions, or exits. * - * This is like the signal handler which runs in kernel mode, but it doesn't - * try to wake up the @task. + * If the targeted task is exiting, then an error is returned and the work item + * is not queued. It's up to the caller to arrange for an alternative mechanism + * in that case. * - * Note: there is no ordering guarantee on works queued here. + * Note: there is no ordering guarantee on works queued here. The task_work + * list is LIFO. * * RETURNS: * 0 if succeeds or -ESRCH. */ -int -task_work_add(struct task_struct *task, struct callback_head *work, int notify) +int task_work_add(struct task_struct *task, struct callback_head *work, + enum task_work_notify_mode notify) { struct callback_head *head; @@ -65,12 +70,17 @@ task_work_add(struct task_struct *task, struct callback_head *work, int notify) } while (cmpxchg(&task->task_works, head, work) != head); switch (notify) { + case TWA_NONE: + break; case TWA_RESUME: set_notify_resume(task); break; case TWA_SIGNAL: task_work_notify_signal(task); break; + default: + WARN_ON_ONCE(1); + break; } return 0; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index e2ac0e37c4ae..a2802b6ff4bb 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -34,17 +34,13 @@ struct kmem_cache *taskstats_cache; static struct genl_family family; -static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { +static const struct nla_policy taskstats_cmd_get_policy[] = { [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; -/* - * We have to use TASKSTATS_CMD_ATTR_MAX here, it is the maxattr in the family. - * Make sure they are always aligned. - */ -static const struct nla_policy cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { +static const struct nla_policy cgroupstats_cmd_get_policy[] = { [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, }; @@ -649,47 +645,25 @@ static const struct genl_ops taskstats_ops[] = { .cmd = TASKSTATS_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = taskstats_user_cmd, - /* policy enforced later */ - .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_HASPOL, + .policy = taskstats_cmd_get_policy, + .maxattr = ARRAY_SIZE(taskstats_cmd_get_policy) - 1, + .flags = GENL_ADMIN_PERM, }, { .cmd = CGROUPSTATS_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = cgroupstats_user_cmd, - /* policy enforced later */ - .flags = GENL_CMD_CAP_HASPOL, + .policy = cgroupstats_cmd_get_policy, + .maxattr = ARRAY_SIZE(cgroupstats_cmd_get_policy) - 1, }, }; -static int taskstats_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, - struct genl_info *info) -{ - const struct nla_policy *policy = NULL; - - switch (ops->cmd) { - case TASKSTATS_CMD_GET: - policy = taskstats_cmd_get_policy; - break; - case CGROUPSTATS_CMD_GET: - policy = cgroupstats_cmd_get_policy; - break; - default: - return -EINVAL; - } - - return nlmsg_validate_deprecated(info->nlhdr, GENL_HDRLEN, - TASKSTATS_CMD_ATTR_MAX, policy, - info->extack); -} - static struct genl_family family __ro_after_init = { .name = TASKSTATS_GENL_NAME, .version = TASKSTATS_GENL_VERSION, - .maxattr = TASKSTATS_CMD_ATTR_MAX, .module = THIS_MODULE, .ops = taskstats_ops, .n_ops = ARRAY_SIZE(taskstats_ops), - .pre_doit = taskstats_pre_doit, }; /* Needed early in initialization */ diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index ca223a89530a..f4ace1bf8382 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -908,7 +908,7 @@ static int __init alarmtimer_init(void) /* Initialize alarm bases */ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; alarm_bases[ALARM_REALTIME].get_ktime = &ktime_get_real; - alarm_bases[ALARM_REALTIME].get_timespec = ktime_get_real_ts64, + alarm_bases[ALARM_REALTIME].get_timespec = ktime_get_real_ts64; alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; alarm_bases[ALARM_BOOTTIME].get_ktime = &ktime_get_boottime; alarm_bases[ALARM_BOOTTIME].get_timespec = get_boottime_timespec; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index c4038511d5c9..387b4bef7dd1 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -342,7 +342,7 @@ EXPORT_SYMBOL_GPL(ktime_add_safe); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS -static struct debug_obj_descr hrtimer_debug_descr; +static const struct debug_obj_descr hrtimer_debug_descr; static void *hrtimer_debug_hint(void *addr) { @@ -377,7 +377,7 @@ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_ACTIVE: WARN_ON(1); - /* fall through */ + fallthrough; default: return false; } @@ -401,7 +401,7 @@ static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) } } -static struct debug_obj_descr hrtimer_debug_descr = { +static const struct debug_obj_descr hrtimer_debug_descr = { .name = "hrtimer", .debug_hint = hrtimer_debug_hint, .fixup_init = hrtimer_fixup_init, @@ -425,11 +425,6 @@ static inline void debug_hrtimer_deactivate(struct hrtimer *timer) debug_object_deactivate(timer, &hrtimer_debug_descr); } -static inline void debug_hrtimer_free(struct hrtimer *timer) -{ - debug_object_free(timer, &hrtimer_debug_descr); -} - static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode); diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index ca4e6d57d68b..00629e658ca1 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -172,10 +172,6 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, u64 oval, nval, ointerval, ninterval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; - /* - * Use the to_ktime conversion because that clamps the maximum - * value to KTIME_MAX and avoid multiplication overflows. - */ nval = timespec64_to_ns(&value->it_value); ninterval = timespec64_to_ns(&value->it_interval); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 07709ac30439..bf540f5a4115 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -439,12 +439,12 @@ static struct pid *good_sigevent(sigevent_t * event) rtn = pid_task(pid, PIDTYPE_PID); if (!rtn || !same_thread_group(rtn, current)) return NULL; - /* FALLTHRU */ + fallthrough; case SIGEV_SIGNAL: case SIGEV_THREAD: if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX) return NULL; - /* FALLTHRU */ + fallthrough; case SIGEV_NONE: return pid; default: diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 1c03eec6ca9b..b1b9b12899f5 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -35,7 +35,7 @@ * into a single 64-byte cache line. */ struct clock_data { - seqcount_t seq; + seqcount_latch_t seq; struct clock_read_data read_data[2]; ktime_t wrap_kt; unsigned long rate; @@ -68,15 +68,15 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) return (cyc * mult) >> shift; } -struct clock_read_data *sched_clock_read_begin(unsigned int *seq) +notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq) { *seq = raw_read_seqcount_latch(&cd.seq); return cd.read_data + (*seq & 1); } -int sched_clock_read_retry(unsigned int seq) +notrace int sched_clock_read_retry(unsigned int seq) { - return read_seqcount_retry(&cd.seq, seq); + return read_seqcount_latch_retry(&cd.seq, seq); } unsigned long long notrace sched_clock(void) @@ -258,7 +258,7 @@ void __init generic_sched_clock_init(void) */ static u64 notrace suspended_sched_clock_read(void) { - unsigned int seq = raw_read_seqcount(&cd.seq); + unsigned int seq = raw_read_seqcount_latch(&cd.seq); return cd.read_data[seq & 1].epoch_cyc; } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index e51778c312f1..36d7464c8962 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -381,7 +381,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) switch (mode) { case TICK_BROADCAST_FORCE: tick_broadcast_forced = 1; - /* fall through */ + fallthrough; case TICK_BROADCAST_ON: cpumask_set_cpu(cpu, tick_broadcast_on); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f0199a4ba1ad..81632cd5e3b7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -927,7 +927,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (ratelimit < 10 && (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { - pr_warn("NOHZ: local_softirq_pending %02x\n", + pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n", (unsigned int) local_softirq_pending()); ratelimit++; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4c47f388a83f..6858a31364b6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -54,6 +54,9 @@ static struct { static struct timekeeper shadow_timekeeper; +/* flag for if timekeeping is suspended */ +int __read_mostly timekeeping_suspended; + /** * struct tk_fast - NMI safe timekeeper * @seq: Sequence counter for protecting updates. The lowest bit @@ -64,7 +67,7 @@ static struct timekeeper shadow_timekeeper; * See @update_fast_timekeeper() below. */ struct tk_fast { - seqcount_raw_spinlock_t seq; + seqcount_latch_t seq; struct tk_read_base base[2]; }; @@ -73,28 +76,42 @@ static u64 cycles_at_suspend; static u64 dummy_clock_read(struct clocksource *cs) { - return cycles_at_suspend; + if (timekeeping_suspended) + return cycles_at_suspend; + return local_clock(); } static struct clocksource dummy_clock = { .read = dummy_clock_read, }; +/* + * Boot time initialization which allows local_clock() to be utilized + * during early boot when clocksources are not available. local_clock() + * returns nanoseconds already so no conversion is required, hence mult=1 + * and shift=0. When the first proper clocksource is installed then + * the fast time keepers are updated with the correct values. + */ +#define FAST_TK_INIT \ + { \ + .clock = &dummy_clock, \ + .mask = CLOCKSOURCE_MASK(64), \ + .mult = 1, \ + .shift = 0, \ + } + static struct tk_fast tk_fast_mono ____cacheline_aligned = { - .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_mono.seq, &timekeeper_lock), - .base[0] = { .clock = &dummy_clock, }, - .base[1] = { .clock = &dummy_clock, }, + .seq = SEQCNT_LATCH_ZERO(tk_fast_mono.seq), + .base[0] = FAST_TK_INIT, + .base[1] = FAST_TK_INIT, }; static struct tk_fast tk_fast_raw ____cacheline_aligned = { - .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_raw.seq, &timekeeper_lock), - .base[0] = { .clock = &dummy_clock, }, - .base[1] = { .clock = &dummy_clock, }, + .seq = SEQCNT_LATCH_ZERO(tk_fast_raw.seq), + .base[0] = FAST_TK_INIT, + .base[1] = FAST_TK_INIT, }; -/* flag for if timekeeping is suspended */ -int __read_mostly timekeeping_suspended; - static inline void tk_normalize_xtime(struct timekeeper *tk) { while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { @@ -467,7 +484,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) tk_clock_read(tkr), tkr->cycle_last, tkr->mask)); - } while (read_seqcount_retry(&tkf->seq, seq)); + } while (read_seqcount_latch_retry(&tkf->seq, seq)); return now; } @@ -513,29 +530,29 @@ u64 notrace ktime_get_boot_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); - /* * See comment for __ktime_get_fast_ns() vs. timestamp ordering */ -static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf) +static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) { struct tk_read_base *tkr; + u64 basem, baser, delta; unsigned int seq; - u64 now; do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); - now = ktime_to_ns(tkr->base_real); + basem = ktime_to_ns(tkr->base); + baser = ktime_to_ns(tkr->base_real); - now += timekeeping_delta_to_ns(tkr, - clocksource_delta( - tk_clock_read(tkr), - tkr->cycle_last, - tkr->mask)); - } while (read_seqcount_retry(&tkf->seq, seq)); + delta = timekeeping_delta_to_ns(tkr, + clocksource_delta(tk_clock_read(tkr), + tkr->cycle_last, tkr->mask)); + } while (read_seqcount_latch_retry(&tkf->seq, seq)); - return now; + if (mono) + *mono = basem + delta; + return baser + delta; } /** @@ -543,11 +560,65 @@ static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf) */ u64 ktime_get_real_fast_ns(void) { - return __ktime_get_real_fast_ns(&tk_fast_mono); + return __ktime_get_real_fast(&tk_fast_mono, NULL); } EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); /** + * ktime_get_fast_timestamps: - NMI safe timestamps + * @snapshot: Pointer to timestamp storage + * + * Stores clock monotonic, boottime and realtime timestamps. + * + * Boot time is a racy access on 32bit systems if the sleep time injection + * happens late during resume and not in timekeeping_resume(). That could + * be avoided by expanding struct tk_read_base with boot offset for 32bit + * and adding more overhead to the update. As this is a hard to observe + * once per resume event which can be filtered with reasonable effort using + * the accurate mono/real timestamps, it's probably not worth the trouble. + * + * Aside of that it might be possible on 32 and 64 bit to observe the + * following when the sleep time injection happens late: + * + * CPU 0 CPU 1 + * timekeeping_resume() + * ktime_get_fast_timestamps() + * mono, real = __ktime_get_real_fast() + * inject_sleep_time() + * update boot offset + * boot = mono + bootoffset; + * + * That means that boot time already has the sleep time adjustment, but + * real time does not. On the next readout both are in sync again. + * + * Preventing this for 64bit is not really feasible without destroying the + * careful cache layout of the timekeeper because the sequence count and + * struct tk_read_base would then need two cache lines instead of one. + * + * Access to the time keeper clock source is disabled accross the innermost + * steps of suspend/resume. The accessors still work, but the timestamps + * are frozen until time keeping is resumed which happens very early. + * + * For regular suspend/resume there is no observable difference vs. sched + * clock, but it might affect some of the nasty low level debug printks. + * + * OTOH, access to sched clock is not guaranteed accross suspend/resume on + * all systems either so it depends on the hardware in use. + * + * If that turns out to be a real problem then this could be mitigated by + * using sched clock in a similar way as during early boot. But it's not as + * trivial as on early boot because it needs some careful protection + * against the clock monotonic timestamp jumping backwards on resume. + */ +void ktime_get_fast_timestamps(struct ktime_timestamps *snapshot) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + snapshot->real = __ktime_get_real_fast(&tk_fast_mono, &snapshot->mono); + snapshot->boot = snapshot->mono + ktime_to_ns(data_race(tk->offs_boot)); +} + +/** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. * diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a16764b0116e..c3ad64fb9d8b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -611,7 +611,7 @@ static void internal_add_timer(struct timer_base *base, struct timer_list *timer #ifdef CONFIG_DEBUG_OBJECTS_TIMERS -static struct debug_obj_descr timer_debug_descr; +static const struct debug_obj_descr timer_debug_descr; static void *timer_debug_hint(void *addr) { @@ -666,7 +666,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: WARN_ON(1); - /* fall through */ + fallthrough; default: return false; } @@ -707,7 +707,7 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) } } -static struct debug_obj_descr timer_debug_descr = { +static const struct debug_obj_descr timer_debug_descr = { .name = "timer_list", .debug_hint = timer_debug_hint, .is_static_object = timer_is_static_object, @@ -732,11 +732,6 @@ static inline void debug_timer_deactivate(struct timer_list *timer) debug_object_deactivate(timer, &timer_debug_descr); } -static inline void debug_timer_free(struct timer_list *timer) -{ - debug_object_free(timer, &timer_debug_descr); -} - static inline void debug_timer_assert_init(struct timer_list *timer) { debug_object_assert_init(timer, &timer_debug_descr); @@ -794,6 +789,8 @@ static void do_init_timer(struct timer_list *timer, { timer->entry.pprev = NULL; timer->function = func; + if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS)) + flags &= TIMER_INIT_FLAGS; timer->flags = flags | raw_smp_processor_id(); lockdep_init_map(&timer->lockdep_map, name, key, 0); } @@ -1704,6 +1701,8 @@ void update_process_times(int user_tick) { struct task_struct *p = current; + PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0); + /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); run_local_timers(); @@ -1715,13 +1714,6 @@ void update_process_times(int user_tick) scheduler_tick(); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) run_posix_cpu_timers(); - - /* The current CPU might make use of net randoms without receiving IRQs - * to renew them often enough. Let's update the net_rand_state from a - * non-constant value that's not affine to the number of calls to make - * sure it's updated when there's some activity (we don't care in idle). - */ - this_cpu_add(net_rand_state.s1, rol32(jiffies, 24) + user_tick); } /** diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7ba62d68885a..f1022945e346 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -527,7 +527,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, * and scsi-generic block devices we create a temporary new debugfs * directory that will be removed once the trace ends. */ - if (bdev && bdev == bdev->bd_contains) + if (bdev && !bdev_is_partition(bdev)) dir = q->debugfs_dir; else bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); @@ -745,7 +745,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) #endif case BLKTRACESTART: start = 1; - /* fall through */ + fallthrough; case BLKTRACESTOP: ret = __blk_trace_startstop(q, start); break; @@ -793,7 +793,7 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) return cgroup_id(bio_blkcg(bio)->css.cgroup); } #else -u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { return 0; } @@ -1827,13 +1827,11 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); + struct block_device *bdev = bdget_part(dev_to_part(dev)); struct request_queue *q; - struct block_device *bdev; struct blk_trace *bt; ssize_t ret = -ENXIO; - bdev = bdget(part_devt(p)); if (bdev == NULL) goto out; @@ -1875,7 +1873,6 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, { struct block_device *bdev; struct request_queue *q; - struct hd_struct *p; struct blk_trace *bt; u64 value; ssize_t ret = -EINVAL; @@ -1895,9 +1892,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, goto out; ret = -ENXIO; - - p = dev_to_part(dev); - bdev = bdget(part_devt(p)); + bdev = bdget_part(dev_to_part(dev)); if (bdev == NULL) goto out; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a8d4f253ed77..4517c8b66518 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -7,6 +7,7 @@ #include <linux/slab.h> #include <linux/bpf.h> #include <linux/bpf_perf_event.h> +#include <linux/btf.h> #include <linux/filter.h> #include <linux/uaccess.h> #include <linux/ctype.h> @@ -16,6 +17,9 @@ #include <linux/error-injection.h> #include <linux/btf_ids.h> +#include <uapi/linux/bpf.h> +#include <uapi/linux/btf.h> + #include <asm/tlb.h> #include "trace_probe.h" @@ -67,6 +71,10 @@ static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, + u64 flags, const struct btf **btf, + s32 *btf_id); + /** * trace_call_bpf - invoke BPF program * @call: tracepoint event @@ -743,19 +751,18 @@ out: return err; } -BTF_ID_LIST(bpf_seq_printf_btf_ids) -BTF_ID(struct, seq_file) +BTF_ID_LIST_SINGLE(btf_seq_file_ids, struct, seq_file) static const struct bpf_func_proto bpf_seq_printf_proto = { .func = bpf_seq_printf, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_seq_file_ids[0], .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM_OR_NULL, .arg5_type = ARG_CONST_SIZE_OR_ZERO, - .btf_id = bpf_seq_printf_btf_ids, }; BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) @@ -763,17 +770,39 @@ BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) return seq_write(m, data, len) ? -EOVERFLOW : 0; } -BTF_ID_LIST(bpf_seq_write_btf_ids) -BTF_ID(struct, seq_file) - static const struct bpf_func_proto bpf_seq_write_proto = { .func = bpf_seq_write, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_seq_file_ids[0], + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, +}; + +BPF_CALL_4(bpf_seq_printf_btf, struct seq_file *, m, struct btf_ptr *, ptr, + u32, btf_ptr_size, u64, flags) +{ + const struct btf *btf; + s32 btf_id; + int ret; + + ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id); + if (ret) + return ret; + + return btf_type_seq_show_flags(btf, btf_id, ptr->ptr, m, flags); +} + +static const struct bpf_func_proto bpf_seq_printf_btf_proto = { + .func = bpf_seq_printf_btf, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_seq_file_ids[0], .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, - .btf_id = bpf_seq_write_btf_ids, + .arg4_type = ARG_ANYTHING, }; static __always_inline int @@ -1098,6 +1127,118 @@ static const struct bpf_func_proto bpf_send_signal_thread_proto = { .arg1_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz) +{ + long len; + char *p; + + if (!sz) + return 0; + + p = d_path(path, buf, sz); + if (IS_ERR(p)) { + len = PTR_ERR(p); + } else { + len = buf + sz - p; + memmove(buf, p, len); + } + + return len; +} + +BTF_SET_START(btf_allowlist_d_path) +#ifdef CONFIG_SECURITY +BTF_ID(func, security_file_permission) +BTF_ID(func, security_inode_getattr) +BTF_ID(func, security_file_open) +#endif +#ifdef CONFIG_SECURITY_PATH +BTF_ID(func, security_path_truncate) +#endif +BTF_ID(func, vfs_truncate) +BTF_ID(func, vfs_fallocate) +BTF_ID(func, dentry_open) +BTF_ID(func, vfs_getattr) +BTF_ID(func, filp_close) +BTF_SET_END(btf_allowlist_d_path) + +static bool bpf_d_path_allowed(const struct bpf_prog *prog) +{ + return btf_id_set_contains(&btf_allowlist_d_path, prog->aux->attach_btf_id); +} + +BTF_ID_LIST_SINGLE(bpf_d_path_btf_ids, struct, path) + +static const struct bpf_func_proto bpf_d_path_proto = { + .func = bpf_d_path, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_d_path_btf_ids[0], + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .allowed = bpf_d_path_allowed, +}; + +#define BTF_F_ALL (BTF_F_COMPACT | BTF_F_NONAME | \ + BTF_F_PTR_RAW | BTF_F_ZERO) + +static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, + u64 flags, const struct btf **btf, + s32 *btf_id) +{ + const struct btf_type *t; + + if (unlikely(flags & ~(BTF_F_ALL))) + return -EINVAL; + + if (btf_ptr_size != sizeof(struct btf_ptr)) + return -EINVAL; + + *btf = bpf_get_btf_vmlinux(); + + if (IS_ERR_OR_NULL(*btf)) + return PTR_ERR(*btf); + + if (ptr->type_id > 0) + *btf_id = ptr->type_id; + else + return -EINVAL; + + if (*btf_id > 0) + t = btf_type_by_id(*btf, *btf_id); + if (*btf_id <= 0 || !t) + return -ENOENT; + + return 0; +} + +BPF_CALL_5(bpf_snprintf_btf, char *, str, u32, str_size, struct btf_ptr *, ptr, + u32, btf_ptr_size, u64, flags) +{ + const struct btf *btf; + s32 btf_id; + int ret; + + ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id); + if (ret) + return ret; + + return btf_type_snprintf_show(btf, btf_id, ptr->ptr, str, str_size, + flags); +} + +const struct bpf_func_proto bpf_snprintf_btf_proto = { + .func = bpf_snprintf_btf, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1182,6 +1323,14 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_jiffies64_proto; case BPF_FUNC_get_task_stack: return &bpf_get_task_stack_proto; + case BPF_FUNC_copy_from_user: + return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; + case BPF_FUNC_snprintf_btf: + return &bpf_snprintf_btf_proto; + case BPF_FUNC_bpf_per_cpu_ptr: + return &bpf_per_cpu_ptr_proto; + case BPF_FUNC_bpf_this_cpu_ptr: + return &bpf_this_cpu_ptr_proto; default: return NULL; } @@ -1579,6 +1728,12 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->expected_attach_type == BPF_TRACE_ITER ? &bpf_seq_write_proto : NULL; + case BPF_FUNC_seq_printf_btf: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_printf_btf_proto : + NULL; + case BPF_FUNC_d_path: + return &bpf_d_path_proto; default: return raw_tp_prog_func_proto(func_id, prog); } @@ -1625,6 +1780,9 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { }; const struct bpf_prog_ops raw_tracepoint_prog_ops = { +#ifdef CONFIG_NET + .test_run = bpf_prog_test_run_raw_tp, +#endif }; const struct bpf_verifier_ops tracing_verifier_ops = { @@ -2027,10 +2185,11 @@ static int bpf_event_notify(struct notifier_block *nb, unsigned long op, { struct bpf_trace_module *btm, *tmp; struct module *mod = module; + int ret = 0; if (mod->num_bpf_raw_events == 0 || (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) - return 0; + goto out; mutex_lock(&bpf_module_mutex); @@ -2040,6 +2199,8 @@ static int bpf_event_notify(struct notifier_block *nb, unsigned long op, if (btm) { btm->module = module; list_add(&btm->list, &bpf_trace_modules); + } else { + ret = -ENOMEM; } break; case MODULE_STATE_GOING: @@ -2055,7 +2216,8 @@ static int bpf_event_notify(struct notifier_block *nb, unsigned long op, mutex_unlock(&bpf_module_mutex); - return 0; +out: + return notifier_from_errno(ret); } static struct notifier_block bpf_module_nb = { diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 1af321dec0f1..5658f13037b3 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -387,8 +387,8 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) } } - read_lock(&tasklist_lock); - do_each_thread(g, t) { + rcu_read_lock(); + for_each_process_thread(g, t) { if (start == end) { ret = -EAGAIN; goto unlock; @@ -403,10 +403,10 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) smp_wmb(); t->ret_stack = ret_stack_list[start++]; } - } while_each_thread(g, t); + } unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); free: for (i = start; i < end; i++) kfree(ret_stack_list[i]); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 275441254bb5..8185f7240095 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -230,7 +230,7 @@ static void update_ftrace_function(void) /* * For static tracing, we need to be a bit more careful. * The function change takes affect immediately. Thus, - * we need to coorditate the setting of the function_trace_ops + * we need to coordinate the setting of the function_trace_ops * with the setting of the ftrace_trace_function. * * Set the function to the list ops, which will call the @@ -1368,10 +1368,10 @@ static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size) int i; /* - * Make the hash size about 1/2 the # found + * Use around half the size (max bit of it), but + * a minimum of 2 is fine (as size of 0 or 1 both give 1 for bits). */ - for (size /= 2; size; size >>= 1) - bits++; + bits = fls(size / 2); /* Don't allocate too much */ if (bits > FTRACE_HASH_MAX_BITS) @@ -1451,7 +1451,7 @@ static bool hash_contains_ip(unsigned long ip, { /* * The function record is a match if it exists in the filter - * hash and not in the notrace hash. Note, an emty hash is + * hash and not in the notrace hash. Note, an empty hash is * considered a match for the filter hash, but an empty * notrace hash is considered not in the notrace hash. */ @@ -2402,7 +2402,7 @@ struct ftrace_ops direct_ops = { * * If the record has the FTRACE_FL_REGS set, that means that it * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS - * is not not set, then it wants to convert to the normal callback. + * is not set, then it wants to convert to the normal callback. * * Returns the address of the trampoline to set to */ @@ -2782,6 +2782,7 @@ static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops) { lockdep_assert_held(&ftrace_lock); list_del_rcu(&ops->list); + synchronize_rcu(); } /* @@ -2862,6 +2863,8 @@ int ftrace_startup(struct ftrace_ops *ops, int command) __unregister_ftrace_function(ops); ftrace_start_up--; ops->flags &= ~FTRACE_OPS_FL_ENABLED; + if (ops->flags & FTRACE_OPS_FL_DYNAMIC) + ftrace_trampoline_free(ops); return ret; } @@ -2973,7 +2976,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) synchronize_rcu_tasks_rude(); /* - * When the kernel is preeptive, tasks can be preempted + * When the kernel is preemptive, tasks can be preempted * while on a ftrace trampoline. Just scheduling a task on * a CPU is not good enough to flush them. Calling * synchornize_rcu_tasks() will wait for those tasks to @@ -3126,18 +3129,20 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) static int ftrace_allocate_records(struct ftrace_page *pg, int count) { int order; + int pages; int cnt; if (WARN_ON(!count)) return -EINVAL; - order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); + pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE); + order = get_count_order(pages); /* * We want to fill as much as possible. No more than a page * may be empty. */ - while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE) + if (!is_power_of_2(pages)) order--; again: @@ -4365,7 +4370,7 @@ void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper, * @ip: The instruction pointer address to map @data to * @data: The data to map to @ip * - * Returns 0 on succes otherwise an error. + * Returns 0 on success otherwise an error. */ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, unsigned long ip, void *data) @@ -4533,7 +4538,7 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, /* * Note, there's a small window here that the func_hash->filter_hash - * may be NULL or empty. Need to be carefule when reading the loop. + * may be NULL or empty. Need to be careful when reading the loop. */ mutex_lock(&probe->ops.func_hash->regex_lock); @@ -6990,16 +6995,14 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, { int bit; - if ((op->flags & FTRACE_OPS_FL_RCU) && !rcu_is_watching()) - return; - bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; preempt_disable_notrace(); - op->func(ip, parent_ip, op, regs); + if (!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) + op->func(ip, parent_ip, op, regs); preempt_enable_notrace(); trace_clear_recursion(bit); @@ -7531,8 +7534,7 @@ static bool is_permanent_ops_registered(void) int ftrace_enable_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = -ENODEV; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 93ef0ab6ea20..7f45fd9d5a45 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -793,7 +793,7 @@ static void rb_wake_up_waiters(struct irq_work *work) * ring_buffer_wait - wait for input to the ring buffer * @buffer: buffer to wait on * @cpu: the cpu buffer to wait on - * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS + * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS * * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon * as data is added to any of the @buffer's cpu buffers. Otherwise @@ -1952,18 +1952,18 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, { struct ring_buffer_per_cpu *cpu_buffer; unsigned long nr_pages; - int cpu, err = 0; + int cpu, err; /* * Always succeed at resizing a non-existent buffer: */ if (!buffer) - return size; + return 0; /* Make sure the requested buffer exists */ if (cpu_id != RING_BUFFER_ALL_CPUS && !cpumask_test_cpu(cpu_id, buffer->cpumask)) - return size; + return 0; nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); @@ -2119,7 +2119,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, } mutex_unlock(&buffer->mutex); - return size; + return 0; out_err: for_each_buffer_cpu(buffer, cpu) { @@ -4866,6 +4866,9 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; + /* prevent another thread from changing buffer sizes */ + mutex_lock(&buffer->mutex); + atomic_inc(&cpu_buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); @@ -4876,6 +4879,8 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->resize_disabled); + + mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -4889,6 +4894,9 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) struct ring_buffer_per_cpu *cpu_buffer; int cpu; + /* prevent another thread from changing buffer sizes */ + mutex_lock(&buffer->mutex); + for_each_online_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; @@ -4907,6 +4915,8 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->resize_disabled); } + + mutex_unlock(&buffer->mutex); } /** diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 7d56d621ffea..edd912cd14aa 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -242,9 +242,11 @@ static struct synth_field_desc create_synth_test_fields[] = { { .type = "pid_t", .name = "next_pid_field" }, { .type = "char[16]", .name = "next_comm_field" }, { .type = "u64", .name = "ts_ns" }, + { .type = "char[]", .name = "dynstring_field_1" }, { .type = "u64", .name = "ts_ms" }, { .type = "unsigned int", .name = "cpu" }, { .type = "char[64]", .name = "my_string_field" }, + { .type = "char[]", .name = "dynstring_field_2" }, { .type = "int", .name = "my_int_field" }, }; @@ -254,7 +256,7 @@ static struct synth_field_desc create_synth_test_fields[] = { */ static int __init test_create_synth_event(void) { - u64 vals[7]; + u64 vals[9]; int ret; /* Create the create_synth_test event with the fields above */ @@ -292,10 +294,12 @@ static int __init test_create_synth_event(void) vals[0] = 777; /* next_pid_field */ vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ - vals[3] = 1000; /* ts_ms */ - vals[4] = raw_smp_processor_id(); /* cpu */ - vals[5] = (u64)(long)"thneed"; /* my_string_field */ - vals[6] = 398; /* my_int_field */ + vals[3] = (u64)(long)"xrayspecs"; /* dynstring_field_1 */ + vals[4] = 1000; /* ts_ms */ + vals[5] = raw_smp_processor_id(); /* cpu */ + vals[6] = (u64)(long)"thneed"; /* my_string_field */ + vals[7] = (u64)(long)"kerplunk"; /* dynstring_field_2 */ + vals[8] = 398; /* my_int_field */ /* Now generate a create_synth_test event */ ret = synth_event_trace_array(create_synth_test, vals, ARRAY_SIZE(vals)); @@ -422,13 +426,15 @@ static int __init test_trace_synth_event(void) int ret; /* Trace some bogus values just for testing */ - ret = synth_event_trace(create_synth_test, 7, /* number of values */ + ret = synth_event_trace(create_synth_test, 9, /* number of values */ (u64)444, /* next_pid_field */ (u64)(long)"clackers", /* next_comm_field */ (u64)1000000, /* ts_ns */ + (u64)(long)"viewmaster",/* dynstring_field_1 */ (u64)1000, /* ts_ms */ (u64)raw_smp_processor_id(), /* cpu */ (u64)(long)"Thneed", /* my_string_field */ + (u64)(long)"yoyos", /* dynstring_field_2 */ (u64)999); /* my_int_field */ return ret; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f40d850ebabc..528971714fc6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -251,6 +251,145 @@ unsigned long long ns2usecs(u64 nsec) return nsec; } +static void +trace_process_export(struct trace_export *export, + struct ring_buffer_event *event, int flag) +{ + struct trace_entry *entry; + unsigned int size = 0; + + if (export->flags & flag) { + entry = ring_buffer_event_data(event); + size = ring_buffer_event_length(event); + export->write(export, entry, size); + } +} + +static DEFINE_MUTEX(ftrace_export_lock); + +static struct trace_export __rcu *ftrace_exports_list __read_mostly; + +static DEFINE_STATIC_KEY_FALSE(trace_function_exports_enabled); +static DEFINE_STATIC_KEY_FALSE(trace_event_exports_enabled); +static DEFINE_STATIC_KEY_FALSE(trace_marker_exports_enabled); + +static inline void ftrace_exports_enable(struct trace_export *export) +{ + if (export->flags & TRACE_EXPORT_FUNCTION) + static_branch_inc(&trace_function_exports_enabled); + + if (export->flags & TRACE_EXPORT_EVENT) + static_branch_inc(&trace_event_exports_enabled); + + if (export->flags & TRACE_EXPORT_MARKER) + static_branch_inc(&trace_marker_exports_enabled); +} + +static inline void ftrace_exports_disable(struct trace_export *export) +{ + if (export->flags & TRACE_EXPORT_FUNCTION) + static_branch_dec(&trace_function_exports_enabled); + + if (export->flags & TRACE_EXPORT_EVENT) + static_branch_dec(&trace_event_exports_enabled); + + if (export->flags & TRACE_EXPORT_MARKER) + static_branch_dec(&trace_marker_exports_enabled); +} + +static void ftrace_exports(struct ring_buffer_event *event, int flag) +{ + struct trace_export *export; + + preempt_disable_notrace(); + + export = rcu_dereference_raw_check(ftrace_exports_list); + while (export) { + trace_process_export(export, event, flag); + export = rcu_dereference_raw_check(export->next); + } + + preempt_enable_notrace(); +} + +static inline void +add_trace_export(struct trace_export **list, struct trace_export *export) +{ + rcu_assign_pointer(export->next, *list); + /* + * We are entering export into the list but another + * CPU might be walking that list. We need to make sure + * the export->next pointer is valid before another CPU sees + * the export pointer included into the list. + */ + rcu_assign_pointer(*list, export); +} + +static inline int +rm_trace_export(struct trace_export **list, struct trace_export *export) +{ + struct trace_export **p; + + for (p = list; *p != NULL; p = &(*p)->next) + if (*p == export) + break; + + if (*p != export) + return -1; + + rcu_assign_pointer(*p, (*p)->next); + + return 0; +} + +static inline void +add_ftrace_export(struct trace_export **list, struct trace_export *export) +{ + ftrace_exports_enable(export); + + add_trace_export(list, export); +} + +static inline int +rm_ftrace_export(struct trace_export **list, struct trace_export *export) +{ + int ret; + + ret = rm_trace_export(list, export); + ftrace_exports_disable(export); + + return ret; +} + +int register_ftrace_export(struct trace_export *export) +{ + if (WARN_ON_ONCE(!export->write)) + return -1; + + mutex_lock(&ftrace_export_lock); + + add_ftrace_export(&ftrace_exports_list, export); + + mutex_unlock(&ftrace_export_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(register_ftrace_export); + +int unregister_ftrace_export(struct trace_export *export) +{ + int ret; + + mutex_lock(&ftrace_export_lock); + + ret = rm_ftrace_export(&ftrace_exports_list, export); + + mutex_unlock(&ftrace_export_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_export); + /* trace_flags holds trace_options default values */ #define TRACE_DEFAULT_FLAGS \ (FUNCTION_DEFAULT_FLAGS | \ @@ -2511,7 +2650,7 @@ void trace_buffered_event_enable(void) preempt_disable(); if (cpu == smp_processor_id() && - this_cpu_read(trace_buffered_event) != + __this_cpu_read(trace_buffered_event) != per_cpu(trace_buffered_event, cpu)) WARN_ON_ONCE(1); preempt_enable(); @@ -2699,6 +2838,8 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) if (static_key_false(&tracepoint_printk_key.key)) output_printk(fbuffer); + if (static_branch_unlikely(&trace_event_exports_enabled)) + ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT); event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer, fbuffer->event, fbuffer->entry, fbuffer->flags, fbuffer->pc, fbuffer->regs); @@ -2742,129 +2883,6 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, __buffer_unlock_commit(buffer, event); } -static void -trace_process_export(struct trace_export *export, - struct ring_buffer_event *event) -{ - struct trace_entry *entry; - unsigned int size = 0; - - entry = ring_buffer_event_data(event); - size = ring_buffer_event_length(event); - export->write(export, entry, size); -} - -static DEFINE_MUTEX(ftrace_export_lock); - -static struct trace_export __rcu *ftrace_exports_list __read_mostly; - -static DEFINE_STATIC_KEY_FALSE(ftrace_exports_enabled); - -static inline void ftrace_exports_enable(void) -{ - static_branch_enable(&ftrace_exports_enabled); -} - -static inline void ftrace_exports_disable(void) -{ - static_branch_disable(&ftrace_exports_enabled); -} - -static void ftrace_exports(struct ring_buffer_event *event) -{ - struct trace_export *export; - - preempt_disable_notrace(); - - export = rcu_dereference_raw_check(ftrace_exports_list); - while (export) { - trace_process_export(export, event); - export = rcu_dereference_raw_check(export->next); - } - - preempt_enable_notrace(); -} - -static inline void -add_trace_export(struct trace_export **list, struct trace_export *export) -{ - rcu_assign_pointer(export->next, *list); - /* - * We are entering export into the list but another - * CPU might be walking that list. We need to make sure - * the export->next pointer is valid before another CPU sees - * the export pointer included into the list. - */ - rcu_assign_pointer(*list, export); -} - -static inline int -rm_trace_export(struct trace_export **list, struct trace_export *export) -{ - struct trace_export **p; - - for (p = list; *p != NULL; p = &(*p)->next) - if (*p == export) - break; - - if (*p != export) - return -1; - - rcu_assign_pointer(*p, (*p)->next); - - return 0; -} - -static inline void -add_ftrace_export(struct trace_export **list, struct trace_export *export) -{ - if (*list == NULL) - ftrace_exports_enable(); - - add_trace_export(list, export); -} - -static inline int -rm_ftrace_export(struct trace_export **list, struct trace_export *export) -{ - int ret; - - ret = rm_trace_export(list, export); - if (*list == NULL) - ftrace_exports_disable(); - - return ret; -} - -int register_ftrace_export(struct trace_export *export) -{ - if (WARN_ON_ONCE(!export->write)) - return -1; - - mutex_lock(&ftrace_export_lock); - - add_ftrace_export(&ftrace_exports_list, export); - - mutex_unlock(&ftrace_export_lock); - - return 0; -} -EXPORT_SYMBOL_GPL(register_ftrace_export); - -int unregister_ftrace_export(struct trace_export *export) -{ - int ret; - - mutex_lock(&ftrace_export_lock); - - ret = rm_ftrace_export(&ftrace_exports_list, export); - - mutex_unlock(&ftrace_export_lock); - - return ret; -} -EXPORT_SYMBOL_GPL(unregister_ftrace_export); - void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, @@ -2884,8 +2902,8 @@ trace_function(struct trace_array *tr, entry->parent_ip = parent_ip; if (!call_filter_check_discard(call, entry, buffer, event)) { - if (static_branch_unlikely(&ftrace_exports_enabled)) - ftrace_exports(event); + if (static_branch_unlikely(&trace_function_exports_enabled)) + ftrace_exports(event, TRACE_EXPORT_FUNCTION); __buffer_unlock_commit(buffer, event); } } @@ -3546,13 +3564,15 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, if (iter->ent && iter->ent != iter->temp) { if ((!iter->temp || iter->temp_size < iter->ent_size) && !WARN_ON_ONCE(iter->temp == static_temp_buf)) { - kfree(iter->temp); - iter->temp = kmalloc(iter->ent_size, GFP_KERNEL); - if (!iter->temp) + void *temp; + temp = kmalloc(iter->ent_size, GFP_KERNEL); + if (!temp) return NULL; + kfree(iter->temp); + iter->temp = temp; + iter->temp_size = iter->ent_size; } memcpy(iter->temp, iter->ent, iter->ent_size); - iter->temp_size = iter->ent_size; iter->ent = iter->temp; } entry = __find_next_entry(iter, ent_cpu, NULL, ent_ts); @@ -3782,14 +3802,14 @@ unsigned long trace_total_entries(struct trace_array *tr) static void print_lat_help_header(struct seq_file *m) { - seq_puts(m, "# _------=> CPU# \n" - "# / _-----=> irqs-off \n" - "# | / _----=> need-resched \n" - "# || / _---=> hardirq/softirq \n" - "# ||| / _--=> preempt-depth \n" - "# |||| / delay \n" - "# cmd pid ||||| time | caller \n" - "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# _------=> CPU# \n" + "# / _-----=> irqs-off \n" + "# | / _----=> need-resched \n" + "# || / _---=> hardirq/softirq \n" + "# ||| / _--=> preempt-depth \n" + "# |||| / delay \n" + "# cmd pid ||||| time | caller \n" + "# \\ / ||||| \\ | / \n"); } static void print_event_info(struct array_buffer *buf, struct seq_file *m) @@ -3810,26 +3830,26 @@ static void print_func_help_header(struct array_buffer *buf, struct seq_file *m, print_event_info(buf, m); - seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? "TGID " : ""); - seq_printf(m, "# | | %s | | |\n", tgid ? " | " : ""); + seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? " TGID " : ""); + seq_printf(m, "# | | %s | | |\n", tgid ? " | " : ""); } static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m, unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; - const char *space = " "; - int prec = tgid ? 10 : 2; + const char *space = " "; + int prec = tgid ? 12 : 2; print_event_info(buf, m); - seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); - seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); - seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); - seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); - seq_printf(m, "# %.*s||| / delay\n", prec, space); - seq_printf(m, "# TASK-PID %.*sCPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID "); - seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | "); + seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); + seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); + seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); + seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); + seq_printf(m, "# %.*s||| / delay\n", prec, space); + seq_printf(m, "# TASK-PID %.*s CPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID "); + seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | "); } void @@ -5122,10 +5142,10 @@ static const char readme_msg[] = "\t -:[<group>/]<event>\n" #ifdef CONFIG_KPROBE_EVENTS "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" - "place (kretprobe): [<module>:]<symbol>[+<offset>]|<memaddr>\n" + "place (kretprobe): [<module>:]<symbol>[+<offset>]%return|<memaddr>\n" #endif #ifdef CONFIG_UPROBE_EVENTS - " place (uprobe): <path>:<offset>[(ref_ctr_offset)]\n" + " place (uprobe): <path>:<offset>[%return][(ref_ctr_offset)]\n" #endif "\t args: <name>=fetcharg[:type]\n" "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n" @@ -5249,7 +5269,12 @@ static const char readme_msg[] = "\t trace(<synthetic_event>,param list) - generate synthetic event\n" "\t save(field,...) - save current event fields\n" #ifdef CONFIG_TRACER_SNAPSHOT - "\t snapshot() - snapshot the trace buffer\n" + "\t snapshot() - snapshot the trace buffer\n\n" +#endif +#ifdef CONFIG_SYNTH_EVENTS + " events/synthetic_events\t- Create/append/remove/show synthetic events\n" + "\t Write into this file to define/undefine new synthetic events.\n" + "\t example: echo 'myevent u64 lat; char name[]' >> synthetic_events\n" #endif #endif ; @@ -6662,7 +6687,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, written = -EFAULT; } else written = cnt; - len = cnt; if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) { /* do not add \n before testing triggers, but add \0 */ @@ -6676,6 +6700,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, } else entry->buf[cnt] = '\0'; + if (static_branch_unlikely(&trace_marker_exports_enabled)) + ftrace_exports(event, TRACE_EXPORT_MARKER); __buffer_unlock_commit(buffer, event); if (tt) @@ -8636,6 +8662,24 @@ struct trace_array *trace_array_find_get(const char *instance) return tr; } +static int trace_array_create_dir(struct trace_array *tr) +{ + int ret; + + tr->dir = tracefs_create_dir(tr->name, trace_instance_dir); + if (!tr->dir) + return -EINVAL; + + ret = event_trace_add_tracer(tr->dir, tr); + if (ret) + tracefs_remove(tr->dir); + + init_tracer_tracefs(tr, tr->dir); + __update_tracer_options(tr); + + return ret; +} + static struct trace_array *trace_array_create(const char *name) { struct trace_array *tr; @@ -8671,30 +8715,28 @@ static struct trace_array *trace_array_create(const char *name) if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; - tr->dir = tracefs_create_dir(name, trace_instance_dir); - if (!tr->dir) - goto out_free_tr; - - ret = event_trace_add_tracer(tr->dir, tr); - if (ret) { - tracefs_remove(tr->dir); + if (ftrace_allocate_ftrace_ops(tr) < 0) goto out_free_tr; - } ftrace_init_trace_array(tr); - init_tracer_tracefs(tr, tr->dir); init_trace_flags_index(tr); - __update_tracer_options(tr); + + if (trace_instance_dir) { + ret = trace_array_create_dir(tr); + if (ret) + goto out_free_tr; + } else + __trace_early_add_events(tr); list_add(&tr->list, &ftrace_trace_arrays); tr->ref++; - return tr; out_free_tr: + ftrace_free_ftrace_ops(tr); free_trace_buffers(tr); free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); @@ -8799,7 +8841,6 @@ static int __remove_instance(struct trace_array *tr) free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); - tr = NULL; return 0; } @@ -8853,11 +8894,27 @@ static int instance_rmdir(const char *name) static __init void create_trace_instances(struct dentry *d_tracer) { + struct trace_array *tr; + trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer, instance_mkdir, instance_rmdir); if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n")) return; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->name) + continue; + if (MEM_FAIL(trace_array_create_dir(tr) < 0, + "Failed to create instance directory\n")) + break; + } + + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); } static void @@ -8971,21 +9028,21 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) * directory. It is called via fs_initcall() by any of the boot up code * and expects to return the dentry of the top level tracing directory. */ -struct dentry *tracing_init_dentry(void) +int tracing_init_dentry(void) { struct trace_array *tr = &global_trace; if (security_locked_down(LOCKDOWN_TRACEFS)) { pr_warn("Tracing disabled due to lockdown\n"); - return ERR_PTR(-EPERM); + return -EPERM; } /* The top level trace array uses NULL as parent */ if (tr->dir) - return NULL; + return 0; if (WARN_ON(!tracefs_initialized())) - return ERR_PTR(-ENODEV); + return -ENODEV; /* * As there may still be users that expect the tracing @@ -8996,7 +9053,7 @@ struct dentry *tracing_init_dentry(void) tr->dir = debugfs_create_automount("tracing", NULL, trace_automount, NULL); - return NULL; + return 0; } extern struct trace_eval_map *__start_ftrace_eval_maps[]; @@ -9072,7 +9129,7 @@ static int trace_module_notify(struct notifier_block *self, break; } - return 0; + return NOTIFY_OK; } static struct notifier_block trace_module_nb = { @@ -9083,48 +9140,48 @@ static struct notifier_block trace_module_nb = { static __init int tracer_init_tracefs(void) { - struct dentry *d_tracer; + int ret; trace_access_lock_init(); - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return 0; event_trace_init(); - init_tracer_tracefs(&global_trace, d_tracer); - ftrace_init_tracefs_toplevel(&global_trace, d_tracer); + init_tracer_tracefs(&global_trace, NULL); + ftrace_init_tracefs_toplevel(&global_trace, NULL); - trace_create_file("tracing_thresh", 0644, d_tracer, + trace_create_file("tracing_thresh", 0644, NULL, &global_trace, &tracing_thresh_fops); - trace_create_file("README", 0444, d_tracer, + trace_create_file("README", 0444, NULL, NULL, &tracing_readme_fops); - trace_create_file("saved_cmdlines", 0444, d_tracer, + trace_create_file("saved_cmdlines", 0444, NULL, NULL, &tracing_saved_cmdlines_fops); - trace_create_file("saved_cmdlines_size", 0644, d_tracer, + trace_create_file("saved_cmdlines_size", 0644, NULL, NULL, &tracing_saved_cmdlines_size_fops); - trace_create_file("saved_tgids", 0444, d_tracer, + trace_create_file("saved_tgids", 0444, NULL, NULL, &tracing_saved_tgids_fops); trace_eval_init(); - trace_create_eval_file(d_tracer); + trace_create_eval_file(NULL); #ifdef CONFIG_MODULES register_module_notifier(&trace_module_nb); #endif #ifdef CONFIG_DYNAMIC_FTRACE - trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, + trace_create_file("dyn_ftrace_total_info", 0444, NULL, NULL, &tracing_dyn_info_fops); #endif - create_trace_instances(d_tracer); + create_trace_instances(NULL); update_tracer_options(&global_trace); @@ -9287,7 +9344,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) } /* - * We need to stop all tracing on all CPUS to read the + * We need to stop all tracing on all CPUS to read * the next buffer. This is a bit expensive, but is * not done often. We fill all what we can read, * and then release the locks again. @@ -9430,7 +9487,7 @@ __init static int tracer_alloc_buffers(void) } /* - * Make sure we don't accidently add more trace options + * Make sure we don't accidentally add more trace options * than we have bits for. */ BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE); @@ -9459,7 +9516,7 @@ __init static int tracer_alloc_buffers(void) /* * The prepare callbacks allocates some memory for the ring buffer. We - * don't free the buffer if the if the CPU goes down. If we were to free + * don't free the buffer if the CPU goes down. If we were to free * the buffer, then the user would lose any trace that was in the * buffer. The memory will be removed once the "instance" is removed. */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 610d21355526..f3f5e77123ad 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -19,6 +19,7 @@ #include <linux/glob.h> #include <linux/irq_work.h> #include <linux/workqueue.h> +#include <linux/ctype.h> #ifdef CONFIG_FTRACE_SYSCALLS #include <asm/unistd.h> /* For NR_SYSCALLS */ @@ -98,7 +99,7 @@ enum trace_type { /* Use this for memory failure errors */ #define MEM_FAIL(condition, fmt, ...) ({ \ - static bool __section(.data.once) __warned; \ + static bool __section(".data.once") __warned; \ int __ret_warn_once = !!(condition); \ \ if (unlikely(__ret_warn_once && !__warned)) { \ @@ -246,7 +247,7 @@ typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data); * tracing_snapshot_cond(tr, cond_data), the cond_data passed in is * passed in turn to the cond_snapshot.update() function. That data * can be compared by the update() implementation with the cond_data - * contained wihin the struct cond_snapshot instance associated with + * contained within the struct cond_snapshot instance associated with * the trace_array. Because the tr->max_lock is held throughout the * update() call, the update() function can directly retrieve the * cond_snapshot and cond_data associated with the per-instance @@ -271,7 +272,7 @@ typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data); * take the snapshot, by returning 'true' if so, 'false' if no * snapshot should be taken. Because the max_lock is held for * the duration of update(), the implementation is safe to - * directly retrieven and save any implementation data it needs + * directly retrieved and save any implementation data it needs * to in association with the snapshot. */ struct cond_snapshot { @@ -573,7 +574,7 @@ struct tracer { * The function callback, which can use the FTRACE bits to * check for recursion. * - * Now if the arch does not suppport a feature, and it calls + * Now if the arch does not support a feature, and it calls * the global list function which calls the ftrace callback * all three of these steps will do a recursion protection. * There's no reason to do one if the previous caller already @@ -737,7 +738,7 @@ struct dentry *trace_create_file(const char *name, void *data, const struct file_operations *fops); -struct dentry *tracing_init_dentry(void); +int tracing_init_dentry(void); struct ring_buffer_event; @@ -1125,6 +1126,8 @@ extern int ftrace_is_dead(void); int ftrace_create_function_files(struct trace_array *tr, struct dentry *parent); void ftrace_destroy_function_files(struct trace_array *tr); +int ftrace_allocate_ftrace_ops(struct trace_array *tr); +void ftrace_free_ftrace_ops(struct trace_array *tr); void ftrace_init_global_array_ops(struct trace_array *tr); void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); void ftrace_reset_array_ops(struct trace_array *tr); @@ -1146,6 +1149,11 @@ ftrace_create_function_files(struct trace_array *tr, { return 0; } +static inline int ftrace_allocate_ftrace_ops(struct trace_array *tr) +{ + return 0; +} +static inline void ftrace_free_ftrace_ops(struct trace_array *tr) { } static inline void ftrace_destroy_function_files(struct trace_array *tr) { } static inline __init void ftrace_init_global_array_ops(struct trace_array *tr) { } @@ -1472,7 +1480,7 @@ __trace_event_discard_commit(struct trace_buffer *buffer, /* * Helper function for event_trigger_unlock_commit{_regs}(). * If there are event triggers attached to this event that requires - * filtering against its fields, then they wil be called as the + * filtering against its fields, then they will be called as the * entry already holds the field information of the current event. * * It also checks if the event should be discarded or not. @@ -1651,6 +1659,7 @@ extern void trace_event_enable_tgid_record(bool enable); extern int event_trace_init(void); extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern int event_trace_del_tracer(struct trace_array *tr); +extern void __trace_early_add_events(struct trace_array *tr); extern struct trace_event_file *__find_event_file(struct trace_array *tr, const char *system, @@ -2082,4 +2091,16 @@ static __always_inline void trace_iterator_reset(struct trace_iterator *iter) iter->pos = -1; } +/* Check the name is good for event/group/fields */ +static inline bool is_good_name(const char *name) +{ + if (!isalpha(*name) && *name != '_') + return false; + while (*++name != '\0') { + if (!isalpha(*name) && !isdigit(*name) && *name != '_') + return false; + } + return true; +} + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index fa0fc08c6ef8..c22a152ef0b4 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -40,6 +40,16 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node) pr_err("Failed to set option: %s\n", buf); } + p = xbc_node_find_value(node, "tracing_on", NULL); + if (p && *p != '\0') { + if (kstrtoul(p, 10, &v)) + pr_err("Failed to set tracing on: %s\n", p); + if (v) + tracer_tracing_on(tr); + else + tracer_tracing_off(tr); + } + p = xbc_node_find_value(node, "trace_clock", NULL); if (p && *p != '\0') { if (tracing_set_clock(tr, p) < 0) @@ -274,6 +284,12 @@ trace_boot_enable_tracer(struct trace_array *tr, struct xbc_node *node) if (tracing_set_tracer(tr, p) < 0) pr_err("Failed to set given tracer: %s\n", p); } + + /* Since tracer can free snapshot buffer, allocate snapshot here.*/ + if (xbc_node_find_value(node, "alloc_snapshot", NULL)) { + if (tracing_alloc_snapshot_instance(tr) < 0) + pr_err("Failed to allocate snapshot buffer\n"); + } } static void __init @@ -330,5 +346,8 @@ static int __init trace_boot_init(void) return 0; } - -fs_initcall(trace_boot_init); +/* + * Start tracing at the end of core-initcall, so that it starts tracing + * from the beginning of postcore_initcall. + */ +core_initcall_sync(trace_boot_init); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 9f2e8520b748..5fa49cfd2bb6 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -206,14 +206,14 @@ static const struct file_operations dynamic_events_ops = { /* Make a tracefs interface for controlling dynamic events */ static __init int init_dynamic_event(void) { - struct dentry *d_tracer; struct dentry *entry; + int ret; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return 0; - entry = tracefs_create_file("dynamic_events", 0644, d_tracer, + entry = tracefs_create_file("dynamic_events", 0644, NULL, NULL, &dynamic_events_ops); /* Event list interface */ @@ -402,7 +402,7 @@ void dynevent_arg_init(struct dynevent_arg *arg, * whitespace, all followed by a separator, if applicable. After the * first arg string is successfully appended to the command string, * the optional @operator is appended, followed by the second arg and - * and optional @separator. If no separator was specified when + * optional @separator. If no separator was specified when * initializing the arg, a space will be appended. */ void dynevent_arg_pair_init(struct dynevent_arg_pair *arg_pair, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a85effb2373b..47a71f96e5bc 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -38,6 +38,7 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); static LIST_HEAD(ftrace_generic_fields); static LIST_HEAD(ftrace_common_fields); +static bool eventdir_initialized; #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) @@ -2124,11 +2125,47 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } static int +event_define_fields(struct trace_event_call *call) +{ + struct list_head *head; + int ret = 0; + + /* + * Other events may have the same class. Only update + * the fields if they are not already defined. + */ + head = trace_get_fields(call); + if (list_empty(head)) { + struct trace_event_fields *field = call->class->fields_array; + unsigned int offset = sizeof(struct trace_entry); + + for (; field->type; field++) { + if (field->type == TRACE_FUNCTION_TYPE) { + field->define_fields(call); + break; + } + + offset = ALIGN(offset, field->align); + ret = trace_define_field(call, field->type, field->name, + offset, field->size, + field->is_signed, field->filter_type); + if (WARN_ON_ONCE(ret)) { + pr_err("error code is %d\n", ret); + break; + } + + offset += field->size; + } + } + + return ret; +} + +static int event_create_dir(struct dentry *parent, struct trace_event_file *file) { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; - struct list_head *head; struct dentry *d_events; const char *name; int ret; @@ -2162,35 +2199,10 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) &ftrace_event_id_fops); #endif - /* - * Other events may have the same class. Only update - * the fields if they are not already defined. - */ - head = trace_get_fields(call); - if (list_empty(head)) { - struct trace_event_fields *field = call->class->fields_array; - unsigned int offset = sizeof(struct trace_entry); - - for (; field->type; field++) { - if (field->type == TRACE_FUNCTION_TYPE) { - ret = field->define_fields(call); - break; - } - - offset = ALIGN(offset, field->align); - ret = trace_define_field(call, field->type, field->name, - offset, field->size, - field->is_signed, field->filter_type); - if (ret) - break; - - offset += field->size; - } - if (ret < 0) { - pr_warn("Could not initialize trace point events/%s\n", - name); - return -1; - } + ret = event_define_fields(call); + if (ret < 0) { + pr_warn("Could not initialize trace point events/%s\n", name); + return ret; } /* @@ -2475,7 +2487,10 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) if (!file) return -ENOMEM; - return event_create_dir(tr->event_dir, file); + if (eventdir_initialized) + return event_create_dir(tr->event_dir, file); + else + return event_define_fields(call); } /* @@ -2483,7 +2498,7 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) * for enabling events at boot. We want to enable events before * the filesystem is initialized. */ -static __init int +static int __trace_early_add_new_event(struct trace_event_call *call, struct trace_array *tr) { @@ -2493,7 +2508,7 @@ __trace_early_add_new_event(struct trace_event_call *call, if (!file) return -ENOMEM; - return 0; + return event_define_fields(call); } struct ftrace_module_file_ops; @@ -2646,7 +2661,7 @@ static int trace_module_notify(struct notifier_block *self, mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); - return 0; + return NOTIFY_OK; } static struct notifier_block trace_module_nb = { @@ -3116,14 +3131,13 @@ static inline int register_event_cmds(void) { return 0; } #endif /* CONFIG_DYNAMIC_FTRACE */ /* - * The top level array has already had its trace_event_file - * descriptors created in order to allow for early events to - * be recorded. This function is called after the tracefs has been - * initialized, and we now have to create the files associated - * to the events. + * The top level array and trace arrays created by boot-time tracing + * have already had its trace_event_file descriptors created in order + * to allow for early events to be recorded. + * This function is called after the tracefs has been initialized, + * and we now have to create the files associated to the events. */ -static __init void -__trace_early_add_event_dirs(struct trace_array *tr) +static void __trace_early_add_event_dirs(struct trace_array *tr) { struct trace_event_file *file; int ret; @@ -3138,13 +3152,12 @@ __trace_early_add_event_dirs(struct trace_array *tr) } /* - * For early boot up, the top trace array requires to have - * a list of events that can be enabled. This must be done before - * the filesystem is set up in order to allow events to be traced - * early. + * For early boot up, the top trace array and the trace arrays created + * by boot-time tracing require to have a list of events that can be + * enabled. This must be done before the filesystem is set up in order + * to allow events to be traced early. */ -static __init void -__trace_early_add_events(struct trace_array *tr) +void __trace_early_add_events(struct trace_array *tr) { struct trace_event_call *call; int ret; @@ -3275,7 +3288,11 @@ int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) goto out; down_write(&trace_event_sem); - __trace_add_event_dirs(tr); + /* If tr already has the event list, it is initialized in early boot. */ + if (unlikely(!list_empty(&tr->events))) + __trace_early_add_event_dirs(tr); + else + __trace_add_event_dirs(tr); up_write(&trace_event_sem); out: @@ -3431,10 +3448,21 @@ static __init int event_trace_enable_again(void) early_initcall(event_trace_enable_again); +/* Init fields which doesn't related to the tracefs */ +static __init int event_trace_init_fields(void) +{ + if (trace_define_generic_fields()) + pr_warn("tracing: Failed to allocated generic fields"); + + if (trace_define_common_fields()) + pr_warn("tracing: Failed to allocate common fields"); + + return 0; +} + __init int event_trace_init(void) { struct trace_array *tr; - struct dentry *d_tracer; struct dentry *entry; int ret; @@ -3442,22 +3470,12 @@ __init int event_trace_init(void) if (!tr) return -ENODEV; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) - return 0; - - entry = tracefs_create_file("available_events", 0444, d_tracer, + entry = tracefs_create_file("available_events", 0444, NULL, tr, &ftrace_avail_fops); if (!entry) pr_warn("Could not create tracefs 'available_events' entry\n"); - if (trace_define_generic_fields()) - pr_warn("tracing: Failed to allocated generic fields"); - - if (trace_define_common_fields()) - pr_warn("tracing: Failed to allocate common fields"); - - ret = early_event_add_tracer(d_tracer, tr); + ret = early_event_add_tracer(NULL, tr); if (ret) return ret; @@ -3466,6 +3484,9 @@ __init int event_trace_init(void) if (ret) pr_warn("Failed to register trace events module notifier\n"); #endif + + eventdir_initialized = true; + return 0; } @@ -3474,6 +3495,7 @@ void __init trace_event_init(void) event_trace_memsetup(); init_ftrace_syscalls(); event_trace_enable(); + event_trace_init_fields(); } #ifdef CONFIG_EVENT_TRACE_STARTUP_TEST diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index bf44f6bbd0c3..78a678eeb140 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -499,7 +499,7 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, ptr++; break; } - /* fall through */ + fallthrough; default: parse_error(pe, FILT_ERR_TOO_MANY_PREDS, next - str); @@ -1273,7 +1273,7 @@ static int parse_pred(const char *str, void *data, switch (op) { case OP_NE: pred->not = 1; - /* Fall through */ + fallthrough; case OP_GLOB: case OP_EQ: break; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0b933546142e..96c3f86b81c5 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -147,6 +147,8 @@ struct hist_field { */ unsigned int var_ref_idx; bool read_once; + + unsigned int var_str_idx; }; static u64 hist_field_none(struct hist_field *field, @@ -349,6 +351,7 @@ struct hist_trigger_data { unsigned int n_keys; unsigned int n_fields; unsigned int n_vars; + unsigned int n_var_str; unsigned int key_size; struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX]; unsigned int n_sort_keys; @@ -1396,7 +1399,14 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) } } - n_str = hist_data->n_field_var_str + hist_data->n_save_var_str; + n_str = hist_data->n_field_var_str + hist_data->n_save_var_str + + hist_data->n_var_str; + if (n_str > SYNTH_FIELDS_MAX) { + hist_elt_data_free(elt_data); + return -EINVAL; + } + + BUILD_BUG_ON(STR_VAR_LEN_MAX & (sizeof(u64) - 1)); size = STR_VAR_LEN_MAX; @@ -3279,6 +3289,15 @@ static int check_synth_field(struct synth_event *event, field = event->fields[field_pos]; + /* + * A dynamic string synth field can accept static or + * dynamic. A static string synth field can only accept a + * same-sized static string, which is checked for later. + */ + if (strstr(hist_field->type, "char[") && field->is_string + && field->is_dynamic) + return 0; + if (strcmp(field->type, hist_field->type) != 0) { if (field->size != hist_field->size || field->is_signed != hist_field->is_signed) @@ -3651,6 +3670,7 @@ static int create_var_field(struct hist_trigger_data *hist_data, { struct trace_array *tr = hist_data->event_file->tr; unsigned long flags = 0; + int ret; if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) return -EINVAL; @@ -3665,7 +3685,12 @@ static int create_var_field(struct hist_trigger_data *hist_data, if (WARN_ON(hist_data->n_vars > TRACING_MAP_VARS_MAX)) return -EINVAL; - return __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags); + ret = __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags); + + if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_STRING) + hist_data->fields[val_idx]->var_str_idx = hist_data->n_var_str++; + + return ret; } static int create_val_fields(struct hist_trigger_data *hist_data, @@ -3865,7 +3890,6 @@ static int parse_var_defs(struct hist_trigger_data *hist_data) s = kstrdup(field_str, GFP_KERNEL); if (!s) { - kfree(hist_data->attrs->var_defs.name[n_vars]); ret = -ENOMEM; goto free; } @@ -4393,6 +4417,22 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, hist_val = hist_field->fn(hist_field, elt, rbe, rec); if (hist_field->flags & HIST_FIELD_FL_VAR) { var_idx = hist_field->var.idx; + + if (hist_field->flags & HIST_FIELD_FL_STRING) { + unsigned int str_start, var_str_idx, idx; + char *str, *val_str; + + str_start = hist_data->n_field_var_str + + hist_data->n_save_var_str; + var_str_idx = hist_field->var_str_idx; + idx = str_start + var_str_idx; + + str = elt_data->field_var_str[idx]; + val_str = (char *)(uintptr_t)hist_val; + strscpy(str, val_str, STR_VAR_LEN_MAX); + + hist_val = (u64)(uintptr_t)str; + } tracing_map_set_var(elt, var_idx, hist_val); continue; } diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index c6cca0d1d584..84b7cab55291 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -20,6 +20,48 @@ #include "trace_synth.h" +#undef ERRORS +#define ERRORS \ + C(BAD_NAME, "Illegal name"), \ + C(CMD_INCOMPLETE, "Incomplete command"), \ + C(EVENT_EXISTS, "Event already exists"), \ + C(TOO_MANY_FIELDS, "Too many fields"), \ + C(INCOMPLETE_TYPE, "Incomplete type"), \ + C(INVALID_TYPE, "Invalid type"), \ + C(INVALID_FIELD, "Invalid field"), \ + C(CMD_TOO_LONG, "Command too long"), + +#undef C +#define C(a, b) SYNTH_ERR_##a + +enum { ERRORS }; + +#undef C +#define C(a, b) b + +static const char *err_text[] = { ERRORS }; + +static char last_cmd[MAX_FILTER_STR_VAL]; + +static int errpos(const char *str) +{ + return err_pos(last_cmd, str); +} + +static void last_cmd_set(char *str) +{ + if (!str) + return; + + strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1); +} + +static void synth_err(u8 err_type, u8 err_pos) +{ + tracing_log_err(NULL, "synthetic_events", last_cmd, err_text, + err_type, err_pos); +} + static int create_synth_event(int argc, const char **argv); static int synth_event_show(struct seq_file *m, struct dyn_event *ev); static int synth_event_release(struct dyn_event *ev); @@ -88,7 +130,7 @@ static int synth_event_define_fields(struct trace_event_call *call) event->fields[i]->offset = n_u64; - if (event->fields[i]->is_string) { + if (event->fields[i]->is_string && !event->fields[i]->is_dynamic) { offset += STR_VAR_LEN_MAX; n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { @@ -132,13 +174,16 @@ static int synth_field_string_size(char *type) start += sizeof("char[") - 1; end = strchr(type, ']'); - if (!end || end < start) + if (!end || end < start || type + strlen(type) > end + 1) return -EINVAL; len = end - start; if (len > 3) return -EINVAL; + if (len == 0) + return 0; /* variable-length string */ + strncpy(buf, start, len); buf[len] = '\0'; @@ -184,6 +229,8 @@ static int synth_field_size(char *type) size = sizeof(long); else if (strcmp(type, "unsigned long") == 0) size = sizeof(unsigned long); + else if (strcmp(type, "bool") == 0) + size = sizeof(bool); else if (strcmp(type, "pid_t") == 0) size = sizeof(pid_t); else if (strcmp(type, "gfp_t") == 0) @@ -226,12 +273,14 @@ static const char *synth_field_fmt(char *type) fmt = "%ld"; else if (strcmp(type, "unsigned long") == 0) fmt = "%lu"; + else if (strcmp(type, "bool") == 0) + fmt = "%d"; else if (strcmp(type, "pid_t") == 0) fmt = "%d"; else if (strcmp(type, "gfp_t") == 0) fmt = "%x"; else if (synth_field_is_string(type)) - fmt = "%s"; + fmt = "%.*s"; return fmt; } @@ -290,10 +339,27 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, /* parameter values */ if (se->fields[i]->is_string) { - trace_seq_printf(s, print_fmt, se->fields[i]->name, - (char *)&entry->fields[n_u64], - i == se->n_fields - 1 ? "" : " "); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + if (se->fields[i]->is_dynamic) { + u32 offset, data_offset; + char *str_field; + + offset = (u32)entry->fields[n_u64]; + data_offset = offset & 0xffff; + + str_field = (char *)entry + data_offset; + + trace_seq_printf(s, print_fmt, se->fields[i]->name, + STR_VAR_LEN_MAX, + str_field, + i == se->n_fields - 1 ? "" : " "); + n_u64++; + } else { + trace_seq_printf(s, print_fmt, se->fields[i]->name, + STR_VAR_LEN_MAX, + (char *)&entry->fields[n_u64], + i == se->n_fields - 1 ? "" : " "); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } } else { struct trace_print_flags __flags[] = { __def_gfpflag_names, {-1, NULL} }; @@ -325,16 +391,52 @@ static struct trace_event_functions synth_event_funcs = { .trace = print_synth_event }; +static unsigned int trace_string(struct synth_trace_event *entry, + struct synth_event *event, + char *str_val, + bool is_dynamic, + unsigned int data_size, + unsigned int *n_u64) +{ + unsigned int len = 0; + char *str_field; + + if (is_dynamic) { + u32 data_offset; + + data_offset = offsetof(typeof(*entry), fields); + data_offset += event->n_u64 * sizeof(u64); + data_offset += data_size; + + str_field = (char *)entry + data_offset; + + len = strlen(str_val) + 1; + strscpy(str_field, str_val, len); + + data_offset |= len << 16; + *(u32 *)&entry->fields[*n_u64] = data_offset; + + (*n_u64)++; + } else { + str_field = (char *)&entry->fields[*n_u64]; + + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + (*n_u64) += STR_VAR_LEN_MAX / sizeof(u64); + } + + return len; +} + static notrace void trace_event_raw_event_synth(void *__data, u64 *var_ref_vals, unsigned int *var_ref_idx) { + unsigned int i, n_u64, val_idx, len, data_size = 0; struct trace_event_file *trace_file = __data; struct synth_trace_event *entry; struct trace_event_buffer fbuffer; struct trace_buffer *buffer; struct synth_event *event; - unsigned int i, n_u64, val_idx; int fields_size = 0; event = trace_file->event_call->data; @@ -344,6 +446,18 @@ static notrace void trace_event_raw_event_synth(void *__data, fields_size = event->n_u64 * sizeof(u64); + for (i = 0; i < event->n_dynamic_fields; i++) { + unsigned int field_pos = event->dynamic_fields[i]->field_pos; + char *str_val; + + val_idx = var_ref_idx[field_pos]; + str_val = (char *)(long)var_ref_vals[val_idx]; + + len = strlen(str_val) + 1; + + fields_size += len; + } + /* * Avoid ring buffer recursion detection, as this event * is being performed within another event. @@ -360,10 +474,11 @@ static notrace void trace_event_raw_event_synth(void *__data, val_idx = var_ref_idx[i]; if (event->fields[i]->is_string) { char *str_val = (char *)(long)var_ref_vals[val_idx]; - char *str_field = (char *)&entry->fields[n_u64]; - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + len = trace_string(entry, event, str_val, + event->fields[i]->is_dynamic, + data_size, &n_u64); + data_size += len; /* only dynamic string increments */ } else { struct synth_field *field = event->fields[i]; u64 val = var_ref_vals[val_idx]; @@ -422,8 +537,13 @@ static int __set_synth_event_print_fmt(struct synth_event *event, pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); for (i = 0; i < event->n_fields; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, - ", REC->%s", event->fields[i]->name); + if (event->fields[i]->is_string && + event->fields[i]->is_dynamic) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", __get_str(%s)", event->fields[i]->name); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", REC->%s", event->fields[i]->name); } #undef LEN_OR_ZERO @@ -465,13 +585,17 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, struct synth_field *field; const char *prefix = NULL, *field_type = argv[0], *field_name, *array; int len, ret = 0; + struct seq_buf s; + ssize_t size; if (field_type[0] == ';') field_type++; if (!strcmp(field_type, "unsigned")) { - if (argc < 3) + if (argc < 3) { + synth_err(SYNTH_ERR_INCOMPLETE_TYPE, errpos(field_type)); return ERR_PTR(-EINVAL); + } prefix = "unsigned "; field_type = argv[1]; field_name = argv[2]; @@ -497,12 +621,19 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, ret = -ENOMEM; goto free; } + if (!is_good_name(field->name)) { + synth_err(SYNTH_ERR_BAD_NAME, errpos(field_name)); + ret = -EINVAL; + goto free; + } if (field_type[0] == ';') field_type++; len = strlen(field_type) + 1; + if (array) len += strlen(array); + if (prefix) len += strlen(prefix); @@ -511,26 +642,60 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, ret = -ENOMEM; goto free; } + seq_buf_init(&s, field->type, len); if (prefix) - strcat(field->type, prefix); - strcat(field->type, field_type); + seq_buf_puts(&s, prefix); + seq_buf_puts(&s, field_type); if (array) { - strcat(field->type, array); - if (field->type[len - 1] == ';') - field->type[len - 1] = '\0'; + seq_buf_puts(&s, array); + if (s.buffer[s.len - 1] == ';') + s.len--; } + if (WARN_ON_ONCE(!seq_buf_buffer_left(&s))) + goto free; + s.buffer[s.len] = '\0'; - field->size = synth_field_size(field->type); - if (!field->size) { + size = synth_field_size(field->type); + if (size < 0) { + synth_err(SYNTH_ERR_INVALID_TYPE, errpos(field_type)); ret = -EINVAL; goto free; + } else if (size == 0) { + if (synth_field_is_string(field->type)) { + char *type; + + len = sizeof("__data_loc ") + strlen(field->type) + 1; + type = kzalloc(len, GFP_KERNEL); + if (!type) { + ret = -ENOMEM; + goto free; + } + + seq_buf_init(&s, type, len); + seq_buf_puts(&s, "__data_loc "); + seq_buf_puts(&s, field->type); + + if (WARN_ON_ONCE(!seq_buf_buffer_left(&s))) + goto free; + s.buffer[s.len] = '\0'; + + kfree(field->type); + field->type = type; + + field->is_dynamic = true; + size = sizeof(u64); + } else { + synth_err(SYNTH_ERR_INVALID_TYPE, errpos(field_type)); + ret = -EINVAL; + goto free; + } } + field->size = size; if (synth_field_is_string(field->type)) field->is_string = true; field->is_signed = synth_field_signed(field->type); - out: return field; free: @@ -661,6 +826,7 @@ static void free_synth_event(struct synth_event *event) free_synth_field(event->fields[i]); kfree(event->fields); + kfree(event->dynamic_fields); kfree(event->name); kfree(event->class.system); free_synth_tracepoint(event->tp); @@ -671,8 +837,8 @@ static void free_synth_event(struct synth_event *event) static struct synth_event *alloc_synth_event(const char *name, int n_fields, struct synth_field **fields) { + unsigned int i, j, n_dynamic_fields = 0; struct synth_event *event; - unsigned int i; event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) { @@ -694,11 +860,33 @@ static struct synth_event *alloc_synth_event(const char *name, int n_fields, goto out; } + for (i = 0; i < n_fields; i++) + if (fields[i]->is_dynamic) + n_dynamic_fields++; + + if (n_dynamic_fields) { + event->dynamic_fields = kcalloc(n_dynamic_fields, + sizeof(*event->dynamic_fields), + GFP_KERNEL); + if (!event->dynamic_fields) { + free_synth_event(event); + event = ERR_PTR(-ENOMEM); + goto out; + } + } + dyn_event_init(&event->devent, &synth_event_ops); - for (i = 0; i < n_fields; i++) + for (i = 0, j = 0; i < n_fields; i++) { event->fields[i] = fields[i]; + if (fields[i]->is_dynamic) { + event->dynamic_fields[j] = fields[i]; + event->dynamic_fields[j]->field_pos = i; + event->dynamic_fields[j++] = fields[i]; + event->n_dynamic_fields++; + } + } event->n_fields = n_fields; out: return event; @@ -710,6 +898,10 @@ static int synth_event_check_arg_fn(void *data) int size; size = synth_field_size((char *)arg_pair->lhs); + if (size == 0) { + if (strstr((char *)arg_pair->lhs, "[")) + return 0; + } return size ? 0 : -EINVAL; } @@ -971,12 +1163,47 @@ int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name, } EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start); +static int save_cmdstr(int argc, const char *name, const char **argv) +{ + struct seq_buf s; + char *buf; + int i; + + buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + seq_buf_init(&s, buf, MAX_DYNEVENT_CMD_LEN); + + seq_buf_puts(&s, name); + + for (i = 0; i < argc; i++) { + seq_buf_putc(&s, ' '); + seq_buf_puts(&s, argv[i]); + } + + if (!seq_buf_buffer_left(&s)) { + synth_err(SYNTH_ERR_CMD_TOO_LONG, 0); + kfree(buf); + return -EINVAL; + } + buf[s.len] = 0; + last_cmd_set(buf); + + kfree(buf); + return 0; +} + static int __create_synth_event(int argc, const char *name, const char **argv) { struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; struct synth_event *event = NULL; int i, consumed = 0, n_fields = 0, ret = 0; + ret = save_cmdstr(argc, name, argv); + if (ret) + return ret; + /* * Argument syntax: * - Add synthetic event: <event_name> field[;field] ... @@ -984,13 +1211,22 @@ static int __create_synth_event(int argc, const char *name, const char **argv) * where 'field' = type field_name */ - if (name[0] == '\0' || argc < 1) + if (name[0] == '\0' || argc < 1) { + synth_err(SYNTH_ERR_CMD_INCOMPLETE, 0); return -EINVAL; + } mutex_lock(&event_mutex); + if (!is_good_name(name)) { + synth_err(SYNTH_ERR_BAD_NAME, errpos(name)); + ret = -EINVAL; + goto out; + } + event = find_synth_event(name); if (event) { + synth_err(SYNTH_ERR_EVENT_EXISTS, errpos(name)); ret = -EEXIST; goto out; } @@ -999,6 +1235,7 @@ static int __create_synth_event(int argc, const char *name, const char **argv) if (strcmp(argv[i], ";") == 0) continue; if (n_fields == SYNTH_FIELDS_MAX) { + synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0); ret = -EINVAL; goto err; } @@ -1013,6 +1250,7 @@ static int __create_synth_event(int argc, const char *name, const char **argv) } if (i < argc && strcmp(argv[i], ";") != 0) { + synth_err(SYNTH_ERR_INVALID_FIELD, errpos(argv[i])); ret = -EINVAL; goto err; } @@ -1198,10 +1436,9 @@ void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen) EXPORT_SYMBOL_GPL(synth_event_cmd_init); static inline int -__synth_event_trace_start(struct trace_event_file *file, - struct synth_event_trace_state *trace_state) +__synth_event_trace_init(struct trace_event_file *file, + struct synth_event_trace_state *trace_state) { - int entry_size, fields_size = 0; int ret = 0; memset(trace_state, '\0', sizeof(*trace_state)); @@ -1211,7 +1448,7 @@ __synth_event_trace_start(struct trace_event_file *file, * ENABLED bit is set (which attaches the probe thus allowing * this code to be called, etc). Because this is called * directly by the user, we don't have that but we still need - * to honor not logging when disabled. For the the iterated + * to honor not logging when disabled. For the iterated * trace case, we save the enabed state upon start and just * ignore the following data calls. */ @@ -1223,8 +1460,20 @@ __synth_event_trace_start(struct trace_event_file *file, } trace_state->event = file->event_call->data; +out: + return ret; +} + +static inline int +__synth_event_trace_start(struct trace_event_file *file, + struct synth_event_trace_state *trace_state, + int dynamic_fields_size) +{ + int entry_size, fields_size = 0; + int ret = 0; fields_size = trace_state->event->n_u64 * sizeof(u64); + fields_size += dynamic_fields_size; /* * Avoid ring buffer recursion detection, as this event @@ -1241,7 +1490,7 @@ __synth_event_trace_start(struct trace_event_file *file, ring_buffer_nest_end(trace_state->buffer); ret = -EINVAL; } -out: + return ret; } @@ -1274,23 +1523,46 @@ __synth_event_trace_end(struct synth_event_trace_state *trace_state) */ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) { + unsigned int i, n_u64, len, data_size = 0; struct synth_event_trace_state state; - unsigned int i, n_u64; va_list args; int ret; - ret = __synth_event_trace_start(file, &state); + ret = __synth_event_trace_init(file, &state); if (ret) { if (ret == -ENOENT) ret = 0; /* just disabled, not really an error */ return ret; } + if (state.event->n_dynamic_fields) { + va_start(args, n_vals); + + for (i = 0; i < state.event->n_fields; i++) { + u64 val = va_arg(args, u64); + + if (state.event->fields[i]->is_string && + state.event->fields[i]->is_dynamic) { + char *str_val = (char *)(long)val; + + data_size += strlen(str_val) + 1; + } + } + + va_end(args); + } + + ret = __synth_event_trace_start(file, &state, data_size); + if (ret) + return ret; + if (n_vals != state.event->n_fields) { ret = -EINVAL; goto out; } + data_size = 0; + va_start(args, n_vals); for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { u64 val; @@ -1299,10 +1571,11 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) if (state.event->fields[i]->is_string) { char *str_val = (char *)(long)val; - char *str_field = (char *)&state.entry->fields[n_u64]; - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + len = trace_string(state.entry, state.event, str_val, + state.event->fields[i]->is_dynamic, + data_size, &n_u64); + data_size += len; /* only dynamic string increments */ } else { struct synth_field *field = state.event->fields[i]; @@ -1355,29 +1628,46 @@ EXPORT_SYMBOL_GPL(synth_event_trace); int synth_event_trace_array(struct trace_event_file *file, u64 *vals, unsigned int n_vals) { + unsigned int i, n_u64, field_pos, len, data_size = 0; struct synth_event_trace_state state; - unsigned int i, n_u64; + char *str_val; int ret; - ret = __synth_event_trace_start(file, &state); + ret = __synth_event_trace_init(file, &state); if (ret) { if (ret == -ENOENT) ret = 0; /* just disabled, not really an error */ return ret; } + if (state.event->n_dynamic_fields) { + for (i = 0; i < state.event->n_dynamic_fields; i++) { + field_pos = state.event->dynamic_fields[i]->field_pos; + str_val = (char *)(long)vals[field_pos]; + len = strlen(str_val) + 1; + data_size += len; + } + } + + ret = __synth_event_trace_start(file, &state, data_size); + if (ret) + return ret; + if (n_vals != state.event->n_fields) { ret = -EINVAL; goto out; } + data_size = 0; + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { if (state.event->fields[i]->is_string) { char *str_val = (char *)(long)vals[i]; - char *str_field = (char *)&state.entry->fields[n_u64]; - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + len = trace_string(state.entry, state.event, str_val, + state.event->fields[i]->is_dynamic, + data_size, &n_u64); + data_size += len; /* only dynamic string increments */ } else { struct synth_field *field = state.event->fields[i]; u64 val = vals[i]; @@ -1445,9 +1735,17 @@ int synth_event_trace_start(struct trace_event_file *file, if (!trace_state) return -EINVAL; - ret = __synth_event_trace_start(file, trace_state); - if (ret == -ENOENT) - ret = 0; /* just disabled, not really an error */ + ret = __synth_event_trace_init(file, trace_state); + if (ret) { + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + return ret; + } + + if (trace_state->event->n_dynamic_fields) + return -ENOTSUPP; + + ret = __synth_event_trace_start(file, trace_state, 0); return ret; } @@ -1508,6 +1806,11 @@ static int __synth_event_add_val(const char *field_name, u64 val, char *str_val = (char *)(long)val; char *str_field; + if (field->is_dynamic) { /* add_val can't do dynamic strings */ + ret = -EINVAL; + goto out; + } + if (!str_val) { ret = -EINVAL; goto out; @@ -1679,14 +1982,22 @@ static int __synth_event_show(struct seq_file *m, struct synth_event *event) { struct synth_field *field; unsigned int i; + char *type, *t; seq_printf(m, "%s\t", event->name); for (i = 0; i < event->n_fields; i++) { field = event->fields[i]; + type = field->type; + t = strstr(type, "__data_loc"); + if (t) { /* __data_loc belongs in format but not event desc */ + t += sizeof("__data_loc"); + type = t; + } + /* parameter values */ - seq_printf(m, "%s %s%s", field->type, field->name, + seq_printf(m, "%s %s%s", type, field->name, i == event->n_fields - 1 ? "" : "; "); } @@ -1754,25 +2065,31 @@ static const struct file_operations synth_events_fops = { .release = seq_release, }; -static __init int trace_events_synth_init(void) +/* + * Register dynevent at core_initcall. This allows kernel to setup kprobe + * events in postcore_initcall without tracefs. + */ +static __init int trace_events_synth_init_early(void) { - struct dentry *entry = NULL; - struct dentry *d_tracer; int err = 0; err = dyn_event_register(&synth_event_ops); - if (err) { + if (err) pr_warn("Could not register synth_event_ops\n"); - return err; - } - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) { - err = PTR_ERR(d_tracer); + return err; +} +core_initcall(trace_events_synth_init_early); + +static __init int trace_events_synth_init(void) +{ + struct dentry *entry = NULL; + int err = 0; + err = tracing_init_dentry(); + if (err) goto err; - } - entry = tracefs_create_file("synthetic_events", 0644, d_tracer, + entry = tracefs_create_file("synthetic_events", 0644, NULL, NULL, &synth_events_fops); if (!entry) { err = -ENODEV; diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 70d3d0a09053..90f81d33fa3f 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -176,7 +176,7 @@ struct trace_event_call __used event_##call = { \ .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ }; \ static struct trace_event_call __used \ -__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; +__section("_ftrace_events") *__event_##call = &event_##call; #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index dd4dff71d89a..2c2126e1871d 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -34,10 +34,14 @@ enum { TRACE_FUNC_OPT_STACK = 0x1, }; -static int allocate_ftrace_ops(struct trace_array *tr) +int ftrace_allocate_ftrace_ops(struct trace_array *tr) { struct ftrace_ops *ops; + /* The top level array uses the "global_ops" */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return 0; + ops = kzalloc(sizeof(*ops), GFP_KERNEL); if (!ops) return -ENOMEM; @@ -48,15 +52,19 @@ static int allocate_ftrace_ops(struct trace_array *tr) tr->ops = ops; ops->private = tr; + return 0; } +void ftrace_free_ftrace_ops(struct trace_array *tr) +{ + kfree(tr->ops); + tr->ops = NULL; +} int ftrace_create_function_files(struct trace_array *tr, struct dentry *parent) { - int ret; - /* * The top level array uses the "global_ops", and the files are * created on boot up. @@ -64,9 +72,8 @@ int ftrace_create_function_files(struct trace_array *tr, if (tr->flags & TRACE_ARRAY_FL_GLOBAL) return 0; - ret = allocate_ftrace_ops(tr); - if (ret) - return ret; + if (!tr->ops) + return -EINVAL; ftrace_create_filter_files(tr->ops, parent); @@ -76,8 +83,7 @@ int ftrace_create_function_files(struct trace_array *tr, void ftrace_destroy_function_files(struct trace_array *tr) { ftrace_destroy_filter_files(tr->ops); - kfree(tr->ops); - tr->ops = NULL; + ftrace_free_ftrace_ops(tr); } static int function_trace_init(struct trace_array *tr) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4a9c49c08ec9..60d66278aa0d 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1336,13 +1336,13 @@ static const struct file_operations graph_depth_fops = { static __init int init_graph_tracefs(void) { - struct dentry *d_tracer; + int ret; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return 0; - trace_create_file("max_graph_depth", 0644, d_tracer, + trace_create_file("max_graph_depth", 0644, NULL, NULL, &graph_depth_fops); return 0; diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 17873e5d0353..c9ad5c6fbaad 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -538,14 +538,14 @@ static const struct file_operations window_fops = { */ static int init_tracefs(void) { - struct dentry *d_tracer; + int ret; struct dentry *top_dir; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return -ENOMEM; - top_dir = tracefs_create_dir("hwlat_detector", d_tracer); + top_dir = tracefs_create_dir("hwlat_detector", NULL); if (!top_dir) return -ENOMEM; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index aefb6065b508..b911e9f6d9f5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -106,9 +106,10 @@ static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk) static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, struct module *mod) { - int len = strlen(mod->name); + int len = strlen(module_name(mod)); const char *name = trace_kprobe_symbol(tk); - return strncmp(mod->name, name, len) == 0 && name[len] == ':'; + + return strncmp(module_name(mod), name, len) == 0 && name[len] == ':'; } static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) @@ -688,7 +689,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, if (ret) pr_warn("Failed to re-register probe %s on %s: %d\n", trace_probe_name(&tk->tp), - mod->name, ret); + module_name(mod), ret); } } mutex_unlock(&event_mutex); @@ -717,6 +718,9 @@ static int trace_kprobe_create(int argc, const char *argv[]) * p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] * - Add kretprobe: * r[MAXACTIVE][:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] + * Or + * p:[GRP/]EVENT] [MOD:]KSYM[+0]%return [FETCHARGS] + * * Fetch args: * $retval : fetch return value * $stack : fetch stack address @@ -746,7 +750,6 @@ static int trace_kprobe_create(int argc, const char *argv[]) switch (argv[0][0]) { case 'r': is_return = true; - flags |= TPARG_FL_RETURN; break; case 'p': break; @@ -804,12 +807,26 @@ static int trace_kprobe_create(int argc, const char *argv[]) symbol = kstrdup(argv[1], GFP_KERNEL); if (!symbol) return -ENOMEM; + + tmp = strchr(symbol, '%'); + if (tmp) { + if (!strcmp(tmp, "%return")) { + *tmp = '\0'; + is_return = true; + } else { + trace_probe_log_err(tmp - symbol, BAD_ADDR_SUFFIX); + goto parse_error; + } + } + /* TODO: support .init module functions */ ret = traceprobe_split_symbol_offset(symbol, &offset); if (ret || offset < 0 || offset > UINT_MAX) { trace_probe_log_err(0, BAD_PROBE_ADDR); goto parse_error; } + if (is_return) + flags |= TPARG_FL_RETURN; if (kprobe_on_func_entry(NULL, symbol, offset)) flags |= TPARG_FL_FENTRY; if (offset && is_return && !(flags & TPARG_FL_FENTRY)) { @@ -1880,8 +1897,8 @@ static __init void setup_boot_kprobe_events(void) } /* - * Register dynevent at subsys_initcall. This allows kernel to setup kprobe - * events in fs_initcall without tracefs. + * Register dynevent at core_initcall. This allows kernel to setup kprobe + * events in postcore_initcall without tracefs. */ static __init int init_kprobe_trace_early(void) { @@ -1896,19 +1913,19 @@ static __init int init_kprobe_trace_early(void) return 0; } -subsys_initcall(init_kprobe_trace_early); +core_initcall(init_kprobe_trace_early); /* Make a tracefs interface for controlling probe points */ static __init int init_kprobe_trace(void) { - struct dentry *d_tracer; + int ret; struct dentry *entry; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return 0; - entry = tracefs_create_file("kprobe_events", 0644, d_tracer, + entry = tracefs_create_file("kprobe_events", 0644, NULL, NULL, &kprobe_events_ops); /* Event list interface */ @@ -1916,7 +1933,7 @@ static __init int init_kprobe_trace(void) pr_warn("Could not create tracefs 'kprobe_events' entry\n"); /* Profile interface */ - entry = tracefs_create_file("kprobe_profile", 0444, d_tracer, + entry = tracefs_create_file("kprobe_profile", 0444, NULL, NULL, &kprobe_profile_ops); if (!entry) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 4d1893564912..000e9dc224c6 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -497,7 +497,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) trace_find_cmdline(entry->pid, comm); - trace_seq_printf(s, "%8.8s-%-5d %3d", + trace_seq_printf(s, "%8.8s-%-7d %3d", comm, entry->pid, cpu); return trace_print_lat_fmt(s, entry); @@ -588,15 +588,15 @@ int trace_print_context(struct trace_iterator *iter) trace_find_cmdline(entry->pid, comm); - trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); + trace_seq_printf(s, "%16s-%-7d ", comm, entry->pid); if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { unsigned int tgid = trace_find_tgid(entry->pid); if (!tgid) - trace_seq_printf(s, "(-----) "); + trace_seq_printf(s, "(-------) "); else - trace_seq_printf(s, "(%5d) ", tgid); + trace_seq_printf(s, "(%7d) ", tgid); } trace_seq_printf(s, "[%03d] ", iter->cpu); @@ -636,7 +636,7 @@ int trace_print_lat_context(struct trace_iterator *iter) trace_find_cmdline(entry->pid, comm); trace_seq_printf( - s, "%16s %5d %3d %d %08x %08lx ", + s, "%16s %7d %3d %d %08x %08lx ", comm, entry->pid, iter->cpu, entry->flags, entry->preempt_count, iter->idx); } else { @@ -917,7 +917,7 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, S = task_index_to_char(field->prev_state); trace_find_cmdline(field->next_pid, comm); trace_seq_printf(&iter->seq, - " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", + " %7d:%3d:%c %s [%03d] %7d:%3d:%c %s\n", field->prev_pid, field->prev_prio, S, delim, diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index f10073e62603..f4938040c228 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -102,14 +102,14 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { + lockdep_hardirqs_off(CALLER_ADDR0); + if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, caller_addr); if (!in_nmi()) trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); } - - lockdep_hardirqs_off(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_off_caller); NOKPROBE_SYMBOL(trace_hardirqs_off_caller); diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index d4e31e969206..ff32476df072 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -96,7 +96,7 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self, if (val == MODULE_STATE_COMING) hold_module_trace_bprintk_format(start, end); } - return 0; + return NOTIFY_OK; } /* @@ -174,7 +174,7 @@ __init static int module_trace_bprintk_format_notify(struct notifier_block *self, unsigned long val, void *data) { - return 0; + return NOTIFY_OK; } static inline const char ** find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) @@ -367,13 +367,13 @@ static const struct file_operations ftrace_formats_fops = { static __init int init_trace_printk_function_export(void) { - struct dentry *d_tracer; + int ret; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return 0; - trace_create_file("printk_formats", 0444, d_tracer, + trace_create_file("printk_formats", 0444, NULL, NULL, &ftrace_formats_fops); return 0; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index a22b62813f8c..2f703a20c724 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -16,7 +16,6 @@ #include <linux/tracefs.h> #include <linux/types.h> #include <linux/string.h> -#include <linux/ctype.h> #include <linux/ptrace.h> #include <linux/perf_event.h> #include <linux/kprobes.h> @@ -348,18 +347,6 @@ bool trace_probe_match_command_args(struct trace_probe *tp, #define trace_probe_for_each_link_rcu(pos, tp) \ list_for_each_entry_rcu(pos, &(tp)->event->files, list) -/* Check the name is good for event/group/fields */ -static inline bool is_good_name(const char *name) -{ - if (!isalpha(*name) && *name != '_') - return false; - while (*++name != '\0') { - if (!isalpha(*name) && !isdigit(*name) && *name != '_') - return false; - } - return true; -} - #define TPARG_FL_RETURN BIT(0) #define TPARG_FL_KERNEL BIT(1) #define TPARG_FL_FENTRY BIT(2) @@ -404,6 +391,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(MAXACT_TOO_BIG, "Maxactive is too big"), \ C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \ C(BAD_RETPROBE, "Retprobe address must be an function entry"), \ + C(BAD_ADDR_SUFFIX, "Invalid probed address suffix"), \ C(NO_GROUP_NAME, "Group name is not specified"), \ C(GROUP_TOO_LONG, "Group name is too long"), \ C(BAD_GROUP_NAME, "Group name must follow the same rules as C identifiers"), \ diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 98bba4764c52..c408423e5d65 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -554,20 +554,20 @@ __setup("stacktrace", enable_stacktrace); static __init int stack_trace_init(void) { - struct dentry *d_tracer; + int ret; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return 0; - trace_create_file("stack_max_size", 0644, d_tracer, + trace_create_file("stack_max_size", 0644, NULL, &stack_trace_max_size, &stack_max_size_fops); - trace_create_file("stack_trace", 0444, d_tracer, + trace_create_file("stack_trace", 0444, NULL, NULL, &stack_trace_fops); #ifdef CONFIG_DYNAMIC_FTRACE - trace_create_file("stack_trace_filter", 0644, d_tracer, + trace_create_file("stack_trace_filter", 0644, NULL, &trace_ops, &stack_trace_filter_fops); #endif diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index d1fa19773cc8..8d141c3825a9 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -276,13 +276,13 @@ static const struct file_operations tracing_stat_fops = { static int tracing_stat_init(void) { - struct dentry *d_tracing; + int ret; - d_tracing = tracing_init_dentry(); - if (IS_ERR(d_tracing)) + ret = tracing_init_dentry(); + if (ret) return -ENODEV; - stat_dir = tracefs_create_dir("trace_stat", d_tracing); + stat_dir = tracefs_create_dir("trace_stat", NULL); if (!stat_dir) { pr_warn("Could not create tracefs 'trace_stat' entry\n"); return -ENOMEM; diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h index ac35c45207c4..6e146b959dcd 100644 --- a/kernel/trace/trace_synth.h +++ b/kernel/trace/trace_synth.h @@ -7,7 +7,7 @@ #define SYNTH_SYSTEM "synthetic" #define SYNTH_FIELDS_MAX 32 -#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ +#define STR_VAR_LEN_MAX MAX_FILTER_STR_VAL /* must be multiple of sizeof(u64) */ struct synth_field { char *type; @@ -16,6 +16,8 @@ struct synth_field { unsigned int offset; bool is_signed; bool is_string; + bool is_dynamic; + bool field_pos; }; struct synth_event { @@ -24,6 +26,8 @@ struct synth_event { char *name; struct synth_field **fields; unsigned int n_fields; + struct synth_field **dynamic_fields; + unsigned int n_dynamic_fields; unsigned int n_u64; struct trace_event_class class; struct trace_event_call call; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f4286c9bdeb4..3cf7128e1ad3 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -528,7 +528,7 @@ end: /* * Argument syntax: - * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] + * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET[%return][(REF)] [FETCHARGS] */ static int trace_uprobe_create(int argc, const char **argv) { @@ -617,6 +617,19 @@ static int trace_uprobe_create(int argc, const char **argv) } } + /* Check if there is %return suffix */ + tmp = strchr(arg, '%'); + if (tmp) { + if (!strcmp(tmp, "%return")) { + *tmp = '\0'; + is_return = true; + } else { + trace_probe_log_err(tmp - filename, BAD_ADDR_SUFFIX); + ret = -EINVAL; + goto fail_address_parse; + } + } + /* Parse uprobe offset. */ ret = kstrtoul(arg, 0, &offset); if (ret) { @@ -1625,21 +1638,20 @@ void destroy_local_trace_uprobe(struct trace_event_call *event_call) /* Make a trace interface for controling probe points */ static __init int init_uprobe_trace(void) { - struct dentry *d_tracer; int ret; ret = dyn_event_register(&trace_uprobe_ops); if (ret) return ret; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return 0; - trace_create_file("uprobe_events", 0644, d_tracer, + trace_create_file("uprobe_events", 0644, NULL, NULL, &uprobe_events_ops); /* Profile interface */ - trace_create_file("uprobe_profile", 0444, d_tracer, + trace_create_file("uprobe_profile", 0444, NULL, NULL, &uprobe_profile_ops); return 0; } diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 74738c9856f1..4b50fc0cb12c 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -260,7 +260,7 @@ int tracing_map_add_var(struct tracing_map *map) * to use cmp_fn. * * A key can be a subset of a compound key; for that purpose, the - * offset param is used to describe where within the the compound key + * offset param is used to describe where within the compound key * the key referenced by this key field resides. * * Return: The index identifying the field in the map and associated diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 73956eaff8a9..3f659f855074 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -50,7 +50,7 @@ static bool ok_to_free_tracepoints; */ struct tp_probes { struct rcu_head rcu; - struct tracepoint_func probes[0]; + struct tracepoint_func probes[]; }; static inline void *allocate_probes(int count) @@ -221,6 +221,29 @@ static void *func_remove(struct tracepoint_func **funcs, return old; } +static void tracepoint_update_call(struct tracepoint *tp, struct tracepoint_func *tp_funcs, bool sync) +{ + void *func = tp->iterator; + + /* Synthetic events do not have static call sites */ + if (!tp->static_call_key) + return; + + if (!tp_funcs[1].func) { + func = tp_funcs[0].func; + /* + * If going from the iterator back to a single caller, + * we need to synchronize with __DO_TRACE to make sure + * that the data passed to the callback is the one that + * belongs to that callback. + */ + if (sync) + tracepoint_synchronize_unregister(); + } + + __static_call_update(tp->static_call_key, tp->static_call_tramp, func); +} + /* * Add the probe function to a tracepoint. */ @@ -251,8 +274,9 @@ static int tracepoint_add_func(struct tracepoint *tp, * include/linux/tracepoint.h using rcu_dereference_sched(). */ rcu_assign_pointer(tp->funcs, tp_funcs); - if (!static_key_enabled(&tp->key)) - static_key_slow_inc(&tp->key); + tracepoint_update_call(tp, tp_funcs, false); + static_key_enable(&tp->key); + release_probes(old); return 0; } @@ -281,10 +305,13 @@ static int tracepoint_remove_func(struct tracepoint *tp, if (tp->unregfunc && static_key_enabled(&tp->key)) tp->unregfunc(); - if (static_key_enabled(&tp->key)) - static_key_slow_dec(&tp->key); + static_key_disable(&tp->key); + rcu_assign_pointer(tp->funcs, tp_funcs); + } else { + rcu_assign_pointer(tp->funcs, tp_funcs); + tracepoint_update_call(tp, tp_funcs, + tp_funcs[0].func != old[0].func); } - rcu_assign_pointer(tp->funcs, tp_funcs); release_probes(old); return 0; } @@ -521,7 +548,7 @@ static int tracepoint_module_notify(struct notifier_block *self, case MODULE_STATE_UNFORMED: break; } - return ret; + return notifier_from_errno(ret); } static struct notifier_block tracepoint_module_nb = { diff --git a/kernel/umh.c b/kernel/umh.c index fcf3ee803630..3f646613a9d3 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -14,6 +14,7 @@ #include <linux/cred.h> #include <linux/file.h> #include <linux/fdtable.h> +#include <linux/fs_struct.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/mount.h> @@ -72,6 +73,14 @@ static int call_usermodehelper_exec_async(void *data) spin_unlock_irq(¤t->sighand->siglock); /* + * Initial kernel threads share ther FS with init, in order to + * get the init root directory. But we've now created a new + * thread that is going to execve a user process and has its own + * 'struct fs_struct'. Reset umask to the default. + */ + current->fs->umask = 0022; + + /* * Our parent (unbound workqueue) runs with elevated scheduling * priority. Avoid propagating that into the userspace child. */ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 87804e0371fe..e703d5d9cbe8 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -515,7 +515,7 @@ EXPORT_SYMBOL(from_kgid_munged); * * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. Callers are expected to test - * for and handle handle INVALID_PROJID being returned. INVALID_PROJID + * for and handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index f74020f6bd9d..0ef8f65bd2d7 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -393,6 +393,7 @@ static void free_watch(struct rcu_head *rcu) struct watch *watch = container_of(rcu, struct watch, rcu); put_watch_queue(rcu_access_pointer(watch->queue)); + atomic_dec(&watch->cred->user->nr_watches); put_cred(watch->cred); } @@ -452,6 +453,13 @@ int add_watch_to_object(struct watch *watch, struct watch_list *wlist) watch->cred = get_current_cred(); rcu_assign_pointer(watch->watch_list, wlist); + if (atomic_inc_return(&watch->cred->user->nr_watches) > + task_rlimit(current, RLIMIT_NOFILE)) { + atomic_dec(&watch->cred->user->nr_watches); + put_cred(watch->cred); + return -EAGAIN; + } + spin_lock_bh(&wqueue->lock); kref_get(&wqueue->usage); kref_get(&watch->usage); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c41c3c17b86a..437935e7a199 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -427,7 +427,7 @@ static void show_pwq(struct pool_workqueue *pwq); #ifdef CONFIG_DEBUG_OBJECTS_WORK -static struct debug_obj_descr work_debug_descr; +static const struct debug_obj_descr work_debug_descr; static void *work_debug_hint(void *addr) { @@ -477,7 +477,7 @@ static bool work_fixup_free(void *addr, enum debug_obj_state state) } } -static struct debug_obj_descr work_debug_descr = { +static const struct debug_obj_descr work_debug_descr = { .name = "work_struct", .debug_hint = work_debug_hint, .is_static_object = work_is_static_object, @@ -1212,11 +1212,14 @@ out_put: * stable state - idle, on timer or on worklist. * * Return: + * + * ======== ================================================================ * 1 if @work was pending and we successfully stole PENDING * 0 if @work was idle and we claimed PENDING * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry * -ENOENT if someone else is canceling @work, this state may persist * for arbitrarily long + * ======== ================================================================ * * Note: * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting |