diff options
Diffstat (limited to 'kernel')
221 files changed, 23502 insertions, 9442 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 4cb4130ced32..bdeb77e27042 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -12,6 +12,7 @@ obj-y = fork.o exec_domain.o panic.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o +obj-$(CONFIG_BPFILTER) += usermode_driver.o obj-$(CONFIG_MODULES) += kmod.o obj-$(CONFIG_MULTIUSER) += groups.o @@ -23,6 +24,9 @@ endif # Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip() # in coverage traces. KCOV_INSTRUMENT_softirq.o := n +# Avoid KCSAN instrumentation in softirq ("No shared variables, all the data +# are CPU local" => assume no data races), to reduce overhead in interrupts. +KCSAN_SANITIZE_softirq.o = n # These are called from save_stack_trace() on slub debug path, # and produce insane amounts of uninteresting coverage. KCOV_INSTRUMENT_module.o := n @@ -31,6 +35,7 @@ KCOV_INSTRUMENT_stacktrace.o := n # Don't self-instrument. KCOV_INSTRUMENT_kcov.o := n KASAN_SANITIZE_kcov.o := n +KCSAN_SANITIZE_kcov.o := n CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) # cond_syscall is currently not LTO compatible @@ -44,6 +49,7 @@ obj-y += irq/ obj-y += rcu/ obj-y += livepatch/ obj-y += dma/ +obj-y += entry/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o @@ -103,6 +109,8 @@ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_BPF) += bpf/ +obj-$(CONFIG_KCSAN) += kcsan/ +obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_PERF_EVENTS) += events/ @@ -115,11 +123,14 @@ obj-$(CONFIG_TORTURE_TEST) += torture.o obj-$(CONFIG_HAS_IOMEM) += iomem.o obj-$(CONFIG_RSEQ) += rseq.o +obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o +CFLAGS_stackleak.o += $(DISABLE_STACKLEAK_PLUGIN) obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o KASAN_SANITIZE_stackleak.o := n +KCSAN_SANITIZE_stackleak.o := n KCOV_INSTRUMENT_stackleak.o := n $(obj)/configs.o: $(obj)/config_data.gz diff --git a/kernel/acct.c b/kernel/acct.c index 11ff4a596d6b..b0c5b3a9f5af 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -40,7 +40,7 @@ * is one more bug... 10/11/98, AV. * * Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold - * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks + * ->mmap_lock to walk the vma list of current->mm. Nasty, since it leaks * a struct file opened for write. Fixed. 2/6/2000, AV. */ @@ -541,13 +541,13 @@ void acct_collect(long exitcode, int group_dead) if (group_dead && current->mm) { struct vm_area_struct *vma; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); vma = current->mm->mmap; while (vma) { vsize += vma->vm_end - vma->vm_start; vma = vma->vm_next; } - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); } spin_lock_irq(¤t->sighand->siglock); diff --git a/kernel/async.c b/kernel/async.c index 4f9c1d614016..33258e6e20f8 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -111,7 +111,7 @@ static void async_run_entry_fn(struct work_struct *work) struct async_entry *entry = container_of(work, struct async_entry, work); unsigned long flags; - ktime_t uninitialized_var(calltime), delta, rettime; + ktime_t calltime, delta, rettime; /* 1) run (and print duration) */ if (initcall_debug && system_state < SYSTEM_RUNNING) { @@ -287,7 +287,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); */ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain) { - ktime_t uninitialized_var(starttime), delta, endtime; + ktime_t starttime, delta, endtime; if (initcall_debug && system_state < SYSTEM_RUNNING) { pr_debug("async_waiting @ %i\n", task_pid_nr(current)); diff --git a/kernel/audit.c b/kernel/audit.c index 87f31bf1f0a0..7efaece534a9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -136,6 +136,11 @@ u32 audit_sig_sid = 0; */ static atomic_t audit_lost = ATOMIC_INIT(0); +/* Monotonically increasing sum of time the kernel has spent + * waiting while the backlog limit is exceeded. + */ +static atomic_t audit_backlog_wait_time_actual = ATOMIC_INIT(0); + /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; @@ -880,7 +885,7 @@ main_queue: return 0; } -int audit_send_list(void *_dest) +int audit_send_list_thread(void *_dest) { struct audit_netlink_list *dest = _dest; struct sk_buff *skb; @@ -924,19 +929,30 @@ out_kfree_skb: return NULL; } +static void audit_free_reply(struct audit_reply *reply) +{ + if (!reply) + return; + + if (reply->skb) + kfree_skb(reply->skb); + if (reply->net) + put_net(reply->net); + kfree(reply); +} + static int audit_send_reply_thread(void *arg) { struct audit_reply *reply = (struct audit_reply *)arg; - struct sock *sk = audit_get_sk(reply->net); audit_ctl_lock(); audit_ctl_unlock(); /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. */ - netlink_unicast(sk, reply->skb, reply->portid, 0); - put_net(reply->net); - kfree(reply); + netlink_unicast(audit_get_sk(reply->net), reply->skb, reply->portid, 0); + reply->skb = NULL; + audit_free_reply(reply); return 0; } @@ -950,35 +966,32 @@ static int audit_send_reply_thread(void *arg) * @payload: payload data * @size: payload size * - * Allocates an skb, builds the netlink message, and sends it to the port id. - * No failure notifications. + * Allocates a skb, builds the netlink message, and sends it to the port id. */ static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done, int multi, const void *payload, int size) { - struct net *net = sock_net(NETLINK_CB(request_skb).sk); - struct sk_buff *skb; struct task_struct *tsk; - struct audit_reply *reply = kmalloc(sizeof(struct audit_reply), - GFP_KERNEL); + struct audit_reply *reply; + reply = kzalloc(sizeof(*reply), GFP_KERNEL); if (!reply) return; - skb = audit_make_reply(seq, type, done, multi, payload, size); - if (!skb) - goto out; - - reply->net = get_net(net); + reply->skb = audit_make_reply(seq, type, done, multi, payload, size); + if (!reply->skb) + goto err; + reply->net = get_net(sock_net(NETLINK_CB(request_skb).sk)); reply->portid = NETLINK_CB(request_skb).portid; - reply->skb = skb; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); - if (!IS_ERR(tsk)) - return; - kfree_skb(skb); -out: - kfree(reply); + if (IS_ERR(tsk)) + goto err; + + return; + +err: + audit_free_reply(reply); } /* @@ -1193,17 +1206,18 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_GET: { struct audit_status s; memset(&s, 0, sizeof(s)); - s.enabled = audit_enabled; - s.failure = audit_failure; + s.enabled = audit_enabled; + s.failure = audit_failure; /* NOTE: use pid_vnr() so the PID is relative to the current * namespace */ - s.pid = auditd_pid_vnr(); - s.rate_limit = audit_rate_limit; - s.backlog_limit = audit_backlog_limit; - s.lost = atomic_read(&audit_lost); - s.backlog = skb_queue_len(&audit_queue); - s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; - s.backlog_wait_time = audit_backlog_wait_time; + s.pid = auditd_pid_vnr(); + s.rate_limit = audit_rate_limit; + s.backlog_limit = audit_backlog_limit; + s.lost = atomic_read(&audit_lost); + s.backlog = skb_queue_len(&audit_queue); + s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; + s.backlog_wait_time = audit_backlog_wait_time; + s.backlog_wait_time_actual = atomic_read(&audit_backlog_wait_time_actual); audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } @@ -1307,6 +1321,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_log_config_change("lost", 0, lost, 1); return lost; } + if (s.mask == AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) { + u32 actual = atomic_xchg(&audit_backlog_wait_time_actual, 0); + + audit_log_config_change("backlog_wait_time_actual", 0, actual, 1); + return actual; + } break; } case AUDIT_GET_FEATURE: @@ -1525,20 +1545,60 @@ static void audit_receive(struct sk_buff *skb) audit_ctl_unlock(); } +/* Log information about who is connecting to the audit multicast socket */ +static void audit_log_multicast(int group, const char *op, int err) +{ + const struct cred *cred; + struct tty_struct *tty; + char comm[sizeof(current->comm)]; + struct audit_buffer *ab; + + if (!audit_enabled) + return; + + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_EVENT_LISTENER); + if (!ab) + return; + + cred = current_cred(); + tty = audit_get_tty(); + audit_log_format(ab, "pid=%u uid=%u auid=%u tty=%s ses=%u", + task_pid_nr(current), + from_kuid(&init_user_ns, cred->uid), + from_kuid(&init_user_ns, audit_get_loginuid(current)), + tty ? tty_name(tty) : "(none)", + audit_get_sessionid(current)); + audit_put_tty(tty); + audit_log_task_context(ab); /* subj= */ + audit_log_format(ab, " comm="); + audit_log_untrustedstring(ab, get_task_comm(comm, current)); + audit_log_d_path_exe(ab, current->mm); /* exe= */ + audit_log_format(ab, " nl-mcgrp=%d op=%s res=%d", group, op, !err); + audit_log_end(ab); +} + /* Run custom bind function on netlink socket group connect or bind requests. */ -static int audit_bind(struct net *net, int group) +static int audit_multicast_bind(struct net *net, int group) { + int err = 0; + if (!capable(CAP_AUDIT_READ)) - return -EPERM; + err = -EPERM; + audit_log_multicast(group, "connect", err); + return err; +} - return 0; +static void audit_multicast_unbind(struct net *net, int group) +{ + audit_log_multicast(group, "disconnect", 0); } static int __net_init audit_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = audit_receive, - .bind = audit_bind, + .bind = audit_multicast_bind, + .unbind = audit_multicast_unbind, .flags = NL_CFG_F_NONROOT_RECV, .groups = AUDIT_NLGRP_MAX, }; @@ -1752,7 +1812,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, { struct audit_buffer *ab; struct timespec64 t; - unsigned int uninitialized_var(serial); + unsigned int serial; if (audit_initialized != AUDIT_INITIALIZED) return NULL; @@ -1778,12 +1838,15 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, /* sleep if we are allowed and we haven't exhausted our * backlog wait limit */ if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) { + long rtime = stime; + DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&audit_backlog_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); - stime = schedule_timeout(stime); + stime = schedule_timeout(rtime); + atomic_add(rtime - stime, &audit_backlog_wait_time_actual); remove_wait_queue(&audit_backlog_wait, &wait); } else { if (audit_rate_check() && printk_ratelimit()) @@ -1803,7 +1866,6 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, } audit_get_stamp(ab->ctx, &t, &serial); - audit_clear_dummy(ab->ctx); audit_log_format(ab, "audit(%llu.%03lu:%u): ", (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial); @@ -2032,13 +2094,13 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, /* We will allow 11 spaces for ' (deleted)' to be appended */ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); if (!pathname) { - audit_log_string(ab, "<no_memory>"); + audit_log_format(ab, "\"<no_memory>\""); return; } p = d_path(path, pathname, PATH_MAX+11); if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ /* FIXME: can we save some information here? */ - audit_log_string(ab, "<too_long>"); + audit_log_format(ab, "\"<too_long>\""); } else audit_log_untrustedstring(ab, p); kfree(pathname); diff --git a/kernel/audit.h b/kernel/audit.h index 2eed4d231624..ddc22878433d 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -229,7 +229,7 @@ struct audit_netlink_list { struct sk_buff_head q; }; -int audit_send_list(void *_dest); +int audit_send_list_thread(void *_dest); extern int selinux_audit_rule_update(void); @@ -290,13 +290,6 @@ extern int audit_signal_info_syscall(struct task_struct *t); extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx); extern struct list_head *audit_killed_trees(void); - -static inline void audit_clear_dummy(struct audit_context *ctx) -{ - if (ctx) - ctx->dummy = 0; -} - #else /* CONFIG_AUDITSYSCALL */ #define auditsc_get_stamp(c, t, s) 0 #define audit_put_watch(w) {} @@ -330,7 +323,6 @@ static inline int audit_signal_info_syscall(struct task_struct *t) } #define audit_filter_inodes(t, c) AUDIT_DISABLED -#define audit_clear_dummy(c) {} #endif /* CONFIG_AUDITSYSCALL */ extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index e49c912f862d..1b7a2f041793 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -188,11 +188,9 @@ static struct fsnotify_mark *alloc_mark(void) static struct audit_chunk *alloc_chunk(int count) { struct audit_chunk *chunk; - size_t size; int i; - size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node); - chunk = kzalloc(size, GFP_KERNEL); + chunk = kzalloc(struct_size(chunk, owners, count), GFP_KERNEL); if (!chunk) return NULL; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 026e34da4ace..a10e2997aa6c 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1161,11 +1161,8 @@ int audit_rule_change(int type, int seq, void *data, size_t datasz) */ int audit_list_rules_send(struct sk_buff *request_skb, int seq) { - u32 portid = NETLINK_CB(request_skb).portid; - struct net *net = sock_net(NETLINK_CB(request_skb).sk); struct task_struct *tsk; struct audit_netlink_list *dest; - int err = 0; /* We can't just spew out the rules here because we might fill * the available socket buffer space and deadlock waiting for @@ -1173,25 +1170,26 @@ int audit_list_rules_send(struct sk_buff *request_skb, int seq) * happen if we're actually running in the context of auditctl * trying to _send_ the stuff */ - dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); + dest = kmalloc(sizeof(*dest), GFP_KERNEL); if (!dest) return -ENOMEM; - dest->net = get_net(net); - dest->portid = portid; + dest->net = get_net(sock_net(NETLINK_CB(request_skb).sk)); + dest->portid = NETLINK_CB(request_skb).portid; skb_queue_head_init(&dest->q); mutex_lock(&audit_filter_mutex); audit_list_rules(seq, &dest->q); mutex_unlock(&audit_filter_mutex); - tsk = kthread_run(audit_send_list, dest, "audit_send_list"); + tsk = kthread_run(audit_send_list_thread, dest, "audit_send_list"); if (IS_ERR(tsk)) { skb_queue_purge(&dest->q); + put_net(dest->net); kfree(dest); - err = PTR_ERR(tsk); + return PTR_ERR(tsk); } - return err; + return 0; } int audit_comparator(u32 left, u32 op, u32 right) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 814406a35db1..8dba8f0983b5 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -75,6 +75,7 @@ #include <linux/uaccess.h> #include <linux/fsnotify_backend.h> #include <uapi/linux/limits.h> +#include <uapi/linux/netfilter/nf_tables.h> #include "audit.h" @@ -130,6 +131,34 @@ struct audit_tree_refs { struct audit_chunk *c[31]; }; +struct audit_nfcfgop_tab { + enum audit_nfcfgop op; + const char *s; +}; + +static const struct audit_nfcfgop_tab audit_nfcfgs[] = { + { AUDIT_XT_OP_REGISTER, "xt_register" }, + { AUDIT_XT_OP_REPLACE, "xt_replace" }, + { AUDIT_XT_OP_UNREGISTER, "xt_unregister" }, + { AUDIT_NFT_OP_TABLE_REGISTER, "nft_register_table" }, + { AUDIT_NFT_OP_TABLE_UNREGISTER, "nft_unregister_table" }, + { AUDIT_NFT_OP_CHAIN_REGISTER, "nft_register_chain" }, + { AUDIT_NFT_OP_CHAIN_UNREGISTER, "nft_unregister_chain" }, + { AUDIT_NFT_OP_RULE_REGISTER, "nft_register_rule" }, + { AUDIT_NFT_OP_RULE_UNREGISTER, "nft_unregister_rule" }, + { AUDIT_NFT_OP_SET_REGISTER, "nft_register_set" }, + { AUDIT_NFT_OP_SET_UNREGISTER, "nft_unregister_set" }, + { AUDIT_NFT_OP_SETELEM_REGISTER, "nft_register_setelem" }, + { AUDIT_NFT_OP_SETELEM_UNREGISTER, "nft_unregister_setelem" }, + { AUDIT_NFT_OP_GEN_REGISTER, "nft_register_gen" }, + { AUDIT_NFT_OP_OBJ_REGISTER, "nft_register_obj" }, + { AUDIT_NFT_OP_OBJ_UNREGISTER, "nft_unregister_obj" }, + { AUDIT_NFT_OP_OBJ_RESET, "nft_reset_obj" }, + { AUDIT_NFT_OP_FLOWTABLE_REGISTER, "nft_register_flowtable" }, + { AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, "nft_unregister_flowtable" }, + { AUDIT_NFT_OP_INVALID, "nft_invalid" }, +}; + static int audit_match_perm(struct audit_context *ctx, int mask) { unsigned n; @@ -1406,6 +1435,9 @@ static void audit_log_proctitle(void) struct audit_context *context = audit_context(); struct audit_buffer *ab; + if (!context || context->dummy) + return; + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE); if (!ab) return; /* audit_panic or being filtered */ @@ -1862,6 +1894,20 @@ __audit_reusename(const __user char *uptr) return NULL; } +inline void _audit_getcwd(struct audit_context *context) +{ + if (!context->pwd.dentry) + get_fs_pwd(current->fs, &context->pwd); +} + +void __audit_getcwd(void) +{ + struct audit_context *context = audit_context(); + + if (context->in_syscall) + _audit_getcwd(context); +} + /** * __audit_getname - add a name to the list * @name: name to add @@ -1886,8 +1932,7 @@ void __audit_getname(struct filename *name) name->aname = n; name->refcnt++; - if (!context->pwd.dentry) - get_fs_pwd(current->fs, &context->pwd); + _audit_getcwd(context); } static inline int audit_copy_fcaps(struct audit_names *name, @@ -2542,6 +2587,26 @@ void __audit_ntp_log(const struct audit_ntp_data *ad) audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST); } +void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, + enum audit_nfcfgop op, gfp_t gfp) +{ + struct audit_buffer *ab; + char comm[sizeof(current->comm)]; + + ab = audit_log_start(audit_context(), gfp, AUDIT_NETFILTER_CFG); + if (!ab) + return; + audit_log_format(ab, "table=%s family=%u entries=%u op=%s", + name, af, nentries, audit_nfcfgs[op].s); + + audit_log_format(ab, " pid=%u", task_pid_nr(current)); + audit_log_task_context(ab); /* subj= */ + audit_log_format(ab, " comm="); + audit_log_untrustedstring(ab, get_task_comm(comm, current)); + audit_log_end(ab); +} +EXPORT_SYMBOL_GPL(__audit_log_nfcfg); + static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index a2a97fa3071b..370217dd7e39 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -29,7 +29,7 @@ static void backtrace_test_irq_callback(unsigned long data) complete(&backtrace_work); } -static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0); +static DECLARE_TASKLET_OLD(backtrace_tasklet, &backtrace_test_irq_callback); static void backtrace_test_irq(void) { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index f2d7be596966..e6eb9c0402da 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,9 +2,9 @@ obj-y := core.o CFLAGS_core.o += $(call cc-disable-warning, override-init) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o @@ -12,10 +12,8 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o -ifeq ($(CONFIG_XDP_SOCKETS),y) -obj-$(CONFIG_BPF_SYSCALL) += xskmap.o -endif obj-$(CONFIG_BPF_SYSCALL) += offload.o +obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1d6120fd5ba6..8ff419b632a6 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -77,7 +77,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int ret, numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; - bool unpriv = !capable(CAP_SYS_ADMIN); + bool bypass_spec_v1 = bpf_bypass_spec_v1(); u64 cost, array_size, mask64; struct bpf_map_memory mem; struct bpf_array *array; @@ -95,7 +95,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) mask64 -= 1; index_mask = mask64; - if (unpriv) { + if (!bypass_spec_v1) { /* round up array size to nearest power of 2, * since cpu will speculate within index_mask limits */ @@ -149,7 +149,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); } array->index_mask = index_mask; - array->map.unpriv_array = unpriv; + array->map.bypass_spec_v1 = bypass_spec_v1; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); @@ -219,7 +219,7 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - if (map->unpriv_array) { + if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { @@ -386,13 +386,6 @@ static void array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); - /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, - * so the programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding programs to complete - * and free the array - */ - synchronize_rcu(); - if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) bpf_array_free_percpu(array); @@ -494,6 +487,143 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) vma->vm_pgoff + pgoff); } +struct bpf_iter_seq_array_map_info { + struct bpf_map *map; + void *percpu_value_buf; + u32 index; +}; + +static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_array_map_info *info = seq->private; + struct bpf_map *map = info->map; + struct bpf_array *array; + u32 index; + + if (info->index >= map->max_entries) + return NULL; + + if (*pos == 0) + ++*pos; + array = container_of(map, struct bpf_array, map); + index = info->index & array->index_mask; + if (info->percpu_value_buf) + return array->pptrs[index]; + return array->value + array->elem_size * index; +} + +static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_array_map_info *info = seq->private; + struct bpf_map *map = info->map; + struct bpf_array *array; + u32 index; + + ++*pos; + ++info->index; + if (info->index >= map->max_entries) + return NULL; + + array = container_of(map, struct bpf_array, map); + index = info->index & array->index_mask; + if (info->percpu_value_buf) + return array->pptrs[index]; + return array->value + array->elem_size * index; +} + +static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_array_map_info *info = seq->private; + struct bpf_iter__bpf_map_elem ctx = {}; + struct bpf_map *map = info->map; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int off = 0, cpu = 0; + void __percpu **pptr; + u32 size; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, v == NULL); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.map = info->map; + if (v) { + ctx.key = &info->index; + + if (!info->percpu_value_buf) { + ctx.value = v; + } else { + pptr = v; + size = round_up(map->value_size, 8); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(info->percpu_value_buf + off, + per_cpu_ptr(pptr, cpu), + size); + off += size; + } + ctx.value = info->percpu_value_buf; + } + } + + return bpf_iter_run_prog(prog, &ctx); +} + +static int bpf_array_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_array_map_seq_show(seq, v); +} + +static void bpf_array_map_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__bpf_array_map_seq_show(seq, NULL); +} + +static int bpf_iter_init_array_map(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct bpf_iter_seq_array_map_info *seq_info = priv_data; + struct bpf_map *map = aux->map; + void *value_buf; + u32 buf_size; + + if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + buf_size = round_up(map->value_size, 8) * num_possible_cpus(); + value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); + if (!value_buf) + return -ENOMEM; + + seq_info->percpu_value_buf = value_buf; + } + + seq_info->map = map; + return 0; +} + +static void bpf_iter_fini_array_map(void *priv_data) +{ + struct bpf_iter_seq_array_map_info *seq_info = priv_data; + + kfree(seq_info->percpu_value_buf); +} + +static const struct seq_operations bpf_array_map_seq_ops = { + .start = bpf_array_map_seq_start, + .next = bpf_array_map_seq_next, + .stop = bpf_array_map_seq_stop, + .show = bpf_array_map_seq_show, +}; + +static const struct bpf_iter_seq_info iter_seq_info = { + .seq_ops = &bpf_array_map_seq_ops, + .init_seq_private = bpf_iter_init_array_map, + .fini_seq_private = bpf_iter_fini_array_map, + .seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info), +}; + +static int array_map_btf_id; const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, @@ -510,8 +640,12 @@ const struct bpf_map_ops array_map_ops = { .map_check_btf = array_map_check_btf, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, + .map_btf_name = "bpf_array", + .map_btf_id = &array_map_btf_id, + .iter_seq_info = &iter_seq_info, }; +static int percpu_array_map_btf_id; const struct bpf_map_ops percpu_array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, @@ -522,6 +656,9 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_delete_elem = array_map_delete_elem, .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, + .map_btf_name = "bpf_array", + .map_btf_id = &percpu_array_map_btf_id, + .iter_seq_info = &iter_seq_info, }; static int fd_array_map_alloc_check(union bpf_attr *attr) @@ -540,8 +677,6 @@ static void fd_array_map_free(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - synchronize_rcu(); - /* make sure it's empty */ for (i = 0; i < array->map.max_entries; i++) BUG_ON(array->ptrs[i] != NULL); @@ -868,6 +1003,7 @@ static void prog_array_map_free(struct bpf_map *map) fd_array_map_free(map); } +static int prog_array_map_btf_id; const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = prog_array_map_alloc, @@ -883,6 +1019,8 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = prog_array_map_clear, .map_seq_show_elem = prog_array_map_seq_show_elem, + .map_btf_name = "bpf_array", + .map_btf_id = &prog_array_map_btf_id, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, @@ -961,6 +1099,7 @@ static void perf_event_fd_array_release(struct bpf_map *map, rcu_read_unlock(); } +static int perf_event_array_map_btf_id; const struct bpf_map_ops perf_event_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, @@ -972,6 +1111,8 @@ const struct bpf_map_ops perf_event_array_map_ops = { .map_fd_put_ptr = perf_event_fd_array_put_ptr, .map_release = perf_event_fd_array_release, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_array", + .map_btf_id = &perf_event_array_map_btf_id, }; #ifdef CONFIG_CGROUPS @@ -994,6 +1135,7 @@ static void cgroup_fd_array_free(struct bpf_map *map) fd_array_map_free(map); } +static int cgroup_array_map_btf_id; const struct bpf_map_ops cgroup_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, @@ -1004,6 +1146,8 @@ const struct bpf_map_ops cgroup_array_map_ops = { .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_array", + .map_btf_id = &cgroup_array_map_btf_id, }; #endif @@ -1058,7 +1202,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - if (map->unpriv_array) { + if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { @@ -1077,6 +1221,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, return insn - insn_buf; } +static int array_of_maps_map_btf_id; const struct bpf_map_ops array_of_maps_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, @@ -1089,4 +1234,6 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = array_of_map_gen_lookup, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_array", + .map_btf_id = &array_of_maps_map_btf_id, }; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c new file mode 100644 index 000000000000..363b9cafc2d8 --- /dev/null +++ b/kernel/bpf/bpf_iter.c @@ -0,0 +1,592 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ + +#include <linux/fs.h> +#include <linux/anon_inodes.h> +#include <linux/filter.h> +#include <linux/bpf.h> + +struct bpf_iter_target_info { + struct list_head list; + const struct bpf_iter_reg *reg_info; + u32 btf_id; /* cached value */ +}; + +struct bpf_iter_link { + struct bpf_link link; + struct bpf_iter_aux_info aux; + struct bpf_iter_target_info *tinfo; +}; + +struct bpf_iter_priv_data { + struct bpf_iter_target_info *tinfo; + const struct bpf_iter_seq_info *seq_info; + struct bpf_prog *prog; + u64 session_id; + u64 seq_num; + bool done_stop; + u8 target_private[] __aligned(8); +}; + +static struct list_head targets = LIST_HEAD_INIT(targets); +static DEFINE_MUTEX(targets_mutex); + +/* protect bpf_iter_link changes */ +static DEFINE_MUTEX(link_mutex); + +/* incremented on every opened seq_file */ +static atomic64_t session_id; + +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, + const struct bpf_iter_seq_info *seq_info); + +static void bpf_iter_inc_seq_num(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + iter_priv->seq_num++; +} + +static void bpf_iter_dec_seq_num(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + iter_priv->seq_num--; +} + +static void bpf_iter_done_stop(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + iter_priv->done_stop = true; +} + +/* bpf_seq_read, a customized and simpler version for bpf iterator. + * no_llseek is assumed for this file. + * The following are differences from seq_read(): + * . fixed buffer size (PAGE_SIZE) + * . assuming no_llseek + * . stop() may call bpf program, handling potential overflow there + */ +static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, + loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + size_t n, offs, copied = 0; + int err = 0; + void *p; + + mutex_lock(&seq->lock); + + if (!seq->buf) { + seq->size = PAGE_SIZE; + seq->buf = kmalloc(seq->size, GFP_KERNEL); + if (!seq->buf) { + err = -ENOMEM; + goto done; + } + } + + if (seq->count) { + n = min(seq->count, size); + err = copy_to_user(buf, seq->buf + seq->from, n); + if (err) { + err = -EFAULT; + goto done; + } + seq->count -= n; + seq->from += n; + copied = n; + goto done; + } + + seq->from = 0; + p = seq->op->start(seq, &seq->index); + if (!p) + goto stop; + if (IS_ERR(p)) { + err = PTR_ERR(p); + seq->op->stop(seq, p); + seq->count = 0; + goto done; + } + + err = seq->op->show(seq, p); + if (err > 0) { + /* object is skipped, decrease seq_num, so next + * valid object can reuse the same seq_num. + */ + bpf_iter_dec_seq_num(seq); + seq->count = 0; + } else if (err < 0 || seq_has_overflowed(seq)) { + if (!err) + err = -E2BIG; + seq->op->stop(seq, p); + seq->count = 0; + goto done; + } + + while (1) { + loff_t pos = seq->index; + + offs = seq->count; + p = seq->op->next(seq, p, &seq->index); + if (pos == seq->index) { + pr_info_ratelimited("buggy seq_file .next function %ps " + "did not updated position index\n", + seq->op->next); + seq->index++; + } + + if (IS_ERR_OR_NULL(p)) + break; + + /* got a valid next object, increase seq_num */ + bpf_iter_inc_seq_num(seq); + + if (seq->count >= size) + break; + + err = seq->op->show(seq, p); + if (err > 0) { + bpf_iter_dec_seq_num(seq); + seq->count = offs; + } else if (err < 0 || seq_has_overflowed(seq)) { + seq->count = offs; + if (offs == 0) { + if (!err) + err = -E2BIG; + seq->op->stop(seq, p); + goto done; + } + break; + } + } +stop: + offs = seq->count; + /* bpf program called if !p */ + seq->op->stop(seq, p); + if (!p) { + if (!seq_has_overflowed(seq)) { + bpf_iter_done_stop(seq); + } else { + seq->count = offs; + if (offs == 0) { + err = -E2BIG; + goto done; + } + } + } + + n = min(seq->count, size); + err = copy_to_user(buf, seq->buf, n); + if (err) { + err = -EFAULT; + goto done; + } + copied = n; + seq->count -= n; + seq->from = n; +done: + if (!copied) + copied = err; + else + *ppos += copied; + mutex_unlock(&seq->lock); + return copied; +} + +static const struct bpf_iter_seq_info * +__get_seq_info(struct bpf_iter_link *link) +{ + const struct bpf_iter_seq_info *seq_info; + + if (link->aux.map) { + seq_info = link->aux.map->ops->iter_seq_info; + if (seq_info) + return seq_info; + } + + return link->tinfo->reg_info->seq_info; +} + +static int iter_open(struct inode *inode, struct file *file) +{ + struct bpf_iter_link *link = inode->i_private; + + return prepare_seq_file(file, link, __get_seq_info(link)); +} + +static int iter_release(struct inode *inode, struct file *file) +{ + struct bpf_iter_priv_data *iter_priv; + struct seq_file *seq; + + seq = file->private_data; + if (!seq) + return 0; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + + if (iter_priv->seq_info->fini_seq_private) + iter_priv->seq_info->fini_seq_private(seq->private); + + bpf_prog_put(iter_priv->prog); + seq->private = iter_priv; + + return seq_release_private(inode, file); +} + +const struct file_operations bpf_iter_fops = { + .open = iter_open, + .llseek = no_llseek, + .read = bpf_seq_read, + .release = iter_release, +}; + +/* The argument reg_info will be cached in bpf_iter_target_info. + * The common practice is to declare target reg_info as + * a const static variable and passed as an argument to + * bpf_iter_reg_target(). + */ +int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) +{ + struct bpf_iter_target_info *tinfo; + + tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + if (!tinfo) + return -ENOMEM; + + tinfo->reg_info = reg_info; + INIT_LIST_HEAD(&tinfo->list); + + mutex_lock(&targets_mutex); + list_add(&tinfo->list, &targets); + mutex_unlock(&targets_mutex); + + return 0; +} + +void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info) +{ + struct bpf_iter_target_info *tinfo; + bool found = false; + + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (reg_info == tinfo->reg_info) { + list_del(&tinfo->list); + kfree(tinfo); + found = true; + break; + } + } + mutex_unlock(&targets_mutex); + + WARN_ON(found == false); +} + +static void cache_btf_id(struct bpf_iter_target_info *tinfo, + struct bpf_prog *prog) +{ + tinfo->btf_id = prog->aux->attach_btf_id; +} + +bool bpf_iter_prog_supported(struct bpf_prog *prog) +{ + const char *attach_fname = prog->aux->attach_func_name; + u32 prog_btf_id = prog->aux->attach_btf_id; + const char *prefix = BPF_ITER_FUNC_PREFIX; + struct bpf_iter_target_info *tinfo; + int prefix_len = strlen(prefix); + bool supported = false; + + if (strncmp(attach_fname, prefix, prefix_len)) + return false; + + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) { + supported = true; + break; + } + if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) { + cache_btf_id(tinfo, prog); + supported = true; + break; + } + } + mutex_unlock(&targets_mutex); + + if (supported) { + prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; + prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; + } + + return supported; +} + +static void bpf_iter_link_release(struct bpf_link *link) +{ + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + + if (iter_link->aux.map) + bpf_map_put_with_uref(iter_link->aux.map); +} + +static void bpf_iter_link_dealloc(struct bpf_link *link) +{ + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + + kfree(iter_link); +} + +static int bpf_iter_link_replace(struct bpf_link *link, + struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + int ret = 0; + + mutex_lock(&link_mutex); + if (old_prog && link->prog != old_prog) { + ret = -EPERM; + goto out_unlock; + } + + if (link->prog->type != new_prog->type || + link->prog->expected_attach_type != new_prog->expected_attach_type || + link->prog->aux->attach_btf_id != new_prog->aux->attach_btf_id) { + ret = -EINVAL; + goto out_unlock; + } + + old_prog = xchg(&link->prog, new_prog); + bpf_prog_put(old_prog); + +out_unlock: + mutex_unlock(&link_mutex); + return ret; +} + +static const struct bpf_link_ops bpf_iter_link_lops = { + .release = bpf_iter_link_release, + .dealloc = bpf_iter_link_dealloc, + .update_prog = bpf_iter_link_replace, +}; + +bool bpf_link_is_iter(struct bpf_link *link) +{ + return link->ops == &bpf_iter_link_lops; +} + +int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct bpf_iter_target_info *tinfo; + struct bpf_iter_aux_info aux = {}; + struct bpf_iter_link *link; + u32 prog_btf_id, target_fd; + bool existed = false; + struct bpf_map *map; + int err; + + prog_btf_id = prog->aux->attach_btf_id; + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (tinfo->btf_id == prog_btf_id) { + existed = true; + break; + } + } + mutex_unlock(&targets_mutex); + if (!existed) + return -ENOENT; + + /* Make sure user supplied flags are target expected. */ + target_fd = attr->link_create.target_fd; + if (attr->link_create.flags != tinfo->reg_info->req_linfo) + return -EINVAL; + if (!attr->link_create.flags && target_fd) + return -EINVAL; + + link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); + if (!link) + return -ENOMEM; + + bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog); + link->tinfo = tinfo; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + return err; + } + + if (tinfo->reg_info->req_linfo == BPF_ITER_LINK_MAP_FD) { + map = bpf_map_get_with_uref(target_fd); + if (IS_ERR(map)) { + err = PTR_ERR(map); + goto cleanup_link; + } + + aux.map = map; + err = tinfo->reg_info->check_target(prog, &aux); + if (err) { + bpf_map_put_with_uref(map); + goto cleanup_link; + } + + link->aux.map = map; + } + + return bpf_link_settle(&link_primer); + +cleanup_link: + bpf_link_cleanup(&link_primer); + return err; +} + +static void init_seq_meta(struct bpf_iter_priv_data *priv_data, + struct bpf_iter_target_info *tinfo, + const struct bpf_iter_seq_info *seq_info, + struct bpf_prog *prog) +{ + priv_data->tinfo = tinfo; + priv_data->seq_info = seq_info; + priv_data->prog = prog; + priv_data->session_id = atomic64_inc_return(&session_id); + priv_data->seq_num = 0; + priv_data->done_stop = false; +} + +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, + const struct bpf_iter_seq_info *seq_info) +{ + struct bpf_iter_priv_data *priv_data; + struct bpf_iter_target_info *tinfo; + struct bpf_prog *prog; + u32 total_priv_dsize; + struct seq_file *seq; + int err = 0; + + mutex_lock(&link_mutex); + prog = link->link.prog; + bpf_prog_inc(prog); + mutex_unlock(&link_mutex); + + tinfo = link->tinfo; + total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + + seq_info->seq_priv_size; + priv_data = __seq_open_private(file, seq_info->seq_ops, + total_priv_dsize); + if (!priv_data) { + err = -ENOMEM; + goto release_prog; + } + + if (seq_info->init_seq_private) { + err = seq_info->init_seq_private(priv_data->target_private, &link->aux); + if (err) + goto release_seq_file; + } + + init_seq_meta(priv_data, tinfo, seq_info, prog); + seq = file->private_data; + seq->private = priv_data->target_private; + + return 0; + +release_seq_file: + seq_release_private(file->f_inode, file); + file->private_data = NULL; +release_prog: + bpf_prog_put(prog); + return err; +} + +int bpf_iter_new_fd(struct bpf_link *link) +{ + struct bpf_iter_link *iter_link; + struct file *file; + unsigned int flags; + int err, fd; + + if (link->ops != &bpf_iter_link_lops) + return -EINVAL; + + flags = O_RDONLY | O_CLOEXEC; + fd = get_unused_fd_flags(flags); + if (fd < 0) + return fd; + + file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto free_fd; + } + + iter_link = container_of(link, struct bpf_iter_link, link); + err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link)); + if (err) + goto free_file; + + fd_install(fd, file); + return fd; + +free_file: + fput(file); +free_fd: + put_unused_fd(fd); + return err; +} + +struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop) +{ + struct bpf_iter_priv_data *iter_priv; + struct seq_file *seq; + void *seq_priv; + + seq = meta->seq; + if (seq->file->f_op != &bpf_iter_fops) + return NULL; + + seq_priv = seq->private; + iter_priv = container_of(seq_priv, struct bpf_iter_priv_data, + target_private); + + if (in_stop && iter_priv->done_stop) + return NULL; + + meta->session_id = iter_priv->session_id; + meta->seq_num = iter_priv->seq_num; + + return iter_priv->prog; +} + +int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) +{ + int ret; + + rcu_read_lock(); + migrate_disable(); + ret = BPF_PROG_RUN(prog, ctx); + migrate_enable(); + rcu_read_unlock(); + + /* bpf program can only return 0 or 1: + * 0 : okay + * 1 : retry the same object + * The bpf_iter_run_prog() return value + * will be seq_ops->show() return value. + */ + return ret == 0 ? 0 : -EAGAIN; +} diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 19636703b24e..fb278144e9fd 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -49,6 +49,6 @@ const struct bpf_prog_ops lsm_prog_ops = { }; const struct bpf_verifier_ops lsm_verifier_ops = { - .get_func_proto = bpf_tracing_func_proto, + .get_func_proto = tracing_prog_func_proto, .is_valid_access = btf_ctx_access, }; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 26cb51f2db72..969c5d47f81f 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -557,7 +557,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) struct bpf_map *map; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); @@ -611,6 +611,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) return map; } +static int bpf_struct_ops_map_btf_id; const struct bpf_map_ops bpf_struct_ops_map_ops = { .map_alloc_check = bpf_struct_ops_map_alloc_check, .map_alloc = bpf_struct_ops_map_alloc, @@ -620,6 +621,8 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = { .map_delete_elem = bpf_struct_ops_map_delete_elem, .map_update_elem = bpf_struct_ops_map_update_elem, .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem, + .map_btf_name = "bpf_struct_ops_map", + .map_btf_id = &bpf_struct_ops_map_btf_id, }; /* "const void *" because some subsystem is diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d65c6912bdaf..91afdd4c82e3 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -18,6 +18,7 @@ #include <linux/sort.h> #include <linux/bpf_verifier.h> #include <linux/btf.h> +#include <linux/btf_ids.h> #include <linux/skmsg.h> #include <linux/perf_event.h> #include <net/sock.h> @@ -3482,6 +3483,7 @@ extern char __weak __stop_BTF[]; extern struct btf *btf_vmlinux; #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) static union { struct bpf_ctx_convert { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ @@ -3508,6 +3510,7 @@ static u8 bpf_ctx_convert_map[] = { 0, /* avoid empty array */ }; #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE static const struct btf_member * btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, @@ -3569,6 +3572,41 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, return ctx_type; } +static const struct bpf_map_ops * const btf_vmlinux_map_ops[] = { +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) +#define BPF_LINK_TYPE(_id, _name) +#define BPF_MAP_TYPE(_id, _ops) \ + [_id] = &_ops, +#include <linux/bpf_types.h> +#undef BPF_PROG_TYPE +#undef BPF_LINK_TYPE +#undef BPF_MAP_TYPE +}; + +static int btf_vmlinux_map_ids_init(const struct btf *btf, + struct bpf_verifier_log *log) +{ + const struct bpf_map_ops *ops; + int i, btf_id; + + for (i = 0; i < ARRAY_SIZE(btf_vmlinux_map_ops); ++i) { + ops = btf_vmlinux_map_ops[i]; + if (!ops || (!ops->map_btf_name && !ops->map_btf_id)) + continue; + if (!ops->map_btf_name || !ops->map_btf_id) { + bpf_log(log, "map type %d is misconfigured\n", i); + return -EINVAL; + } + btf_id = btf_find_by_name_kind(btf, ops->map_btf_name, + BTF_KIND_STRUCT); + if (btf_id < 0) + return btf_id; + *ops->map_btf_id = btf_id; + } + + return 0; +} + static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *t, @@ -3584,12 +3622,15 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, return kern_ctx_type->type; } +BTF_ID_LIST(bpf_ctx_convert_btf_id) +BTF_ID(struct, bpf_ctx_convert) + struct btf *btf_parse_vmlinux(void) { struct btf_verifier_env *env = NULL; struct bpf_verifier_log *log; struct btf *btf = NULL; - int err, i; + int err; env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); if (!env) @@ -3622,25 +3663,13 @@ struct btf *btf_parse_vmlinux(void) if (err) goto errout; - /* find struct bpf_ctx_convert for type checking later */ - for (i = 1; i <= btf->nr_types; i++) { - const struct btf_type *t; - const char *tname; + /* btf_parse_vmlinux() runs under bpf_verifier_lock */ + bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]); - t = btf_type_by_id(btf, i); - if (!__btf_type_is_struct(t)) - continue; - tname = __btf_name_by_offset(btf, t->name_off); - if (!strcmp(tname, "bpf_ctx_convert")) { - /* btf_parse_vmlinux() runs under bpf_verifier_lock */ - bpf_ctx_convert.t = t; - break; - } - } - if (i > btf->nr_types) { - err = -ENOENT; + /* find bpf map structs for map_ptr access checking */ + err = btf_vmlinux_map_ids_init(btf, log); + if (err < 0) goto errout; - } bpf_struct_ops_init(btf, log); @@ -3692,7 +3721,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, struct bpf_verifier_log *log = info->log; const struct btf_param *args; u32 nr_args, arg; - int ret; + int i, ret; if (off % 8) { bpf_log(log, "func '%s' offset %d is not multiple of 8\n", @@ -3744,7 +3773,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return false; t = btf_type_skip_modifiers(btf, t->type, NULL); - if (!btf_type_is_int(t)) { + if (!btf_type_is_small_int(t)) { bpf_log(log, "ret type %s not allowed for fmod_ret\n", btf_kind_str[BTF_INFO_KIND(t->info)]); @@ -3766,7 +3795,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_int(t) || btf_type_is_enum(t)) + if (btf_type_is_small_int(t) || btf_type_is_enum(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { @@ -3777,6 +3806,19 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, btf_kind_str[BTF_INFO_KIND(t->info)]); return false; } + + /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */ + for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { + const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; + + if (ctx_arg_info->offset == off && + (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL || + ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) { + info->reg_type = ctx_arg_info->reg_type; + return true; + } + } + if (t->type == 0) /* This is a pointer to void. * It is the same as scalar from the verifier safety pov. @@ -3788,8 +3830,17 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; /* this is a pointer to another type */ - info->reg_type = PTR_TO_BTF_ID; + for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { + const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; + if (ctx_arg_info->offset == off) { + info->reg_type = ctx_arg_info->reg_type; + info->btf_id = ctx_arg_info->btf_id; + return true; + } + } + + info->reg_type = PTR_TO_BTF_ID; if (tgt_prog) { ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); if (ret > 0) { @@ -3828,6 +3879,7 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf_type *mtype, *elem_type = NULL; const struct btf_member *member; const char *tname, *mname; + u32 vlen; again: tname = __btf_name_by_offset(btf_vmlinux, t->name_off); @@ -3836,7 +3888,43 @@ again: return -EINVAL; } + vlen = btf_type_vlen(t); if (off + size > t->size) { + /* If the last element is a variable size array, we may + * need to relax the rule. + */ + struct btf_array *array_elem; + + if (vlen == 0) + goto error; + + member = btf_type_member(t) + vlen - 1; + mtype = btf_type_skip_modifiers(btf_vmlinux, member->type, + NULL); + if (!btf_type_is_array(mtype)) + goto error; + + array_elem = (struct btf_array *)(mtype + 1); + if (array_elem->nelems != 0) + goto error; + + moff = btf_member_bit_offset(t, member) / 8; + if (off < moff) + goto error; + + /* Only allow structure for now, can be relaxed for + * other types later. + */ + elem_type = btf_type_skip_modifiers(btf_vmlinux, + array_elem->type, NULL); + if (!btf_type_is_struct(elem_type)) + goto error; + + off = (off - moff) % elem_type->size; + return btf_struct_access(log, elem_type, off, size, atype, + next_btf_id); + +error: bpf_log(log, "access beyond struct %s at off %u size %u\n", tname, off, size); return -EACCES; @@ -4002,96 +4090,17 @@ again: return -EINVAL; } -static int __btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, - int arg) -{ - char fnname[KSYM_SYMBOL_LEN + 4] = "btf_"; - const struct btf_param *args; - const struct btf_type *t; - const char *tname, *sym; - u32 btf_id, i; - - if (IS_ERR(btf_vmlinux)) { - bpf_log(log, "btf_vmlinux is malformed\n"); - return -EINVAL; - } - - sym = kallsyms_lookup((long)fn, NULL, NULL, NULL, fnname + 4); - if (!sym) { - bpf_log(log, "kernel doesn't have kallsyms\n"); - return -EFAULT; - } - - for (i = 1; i <= btf_vmlinux->nr_types; i++) { - t = btf_type_by_id(btf_vmlinux, i); - if (BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) - continue; - tname = __btf_name_by_offset(btf_vmlinux, t->name_off); - if (!strcmp(tname, fnname)) - break; - } - if (i > btf_vmlinux->nr_types) { - bpf_log(log, "helper %s type is not found\n", fnname); - return -ENOENT; - } - - t = btf_type_by_id(btf_vmlinux, t->type); - if (!btf_type_is_ptr(t)) - return -EFAULT; - t = btf_type_by_id(btf_vmlinux, t->type); - if (!btf_type_is_func_proto(t)) - return -EFAULT; - - args = (const struct btf_param *)(t + 1); - if (arg >= btf_type_vlen(t)) { - bpf_log(log, "bpf helper %s doesn't have %d-th argument\n", - fnname, arg); - return -EINVAL; - } - - t = btf_type_by_id(btf_vmlinux, args[arg].type); - if (!btf_type_is_ptr(t) || !t->type) { - /* anything but the pointer to struct is a helper config bug */ - bpf_log(log, "ARG_PTR_TO_BTF is misconfigured\n"); - return -EFAULT; - } - btf_id = t->type; - t = btf_type_by_id(btf_vmlinux, t->type); - /* skip modifiers */ - while (btf_type_is_modifier(t)) { - btf_id = t->type; - t = btf_type_by_id(btf_vmlinux, t->type); - } - if (!btf_type_is_struct(t)) { - bpf_log(log, "ARG_PTR_TO_BTF is not a struct\n"); - return -EFAULT; - } - bpf_log(log, "helper %s arg%d has btf_id %d struct %s\n", fnname + 4, - arg, btf_id, __btf_name_by_offset(btf_vmlinux, t->name_off)); - return btf_id; -} - int btf_resolve_helper_id(struct bpf_verifier_log *log, const struct bpf_func_proto *fn, int arg) { - int *btf_id = &fn->btf_id[arg]; - int ret; + int id; - if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID) + if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID || !btf_vmlinux) return -EINVAL; - - ret = READ_ONCE(*btf_id); - if (ret) - return ret; - /* ok to race the search. The result is the same */ - ret = __btf_resolve_helper_id(log, fn->func, arg); - if (!ret) { - /* Function argument cannot be type 'void' */ - bpf_log(log, "BTF resolution bug\n"); - return -EFAULT; - } - WRITE_ONCE(*btf_id, ret); - return ret; + id = fn->btf_id[arg]; + if (!id || id > btf_vmlinux->nr_types) + return -EINVAL; + return id; } static int __get_type_size(struct btf *btf, u32 btf_id, diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index cb305e71e7de..83ff127ef7ae 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -37,17 +37,34 @@ static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[]) } static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[], - struct bpf_prog *prog) + struct bpf_cgroup_storage *new_storages[], + enum bpf_attach_type type, + struct bpf_prog *prog, + struct cgroup *cgrp) { enum bpf_cgroup_storage_type stype; + struct bpf_cgroup_storage_key key; + struct bpf_map *map; + + key.cgroup_inode_id = cgroup_id(cgrp); + key.attach_type = type; for_each_cgroup_storage_type(stype) { + map = prog->aux->cgroup_storage[stype]; + if (!map) + continue; + + storages[stype] = cgroup_storage_lookup((void *)map, &key, false); + if (storages[stype]) + continue; + storages[stype] = bpf_cgroup_storage_alloc(prog, stype); if (IS_ERR(storages[stype])) { - storages[stype] = NULL; - bpf_cgroup_storages_free(storages); + bpf_cgroup_storages_free(new_storages); return -ENOMEM; } + + new_storages[stype] = storages[stype]; } return 0; @@ -63,7 +80,7 @@ static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[], } static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[], - struct cgroup* cgrp, + struct cgroup *cgrp, enum bpf_attach_type attach_type) { enum bpf_cgroup_storage_type stype; @@ -72,14 +89,6 @@ static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[], bpf_cgroup_storage_link(storages[stype], cgrp, attach_type); } -static void bpf_cgroup_storages_unlink(struct bpf_cgroup_storage *storages[]) -{ - enum bpf_cgroup_storage_type stype; - - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_unlink(storages[stype]); -} - /* Called when bpf_cgroup_link is auto-detached from dying cgroup. * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It * doesn't free link memory, which will eventually be done by bpf_link's @@ -101,22 +110,23 @@ static void cgroup_bpf_release(struct work_struct *work) struct cgroup *p, *cgrp = container_of(work, struct cgroup, bpf.release_work); struct bpf_prog_array *old_array; + struct list_head *storages = &cgrp->bpf.storages; + struct bpf_cgroup_storage *storage, *stmp; + unsigned int type; mutex_lock(&cgroup_mutex); for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { struct list_head *progs = &cgrp->bpf.progs[type]; - struct bpf_prog_list *pl, *tmp; + struct bpf_prog_list *pl, *pltmp; - list_for_each_entry_safe(pl, tmp, progs, node) { + list_for_each_entry_safe(pl, pltmp, progs, node) { list_del(&pl->node); if (pl->prog) bpf_prog_put(pl->prog); if (pl->link) bpf_cgroup_link_auto_detach(pl->link); - bpf_cgroup_storages_unlink(pl->storage); - bpf_cgroup_storages_free(pl->storage); kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } @@ -126,6 +136,11 @@ static void cgroup_bpf_release(struct work_struct *work) bpf_prog_array_free(old_array); } + list_for_each_entry_safe(storage, stmp, storages, list_cg) { + bpf_cgroup_storage_unlink(storage); + bpf_cgroup_storage_free(storage); + } + mutex_unlock(&cgroup_mutex); for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) @@ -290,6 +305,8 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) for (i = 0; i < NR; i++) INIT_LIST_HEAD(&cgrp->bpf.progs[i]); + INIT_LIST_HEAD(&cgrp->bpf.storages); + for (i = 0; i < NR; i++) if (compute_effective_progs(cgrp, i, &arrays[i])) goto cleanup; @@ -378,7 +395,7 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs, } list_for_each_entry(pl, progs, node) { - if (prog && pl->prog == prog) + if (prog && pl->prog == prog && prog != replace_prog) /* disallow attaching the same prog twice */ return ERR_PTR(-EINVAL); if (link && pl->link == link) @@ -422,7 +439,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; - struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_prog_list *pl; int err; @@ -455,17 +472,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, if (IS_ERR(pl)) return PTR_ERR(pl); - if (bpf_cgroup_storages_alloc(storage, prog ? : link->link.prog)) + if (bpf_cgroup_storages_alloc(storage, new_storage, type, + prog ? : link->link.prog, cgrp)) return -ENOMEM; if (pl) { old_prog = pl->prog; - bpf_cgroup_storages_unlink(pl->storage); - bpf_cgroup_storages_assign(old_storage, pl->storage); } else { pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { - bpf_cgroup_storages_free(storage); + bpf_cgroup_storages_free(new_storage); return -ENOMEM; } list_add_tail(&pl->node, progs); @@ -480,12 +496,11 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, if (err) goto cleanup; - bpf_cgroup_storages_free(old_storage); if (old_prog) bpf_prog_put(old_prog); else static_branch_inc(&cgroup_bpf_enabled_key); - bpf_cgroup_storages_link(pl->storage, cgrp, type); + bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; cleanup: @@ -493,9 +508,7 @@ cleanup: pl->prog = old_prog; pl->link = NULL; } - bpf_cgroup_storages_free(pl->storage); - bpf_cgroup_storages_assign(pl->storage, old_storage); - bpf_cgroup_storages_link(pl->storage, cgrp, type); + bpf_cgroup_storages_free(new_storage); if (!old_prog) { list_del(&pl->node); kfree(pl); @@ -557,8 +570,9 @@ found: * * Must be called with cgroup_mutex held. */ -int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, - struct bpf_prog *new_prog) +static int __cgroup_bpf_replace(struct cgroup *cgrp, + struct bpf_cgroup_link *link, + struct bpf_prog *new_prog) { struct list_head *progs = &cgrp->bpf.progs[link->type]; struct bpf_prog *old_prog; @@ -583,6 +597,30 @@ int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, return 0; } +static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + struct bpf_cgroup_link *cg_link; + int ret; + + cg_link = container_of(link, struct bpf_cgroup_link, link); + + mutex_lock(&cgroup_mutex); + /* link might have been auto-released by dying cgroup, so fail */ + if (!cg_link->cgroup) { + ret = -ENOLINK; + goto out_unlock; + } + if (old_prog && link->prog != old_prog) { + ret = -EPERM; + goto out_unlock; + } + ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog); +out_unlock: + mutex_unlock(&cgroup_mutex); + return ret; +} + static struct bpf_prog_list *find_detach_entry(struct list_head *progs, struct bpf_prog *prog, struct bpf_cgroup_link *link, @@ -654,8 +692,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* now can actually delete it from this cgroup list */ list_del(&pl->node); - bpf_cgroup_storages_unlink(pl->storage); - bpf_cgroup_storages_free(pl->storage); kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ @@ -778,6 +814,7 @@ static void bpf_cgroup_link_release(struct bpf_link *link) { struct bpf_cgroup_link *cg_link = container_of(link, struct bpf_cgroup_link, link); + struct cgroup *cg; /* link might have been auto-detached by dying cgroup already, * in that case our work is done here @@ -796,8 +833,12 @@ static void bpf_cgroup_link_release(struct bpf_link *link) WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, cg_link->type)); + cg = cg_link->cgroup; + cg_link->cgroup = NULL; + mutex_unlock(&cgroup_mutex); - cgroup_put(cg_link->cgroup); + + cgroup_put(cg); } static void bpf_cgroup_link_dealloc(struct bpf_link *link) @@ -808,17 +849,64 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link) kfree(cg_link); } -const struct bpf_link_ops bpf_cgroup_link_lops = { +static int bpf_cgroup_link_detach(struct bpf_link *link) +{ + bpf_cgroup_link_release(link); + + return 0; +} + +static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + u64 cg_id = 0; + + mutex_lock(&cgroup_mutex); + if (cg_link->cgroup) + cg_id = cgroup_id(cg_link->cgroup); + mutex_unlock(&cgroup_mutex); + + seq_printf(seq, + "cgroup_id:\t%llu\n" + "attach_type:\t%d\n", + cg_id, + cg_link->type); +} + +static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + u64 cg_id = 0; + + mutex_lock(&cgroup_mutex); + if (cg_link->cgroup) + cg_id = cgroup_id(cg_link->cgroup); + mutex_unlock(&cgroup_mutex); + + info->cgroup.cgroup_id = cg_id; + info->cgroup.attach_type = cg_link->type; + return 0; +} + +static const struct bpf_link_ops bpf_cgroup_link_lops = { .release = bpf_cgroup_link_release, .dealloc = bpf_cgroup_link_dealloc, + .detach = bpf_cgroup_link_detach, + .update_prog = cgroup_bpf_replace, + .show_fdinfo = bpf_cgroup_link_show_fdinfo, + .fill_link_info = bpf_cgroup_link_fill_link_info, }; int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + struct bpf_link_primer link_primer; struct bpf_cgroup_link *link; - struct file *link_file; struct cgroup *cgrp; - int err, link_fd; + int err; if (attr->link_create.flags) return -EINVAL; @@ -832,26 +920,25 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) err = -ENOMEM; goto out_put_cgroup; } - bpf_link_init(&link->link, &bpf_cgroup_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops, + prog); link->cgroup = cgrp; link->type = attr->link_create.attach_type; - link_file = bpf_link_new_file(&link->link, &link_fd); - if (IS_ERR(link_file)) { + err = bpf_link_prime(&link->link, &link_primer); + if (err) { kfree(link); - err = PTR_ERR(link_file); goto out_put_cgroup; } err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type, BPF_F_ALLOW_MULTI); if (err) { - bpf_link_cleanup(&link->link, link_file, link_fd); + bpf_link_cleanup(&link_primer); goto out_put_cgroup; } - fd_install(link_fd, link_file); - return link_fd; + return bpf_link_settle(&link_primer); out_put_cgroup: cgroup_put(cgrp); @@ -1054,36 +1141,21 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, return !allow; } -EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); static const struct bpf_func_proto * cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return &bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_trace_printk: - if (capable(CAP_SYS_ADMIN)) - return bpf_get_trace_printk_proto(); - /* fall through */ + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; default: - return NULL; + return bpf_base_func_proto(func_id); } } @@ -1137,16 +1209,13 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @head: sysctl table header * @table: sysctl table * @write: sysctl is being read (= 0) or written (= 1) - * @buf: pointer to buffer passed by user space + * @buf: pointer to buffer (in and out) * @pcount: value-result argument: value is size of buffer pointed to by @buf, * result is size of @new_buf if program set new value, initial value * otherwise * @ppos: value-result argument: value is position at which read from or write * to sysctl is happening, result is new position if program overrode it, * initial value otherwise - * @new_buf: pointer to pointer to new buffer that will be allocated if program - * overrides new value provided by user space on sysctl write - * NOTE: it's caller responsibility to free *new_buf if it was set * @type: type of program to be executed * * Program is run when sysctl is being accessed, either read or written, and @@ -1157,8 +1226,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { */ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - void __user *buf, size_t *pcount, - loff_t *ppos, void **new_buf, + void **buf, size_t *pcount, loff_t *ppos, enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { @@ -1173,36 +1241,28 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, .new_updated = 0, }; struct cgroup *cgrp; + loff_t pos = 0; int ret; ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); - if (ctx.cur_val) { - mm_segment_t old_fs; - loff_t pos = 0; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - if (table->proc_handler(table, 0, (void __user *)ctx.cur_val, - &ctx.cur_len, &pos)) { - /* Let BPF program decide how to proceed. */ - ctx.cur_len = 0; - } - set_fs(old_fs); - } else { + if (!ctx.cur_val || + table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) { /* Let BPF program decide how to proceed. */ ctx.cur_len = 0; } - if (write && buf && *pcount) { + if (write && *buf && *pcount) { /* BPF program should be able to override new value with a * buffer bigger than provided by user. */ ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount); - if (!ctx.new_val || - copy_from_user(ctx.new_val, buf, ctx.new_len)) + if (ctx.new_val) { + memcpy(ctx.new_val, *buf, ctx.new_len); + } else { /* Let BPF program decide how to proceed. */ ctx.new_len = 0; + } } rcu_read_lock(); @@ -1213,7 +1273,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.cur_val); if (ret == 1 && ctx.new_updated) { - *new_buf = ctx.new_val; + kfree(*buf); + *buf = ctx.new_val; *pcount = ctx.new_len; } else { kfree(ctx.new_val); @@ -1221,7 +1282,6 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, return ret == 1 ? 0 : -EPERM; } -EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); #ifdef CONFIG_NET static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, @@ -1240,16 +1300,23 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) { - if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0) + if (unlikely(max_optlen < 0)) return -EINVAL; + if (unlikely(max_optlen > PAGE_SIZE)) { + /* We don't expose optvals that are greater than PAGE_SIZE + * to the BPF program. + */ + max_optlen = PAGE_SIZE; + } + ctx->optval = kzalloc(max_optlen, GFP_USER); if (!ctx->optval) return -ENOMEM; ctx->optval_end = ctx->optval + max_optlen; - return 0; + return max_optlen; } static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) @@ -1283,13 +1350,13 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, */ max_optlen = max_t(int, 16, *optlen); - ret = sockopt_alloc_buf(&ctx, max_optlen); - if (ret) - return ret; + max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + if (max_optlen < 0) + return max_optlen; ctx.optlen = *optlen; - if (copy_from_user(ctx.optval, optval, *optlen) != 0) { + if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) { ret = -EFAULT; goto out; } @@ -1317,8 +1384,14 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, /* export any potential modifications */ *level = ctx.level; *optname = ctx.optname; - *optlen = ctx.optlen; - *kernel_optval = ctx.optval; + + /* optlen == 0 from BPF indicates that we should + * use original userspace data. + */ + if (ctx.optlen != 0) { + *optlen = ctx.optlen; + *kernel_optval = ctx.optval; + } } out: @@ -1326,7 +1399,6 @@ out: sockopt_free_buf(&ctx); return ret; } -EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt); int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, int optname, char __user *optval, @@ -1350,12 +1422,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) return retval; - ret = sockopt_alloc_buf(&ctx, max_optlen); - if (ret) - return ret; - ctx.optlen = max_optlen; + max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + if (max_optlen < 0) + return max_optlen; + if (!retval) { /* If kernel getsockopt finished successfully, * copy whatever was returned to the user back @@ -1369,10 +1441,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, goto out; } - if (ctx.optlen > max_optlen) - ctx.optlen = max_optlen; - - if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) { + if (copy_from_user(ctx.optval, optval, + min(ctx.optlen, max_optlen)) != 0) { ret = -EFAULT; goto out; } @@ -1401,10 +1471,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, goto out; } - if (copy_to_user(optval, ctx.optval, ctx.optlen) || - put_user(ctx.optlen, optlen)) { - ret = -EFAULT; - goto out; + if (ctx.optlen != 0) { + if (copy_to_user(optval, ctx.optval, ctx.optlen) || + put_user(ctx.optlen, optlen)) { + ret = -EFAULT; + goto out; + } } ret = ctx.retval; @@ -1413,7 +1485,6 @@ out: sockopt_free_buf(&ctx); return ret; } -EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt); #endif static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 916f5132a984..bde93344164d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -82,7 +82,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag struct bpf_prog *fp; size = round_up(size, PAGE_SIZE); - fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(size, gfp_flags); if (fp == NULL) return NULL; @@ -232,7 +232,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, if (ret) return NULL; - fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(size, gfp_flags); if (fp == NULL) { __bpf_prog_uncharge(fp_old->aux->user, delta); } else { @@ -262,10 +262,10 @@ void __bpf_prog_free(struct bpf_prog *fp) int bpf_prog_calc_tag(struct bpf_prog *fp) { - const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64); + const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64); u32 raw_size = bpf_prog_tag_scratch_size(fp); - u32 digest[SHA_DIGEST_WORDS]; - u32 ws[SHA_WORKSPACE_WORDS]; + u32 digest[SHA1_DIGEST_WORDS]; + u32 ws[SHA1_WORKSPACE_WORDS]; u32 i, bsize, psize, blocks; struct bpf_insn *dst; bool was_ld_map; @@ -277,7 +277,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) if (!raw) return -ENOMEM; - sha_init(digest); + sha1_init(digest); memset(ws, 0, sizeof(ws)); /* We need to take out the map fd for the digest calculation @@ -308,8 +308,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) memset(&raw[psize], 0, raw_size - psize); raw[psize++] = 0x80; - bsize = round_up(psize, SHA_MESSAGE_BYTES); - blocks = bsize / SHA_MESSAGE_BYTES; + bsize = round_up(psize, SHA1_BLOCK_SIZE); + blocks = bsize / SHA1_BLOCK_SIZE; todo = raw; if (bsize - psize >= sizeof(__be64)) { bits = (__be64 *)(todo + bsize - sizeof(__be64)); @@ -320,12 +320,12 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) *bits = cpu_to_be64((psize - 1) << 3); while (blocks--) { - sha_transform(digest, todo, ws); - todo += SHA_MESSAGE_BYTES; + sha1_transform(digest, todo, ws); + todo += SHA1_BLOCK_SIZE; } result = (__force __be32 *)digest; - for (i = 0; i < SHA_DIGEST_WORDS; i++) + for (i = 0; i < SHA1_DIGEST_WORDS; i++) result[i] = cpu_to_be32(digest[i]); memcpy(fp->tag, result, sizeof(fp->tag)); @@ -646,7 +646,7 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) return; bpf_prog_ksym_set_addr(fp); @@ -1089,7 +1089,7 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; - fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags); if (fp != NULL) { /* aux->prog still points to the fp_other one, so * when promoting the clone to the real program, @@ -1543,7 +1543,7 @@ select_insn: /* ARG1 at this point is guaranteed to point to CTX from * the verifier side due to the fact that the tail call is - * handeled like a helper, that is, bpf_tail_call_proto, + * handled like a helper, that is, bpf_tail_call_proto, * where arg1_type is ARG_PTR_TO_CTX. */ insn = prog->insnsi; @@ -1958,6 +1958,61 @@ void bpf_prog_array_delete_safe(struct bpf_prog_array *array, } } +/** + * bpf_prog_array_delete_safe_at() - Replaces the program at the given + * index into the program array with + * a dummy no-op program. + * @array: a bpf_prog_array + * @index: the index of the program to replace + * + * Skips over dummy programs, by not counting them, when calculating + * the the position of the program to replace. + * + * Return: + * * 0 - Success + * * -EINVAL - Invalid index value. Must be a non-negative integer. + * * -ENOENT - Index out of range + */ +int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index) +{ + return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog); +} + +/** + * bpf_prog_array_update_at() - Updates the program at the given index + * into the program array. + * @array: a bpf_prog_array + * @index: the index of the program to update + * @prog: the program to insert into the array + * + * Skips over dummy programs, by not counting them, when calculating + * the position of the program to update. + * + * Return: + * * 0 - Success + * * -EINVAL - Invalid index value. Must be a non-negative integer. + * * -ENOENT - Index out of range + */ +int bpf_prog_array_update_at(struct bpf_prog_array *array, int index, + struct bpf_prog *prog) +{ + struct bpf_prog_array_item *item; + + if (unlikely(index < 0)) + return -EINVAL; + + for (item = array->items; item->prog; item++) { + if (item->prog == &dummy_bpf_prog.prog) + continue; + if (!index) { + WRITE_ONCE(item->prog, prog); + return 0; + } + index--; + } + return -ENOENT; +} + int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, @@ -2042,24 +2097,12 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array, : 0; } -static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux) -{ - enum bpf_cgroup_storage_type stype; - - for_each_cgroup_storage_type(stype) { - if (!aux->cgroup_storage[stype]) - continue; - bpf_cgroup_storage_release(aux, aux->cgroup_storage[stype]); - } -} - void __bpf_free_used_maps(struct bpf_prog_aux *aux, struct bpf_map **used_maps, u32 len) { struct bpf_map *map; u32 i; - bpf_free_cgroup_storage(aux); for (i = 0; i < len; i++) { map = used_maps[i]; if (map->ops->map_poke_untrack) @@ -2136,6 +2179,11 @@ BPF_CALL_0(bpf_user_rnd_u32) return res; } +BPF_CALL_0(bpf_get_raw_cpu_id) +{ + return raw_smp_processor_id(); +} + /* Weak definitions of helper functions in case we don't have bpf syscall. */ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; @@ -2151,6 +2199,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; +const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 3fe0b006d2d2..f1c46529929b 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -52,7 +52,6 @@ struct xdp_bulk_queue { struct bpf_cpu_map_entry { u32 cpu; /* kthread CPU and map index */ int map_id; /* Back reference to map */ - u32 qsize; /* Queue size placeholder for map lookup */ /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ struct xdp_bulk_queue __percpu *bulkq; @@ -62,10 +61,14 @@ struct bpf_cpu_map_entry { /* Queue with potential multi-producers, and single-consumer kthread */ struct ptr_ring *queue; struct task_struct *kthread; - struct work_struct kthread_stop_wq; + + struct bpf_cpumap_val value; + struct bpf_prog *prog; atomic_t refcnt; /* Control when this struct can be free'ed */ struct rcu_head rcu; + + struct work_struct kthread_stop_wq; }; struct bpf_cpu_map { @@ -80,17 +83,20 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { + u32 value_size = attr->value_size; struct bpf_cpu_map *cmap; int err = -ENOMEM; u64 cost; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + (value_size != offsetofend(struct bpf_cpumap_val, qsize) && + value_size != offsetofend(struct bpf_cpumap_val, bpf_prog.fd)) || + attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); cmap = kzalloc(sizeof(*cmap), GFP_USER); @@ -162,25 +168,10 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, /* Part of headroom was reserved to xdpf */ hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; - /* build_skb need to place skb_shared_info after SKB end, and - * also want to know the memory "truesize". Thus, need to - * know the memory frame size backing xdp_buff. - * - * XDP was designed to have PAGE_SIZE frames, but this - * assumption is not longer true with ixgbe and i40e. It - * would be preferred to set frame_size to 2048 or 4096 - * depending on the driver. - * frame_size = 2048; - * frame_len = frame_size - sizeof(*xdp_frame); - * - * Instead, with info avail, skb_shared_info in placed after - * packet len. This, unfortunately fakes the truesize. - * Another disadvantage of this approach, the skb_shared_info - * is not at a fixed memory location, with mixed length - * packets, which is bad for cache-line hotness. + /* Memory size backing xdp_frame data already have reserved + * room for build_skb to place skb_shared_info in tailroom. */ - frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + frame_size = xdpf->frame_sz; pkt_data_start = xdpf->data - hard_start_headroom; skb = build_skb_around(skb, pkt_data_start, frame_size); @@ -227,6 +218,8 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) { if (atomic_dec_and_test(&rcpu->refcnt)) { + if (rcpu->prog) + bpf_prog_put(rcpu->prog); /* The queue should be empty at this point */ __cpu_map_ring_cleanup(rcpu->queue); ptr_ring_cleanup(rcpu->queue, NULL); @@ -235,6 +228,75 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) } } +static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, + void **frames, int n, + struct xdp_cpumap_stats *stats) +{ + struct xdp_rxq_info rxq; + struct xdp_buff xdp; + int i, nframes = 0; + + if (!rcpu->prog) + return n; + + rcu_read_lock_bh(); + + xdp_set_return_frame_no_direct(); + xdp.rxq = &rxq; + + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + u32 act; + int err; + + rxq.dev = xdpf->dev_rx; + rxq.mem = xdpf->mem; + /* TODO: report queue_index to xdp_rxq_info */ + + xdp_convert_frame_to_buff(xdpf, &xdp); + + act = bpf_prog_run_xdp(rcpu->prog, &xdp); + switch (act) { + case XDP_PASS: + err = xdp_update_frame_from_buff(&xdp, xdpf); + if (err < 0) { + xdp_return_frame(xdpf); + stats->drop++; + } else { + frames[nframes++] = xdpf; + stats->pass++; + } + break; + case XDP_REDIRECT: + err = xdp_do_redirect(xdpf->dev_rx, &xdp, + rcpu->prog); + if (unlikely(err)) { + xdp_return_frame(xdpf); + stats->drop++; + } else { + stats->redirect++; + } + break; + default: + bpf_warn_invalid_xdp_action(act); + /* fallthrough */ + case XDP_DROP: + xdp_return_frame(xdpf); + stats->drop++; + break; + } + } + + if (stats->redirect) + xdp_do_flush_map(); + + xdp_clear_return_frame_no_direct(); + + rcu_read_unlock_bh(); /* resched point, may call do_softirq() */ + + return nframes; +} + #define CPUMAP_BATCH 8 static int cpu_map_kthread_run(void *data) @@ -249,11 +311,12 @@ static int cpu_map_kthread_run(void *data) * kthread_stop signal until queue is empty. */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { + struct xdp_cpumap_stats stats = {}; /* zero stats */ + gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; unsigned int drops = 0, sched = 0; void *frames[CPUMAP_BATCH]; void *skbs[CPUMAP_BATCH]; - gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; - int i, n, m; + int i, n, m, nframes; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -274,8 +337,8 @@ static int cpu_map_kthread_run(void *data) * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. */ - n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); - + n = __ptr_ring_consume_batched(rcpu->queue, frames, + CPUMAP_BATCH); for (i = 0; i < n; i++) { void *f = frames[i]; struct page *page = virt_to_page(f); @@ -287,15 +350,19 @@ static int cpu_map_kthread_run(void *data) prefetchw(page); } - m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs); - if (unlikely(m == 0)) { - for (i = 0; i < n; i++) - skbs[i] = NULL; /* effect: xdp_return_frame */ - drops = n; + /* Support running another XDP prog on this CPU */ + nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, n, &stats); + if (nframes) { + m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs); + if (unlikely(m == 0)) { + for (i = 0; i < nframes; i++) + skbs[i] = NULL; /* effect: xdp_return_frame */ + drops += nframes; + } } local_bh_disable(); - for (i = 0; i < n; i++) { + for (i = 0; i < nframes; i++) { struct xdp_frame *xdpf = frames[i]; struct sk_buff *skb = skbs[i]; int ret; @@ -312,7 +379,7 @@ static int cpu_map_kthread_run(void *data) drops++; } /* Feedback loop via tracepoint */ - trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched); + trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched, &stats); local_bh_enable(); /* resched point, may call do_softirq() */ } @@ -322,13 +389,38 @@ static int cpu_map_kthread_run(void *data) return 0; } -static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, - int map_id) +bool cpu_map_prog_allowed(struct bpf_map *map) +{ + return map->map_type == BPF_MAP_TYPE_CPUMAP && + map->value_size != offsetofend(struct bpf_cpumap_val, qsize); +} + +static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) +{ + struct bpf_prog *prog; + + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->expected_attach_type != BPF_XDP_CPUMAP) { + bpf_prog_put(prog); + return -EINVAL; + } + + rcpu->value.bpf_prog.id = prog->aux->id; + rcpu->prog = prog; + + return 0; +} + +static struct bpf_cpu_map_entry * +__cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) { + int numa, err, i, fd = value->bpf_prog.fd; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; struct xdp_bulk_queue *bq; - int numa, err, i; /* Have map->numa_node, but choose node of redirect target CPU */ numa = cpu_to_node(cpu); @@ -353,19 +445,22 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, if (!rcpu->queue) goto free_bulkq; - err = ptr_ring_init(rcpu->queue, qsize, gfp); + err = ptr_ring_init(rcpu->queue, value->qsize, gfp); if (err) goto free_queue; rcpu->cpu = cpu; rcpu->map_id = map_id; - rcpu->qsize = qsize; + rcpu->value.qsize = value->qsize; + + if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd)) + goto free_ptr_ring; /* Setup kthread */ rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, "cpumap/%d/map:%d", cpu, map_id); if (IS_ERR(rcpu->kthread)) - goto free_ptr_ring; + goto free_prog; get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */ get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */ @@ -376,6 +471,9 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, return rcpu; +free_prog: + if (rcpu->prog) + bpf_prog_put(rcpu->prog); free_ptr_ring: ptr_ring_cleanup(rcpu->queue, NULL); free_queue: @@ -452,12 +550,12 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + struct bpf_cpumap_val cpumap_value = {}; struct bpf_cpu_map_entry *rcpu; - /* Array index key correspond to CPU number */ u32 key_cpu = *(u32 *)key; - /* Value is the queue size */ - u32 qsize = *(u32 *)value; + + memcpy(&cpumap_value, value, map->value_size); if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; @@ -465,18 +563,18 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, return -E2BIG; if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; - if (unlikely(qsize > 16384)) /* sanity limit on qsize */ + if (unlikely(cpumap_value.qsize > 16384)) /* sanity limit on qsize */ return -EOVERFLOW; /* Make sure CPU is a valid possible cpu */ if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu)) return -ENODEV; - if (qsize == 0) { + if (cpumap_value.qsize == 0) { rcpu = NULL; /* Same as deleting */ } else { /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ - rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); + rcpu = __cpu_map_entry_alloc(&cpumap_value, key_cpu, map->id); if (!rcpu) return -ENOMEM; rcpu->cmap = cmap; @@ -538,7 +636,7 @@ static void *cpu_map_lookup_elem(struct bpf_map *map, void *key) struct bpf_cpu_map_entry *rcpu = __cpu_map_lookup_elem(map, *(u32 *)key); - return rcpu ? &rcpu->qsize : NULL; + return rcpu ? &rcpu->value : NULL; } static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) @@ -558,6 +656,7 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +static int cpu_map_btf_id; const struct bpf_map_ops cpu_map_ops = { .map_alloc = cpu_map_alloc, .map_free = cpu_map_free, @@ -566,6 +665,8 @@ const struct bpf_map_ops cpu_map_ops = { .map_lookup_elem = cpu_map_lookup_elem, .map_get_next_key = cpu_map_get_next_key, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_cpu_map", + .map_btf_id = &cpu_map_btf_id, }; static int bq_flush_to_queue(struct xdp_bulk_queue *bq) @@ -636,7 +737,7 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, { struct xdp_frame *xdpf; - xdpf = convert_to_xdp_frame(xdp); + xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 58bdca5d978a..10abb06065bb 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -52,7 +52,6 @@ #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) -#define DEV_MAP_BULK_SIZE 16 struct xdp_dev_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; struct list_head flush_node; @@ -65,8 +64,10 @@ struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_dtab *dtab; + struct bpf_prog *xdp_prog; struct rcu_head rcu; unsigned int idx; + struct bpf_devmap_val val; }; struct bpf_dtab { @@ -85,12 +86,13 @@ static DEFINE_PER_CPU(struct list_head, dev_flush_list); static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); -static struct hlist_head *dev_map_create_hash(unsigned int entries) +static struct hlist_head *dev_map_create_hash(unsigned int entries, + int numa_node) { int i; struct hlist_head *hash; - hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL); + hash = bpf_map_area_alloc(entries * sizeof(*hash), numa_node); if (hash != NULL) for (i = 0; i < entries; i++) INIT_HLIST_HEAD(&hash[i]); @@ -106,12 +108,18 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { + u32 valsize = attr->value_size; u64 cost = 0; int err; - /* check sanity of attributes */ + /* check sanity of attributes. 2 value sizes supported: + * 4 bytes: ifindex + * 8 bytes: ifindex + prog fd + */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) + (valsize != offsetofend(struct bpf_devmap_val, ifindex) && + valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || + attr->map_flags & ~DEV_CREATE_FLAG_MASK) return -EINVAL; /* Lookup returns a pointer straight to dev->ifindex, so make sure the @@ -138,7 +146,8 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) return -EINVAL; if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { - dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); + dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets, + dtab->map.numa_node); if (!dtab->dev_index_head) goto free_charge; @@ -218,12 +227,14 @@ static void dev_map_free(struct bpf_map *map) hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } } - kfree(dtab->dev_index_head); + bpf_map_area_free(dtab->dev_index_head); } else { for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; @@ -232,6 +243,8 @@ static void dev_map_free(struct bpf_map *map) if (!dev) continue; + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -318,6 +331,16 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } +bool dev_map_can_have_prog(struct bpf_map *map) +{ + if ((map->map_type == BPF_MAP_TYPE_DEVMAP || + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) && + map->value_size != offsetofend(struct bpf_devmap_val, ifindex)) + return true; + + return false; +} + static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; @@ -435,13 +458,41 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, if (unlikely(err)) return err; - xdpf = convert_to_xdp_frame(xdp); + xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; return bq_enqueue(dev, xdpf, dev_rx); } +static struct xdp_buff *dev_map_run_prog(struct net_device *dev, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + struct xdp_txq_info txq = { .dev = dev }; + u32 act; + + xdp_set_data_meta_invalid(xdp); + xdp->txq = &txq; + + act = bpf_prog_run_xdp(xdp_prog, xdp); + switch (act) { + case XDP_PASS: + return xdp; + case XDP_DROP: + break; + default: + bpf_warn_invalid_xdp_action(act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(dev, xdp_prog, act); + break; + } + + xdp_return_buff(xdp); + return NULL; +} + int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, struct net_device *dev_rx) { @@ -453,6 +504,11 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, { struct net_device *dev = dst->dev; + if (dst->xdp_prog) { + xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog); + if (!xdp) + return 0; + } return __xdp_enqueue(dev, xdp, dev_rx); } @@ -473,18 +529,15 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); - struct net_device *dev = obj ? obj->dev : NULL; - return dev ? &dev->ifindex : NULL; + return obj ? &obj->val : NULL; } static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map, *(u32 *)key); - struct net_device *dev = obj ? obj->dev : NULL; - - return dev ? &dev->ifindex : NULL; + return obj ? &obj->val : NULL; } static void __dev_map_entry_free(struct rcu_head *rcu) @@ -492,6 +545,8 @@ static void __dev_map_entry_free(struct rcu_head *rcu) struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -542,9 +597,10 @@ static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab *dtab, - u32 ifindex, + struct bpf_devmap_val *val, unsigned int idx) { + struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, @@ -552,16 +608,38 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, if (!dev) return ERR_PTR(-ENOMEM); - dev->dev = dev_get_by_index(net, ifindex); - if (!dev->dev) { - kfree(dev); - return ERR_PTR(-EINVAL); + dev->dev = dev_get_by_index(net, val->ifindex); + if (!dev->dev) + goto err_out; + + if (val->bpf_prog.fd > 0) { + prog = bpf_prog_get_type_dev(val->bpf_prog.fd, + BPF_PROG_TYPE_XDP, false); + if (IS_ERR(prog)) + goto err_put_dev; + if (prog->expected_attach_type != BPF_XDP_DEVMAP) + goto err_put_prog; } dev->idx = idx; dev->dtab = dtab; + if (prog) { + dev->xdp_prog = prog; + dev->val.bpf_prog.id = prog->aux->id; + } else { + dev->xdp_prog = NULL; + dev->val.bpf_prog.id = 0; + } + dev->val.ifindex = val->ifindex; return dev; +err_put_prog: + bpf_prog_put(prog); +err_put_dev: + dev_put(dev->dev); +err_out: + kfree(dev); + return ERR_PTR(-EINVAL); } static int __dev_map_update_elem(struct net *net, struct bpf_map *map, @@ -569,7 +647,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; - u32 ifindex = *(u32 *)value; + struct bpf_devmap_val val = {}; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) @@ -579,10 +657,16 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; - if (!ifindex) { + /* already verified value_size <= sizeof val */ + memcpy(&val, value, map->value_size); + + if (!val.ifindex) { dev = NULL; + /* can not specify fd if ifindex is 0 */ + if (val.bpf_prog.fd > 0) + return -EINVAL; } else { - dev = __dev_map_alloc_node(net, dtab, ifindex, i); + dev = __dev_map_alloc_node(net, dtab, &val, i); if (IS_ERR(dev)) return PTR_ERR(dev); } @@ -610,12 +694,15 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; - u32 ifindex = *(u32 *)value; + struct bpf_devmap_val val = {}; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; - if (unlikely(map_flags > BPF_EXIST || !ifindex)) + /* already verified value_size <= sizeof val */ + memcpy(&val, value, map->value_size); + + if (unlikely(map_flags > BPF_EXIST || !val.ifindex)) return -EINVAL; spin_lock_irqsave(&dtab->index_lock, flags); @@ -624,7 +711,7 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, if (old_dev && (map_flags & BPF_NOEXIST)) goto out_err; - dev = __dev_map_alloc_node(net, dtab, ifindex, idx); + dev = __dev_map_alloc_node(net, dtab, &val, idx); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out_err; @@ -662,6 +749,7 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, map, key, value, map_flags); } +static int dev_map_btf_id; const struct bpf_map_ops dev_map_ops = { .map_alloc = dev_map_alloc, .map_free = dev_map_free, @@ -670,8 +758,11 @@ const struct bpf_map_ops dev_map_ops = { .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_dtab", + .map_btf_id = &dev_map_btf_id, }; +static int dev_map_hash_map_btf_id; const struct bpf_map_ops dev_map_hash_ops = { .map_alloc = dev_map_alloc, .map_free = dev_map_free, @@ -680,6 +771,8 @@ const struct bpf_map_ops dev_map_hash_ops = { .map_update_elem = dev_map_hash_update_elem, .map_delete_elem = dev_map_hash_delete_elem, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_dtab", + .map_btf_id = &dev_map_hash_map_btf_id, }; static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d541c8486c95..78dfff6a501b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -359,9 +359,9 @@ static int htab_map_alloc_check(union bpf_attr *attr) BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != offsetof(struct htab_elem, hash_node.pprev)); - if (lru && !capable(CAP_SYS_ADMIN)) + if (lru && !bpf_capable()) /* LRU implementation is much complicated than other - * maps. Hence, limit to CAP_SYS_ADMIN for now. + * maps. Hence, limit to CAP_BPF. */ return -EPERM; @@ -779,15 +779,20 @@ static void htab_elem_free_rcu(struct rcu_head *head) htab_elem_free(htab, l); } -static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) +static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) { struct bpf_map *map = &htab->map; + void *ptr; if (map->ops->map_fd_put_ptr) { - void *ptr = fd_htab_map_get_ptr(map, l); - + ptr = fd_htab_map_get_ptr(map, l); map->ops->map_fd_put_ptr(ptr); } +} + +static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) +{ + htab_put_fd_value(htab, l); if (htab_is_prealloc(htab)) { __pcpu_freelist_push(&htab->freelist, &l->fnode); @@ -839,6 +844,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, */ pl_new = this_cpu_ptr(htab->extra_elems); l_new = *pl_new; + htab_put_fd_value(htab, old_elem); *pl_new = old_elem; } else { struct pcpu_freelist_node *l; @@ -1290,12 +1296,10 @@ static void htab_map_free(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, - * so the programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding critical sections in - * these programs to complete + /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. + * bpf_free_used_maps() is called after bpf prog is no longer executing. + * There is no need to synchronize_rcu() here to protect map elements. */ - synchronize_rcu(); /* some of free_htab_elem() callbacks for elements of this map may * not have executed. Wait for them. @@ -1614,6 +1618,197 @@ htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, true, false); } +struct bpf_iter_seq_hash_map_info { + struct bpf_map *map; + struct bpf_htab *htab; + void *percpu_value_buf; // non-zero means percpu hash + unsigned long flags; + u32 bucket_id; + u32 skip_elems; +}; + +static struct htab_elem * +bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, + struct htab_elem *prev_elem) +{ + const struct bpf_htab *htab = info->htab; + unsigned long flags = info->flags; + u32 skip_elems = info->skip_elems; + u32 bucket_id = info->bucket_id; + struct hlist_nulls_head *head; + struct hlist_nulls_node *n; + struct htab_elem *elem; + struct bucket *b; + u32 i, count; + + if (bucket_id >= htab->n_buckets) + return NULL; + + /* try to find next elem in the same bucket */ + if (prev_elem) { + /* no update/deletion on this bucket, prev_elem should be still valid + * and we won't skip elements. + */ + n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node)); + elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node); + if (elem) + return elem; + + /* not found, unlock and go to the next bucket */ + b = &htab->buckets[bucket_id++]; + htab_unlock_bucket(htab, b, flags); + skip_elems = 0; + } + + for (i = bucket_id; i < htab->n_buckets; i++) { + b = &htab->buckets[i]; + flags = htab_lock_bucket(htab, b); + + count = 0; + head = &b->head; + hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { + if (count >= skip_elems) { + info->flags = flags; + info->bucket_id = i; + info->skip_elems = count; + return elem; + } + count++; + } + + htab_unlock_bucket(htab, b, flags); + skip_elems = 0; + } + + info->bucket_id = i; + info->skip_elems = 0; + return NULL; +} + +static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_hash_map_info *info = seq->private; + struct htab_elem *elem; + + elem = bpf_hash_map_seq_find_next(info, NULL); + if (!elem) + return NULL; + + if (*pos == 0) + ++*pos; + return elem; +} + +static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_hash_map_info *info = seq->private; + + ++*pos; + ++info->skip_elems; + return bpf_hash_map_seq_find_next(info, v); +} + +static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) +{ + struct bpf_iter_seq_hash_map_info *info = seq->private; + u32 roundup_key_size, roundup_value_size; + struct bpf_iter__bpf_map_elem ctx = {}; + struct bpf_map *map = info->map; + struct bpf_iter_meta meta; + int ret = 0, off = 0, cpu; + struct bpf_prog *prog; + void __percpu *pptr; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, elem == NULL); + if (prog) { + ctx.meta = &meta; + ctx.map = info->map; + if (elem) { + roundup_key_size = round_up(map->key_size, 8); + ctx.key = elem->key; + if (!info->percpu_value_buf) { + ctx.value = elem->key + roundup_key_size; + } else { + roundup_value_size = round_up(map->value_size, 8); + pptr = htab_elem_get_ptr(elem, map->key_size); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(info->percpu_value_buf + off, + per_cpu_ptr(pptr, cpu), + roundup_value_size); + off += roundup_value_size; + } + ctx.value = info->percpu_value_buf; + } + } + ret = bpf_iter_run_prog(prog, &ctx); + } + + return ret; +} + +static int bpf_hash_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_hash_map_seq_show(seq, v); +} + +static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_hash_map_info *info = seq->private; + + if (!v) + (void)__bpf_hash_map_seq_show(seq, NULL); + else + htab_unlock_bucket(info->htab, + &info->htab->buckets[info->bucket_id], + info->flags); +} + +static int bpf_iter_init_hash_map(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct bpf_iter_seq_hash_map_info *seq_info = priv_data; + struct bpf_map *map = aux->map; + void *value_buf; + u32 buf_size; + + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + buf_size = round_up(map->value_size, 8) * num_possible_cpus(); + value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); + if (!value_buf) + return -ENOMEM; + + seq_info->percpu_value_buf = value_buf; + } + + seq_info->map = map; + seq_info->htab = container_of(map, struct bpf_htab, map); + return 0; +} + +static void bpf_iter_fini_hash_map(void *priv_data) +{ + struct bpf_iter_seq_hash_map_info *seq_info = priv_data; + + kfree(seq_info->percpu_value_buf); +} + +static const struct seq_operations bpf_hash_map_seq_ops = { + .start = bpf_hash_map_seq_start, + .next = bpf_hash_map_seq_next, + .stop = bpf_hash_map_seq_stop, + .show = bpf_hash_map_seq_show, +}; + +static const struct bpf_iter_seq_info iter_seq_info = { + .seq_ops = &bpf_hash_map_seq_ops, + .init_seq_private = bpf_iter_init_hash_map, + .fini_seq_private = bpf_iter_fini_hash_map, + .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), +}; + +static int htab_map_btf_id; const struct bpf_map_ops htab_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1625,8 +1820,12 @@ const struct bpf_map_ops htab_map_ops = { .map_gen_lookup = htab_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, BATCH_OPS(htab), + .map_btf_name = "bpf_htab", + .map_btf_id = &htab_map_btf_id, + .iter_seq_info = &iter_seq_info, }; +static int htab_lru_map_btf_id; const struct bpf_map_ops htab_lru_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1639,6 +1838,9 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_gen_lookup = htab_lru_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, BATCH_OPS(htab_lru), + .map_btf_name = "bpf_htab", + .map_btf_id = &htab_lru_map_btf_id, + .iter_seq_info = &iter_seq_info, }; /* Called from eBPF program */ @@ -1743,6 +1945,7 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +static int htab_percpu_map_btf_id; const struct bpf_map_ops htab_percpu_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1753,8 +1956,12 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_delete_elem = htab_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, BATCH_OPS(htab_percpu), + .map_btf_name = "bpf_htab", + .map_btf_id = &htab_percpu_map_btf_id, + .iter_seq_info = &iter_seq_info, }; +static int htab_lru_percpu_map_btf_id; const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1765,6 +1972,9 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, BATCH_OPS(htab_lru_percpu), + .map_btf_name = "bpf_htab", + .map_btf_id = &htab_lru_percpu_map_btf_id, + .iter_seq_info = &iter_seq_info, }; static int fd_htab_map_alloc_check(union bpf_attr *attr) @@ -1887,6 +2097,7 @@ static void htab_of_map_free(struct bpf_map *map) fd_htab_map_free(map); } +static int htab_of_maps_map_btf_id; const struct bpf_map_ops htab_of_maps_map_ops = { .map_alloc_check = fd_htab_map_alloc_check, .map_alloc = htab_of_map_alloc, @@ -1899,4 +2110,6 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = htab_of_map_gen_lookup, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_htab", + .map_btf_id = &htab_of_maps_map_btf_id, }; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bafc53ddd350..be43ab3e619f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -151,7 +151,19 @@ BPF_CALL_0(bpf_ktime_get_ns) const struct bpf_func_proto bpf_ktime_get_ns_proto = { .func = bpf_ktime_get_ns, - .gpl_only = true, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +BPF_CALL_0(bpf_ktime_get_boot_ns) +{ + /* NMI safe access to clock boottime */ + return ktime_get_boot_fast_ns(); +} + +const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = { + .func = bpf_ktime_get_boot_ns, + .gpl_only = false, .ret_type = RET_INTEGER, }; @@ -562,3 +574,114 @@ const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = { .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; + +static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { + .func = bpf_get_raw_cpu_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, + u64, flags, void *, data, u64, size) +{ + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return bpf_event_output(map, flags, data, size, NULL, 0, NULL); +} + +const struct bpf_func_proto bpf_event_output_data_proto = { + .func = bpf_event_output_data, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +const struct bpf_func_proto bpf_get_current_task_proto __weak; +const struct bpf_func_proto bpf_probe_read_user_proto __weak; +const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; +const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; +const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; + +const struct bpf_func_proto * +bpf_base_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; + case BPF_FUNC_get_prandom_u32: + return &bpf_get_prandom_u32_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_raw_smp_processor_id_proto; + case BPF_FUNC_get_numa_node_id: + return &bpf_get_numa_node_id_proto; + case BPF_FUNC_tail_call: + return &bpf_tail_call_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_ringbuf_output: + return &bpf_ringbuf_output_proto; + case BPF_FUNC_ringbuf_reserve: + return &bpf_ringbuf_reserve_proto; + case BPF_FUNC_ringbuf_submit: + return &bpf_ringbuf_submit_proto; + case BPF_FUNC_ringbuf_discard: + return &bpf_ringbuf_discard_proto; + case BPF_FUNC_ringbuf_query: + return &bpf_ringbuf_query_proto; + default: + break; + } + + if (!bpf_capable()) + return NULL; + + switch (func_id) { + case BPF_FUNC_spin_lock: + return &bpf_spin_lock_proto; + case BPF_FUNC_spin_unlock: + return &bpf_spin_unlock_proto; + case BPF_FUNC_trace_printk: + if (!perfmon_capable()) + return NULL; + return bpf_get_trace_printk_proto(); + case BPF_FUNC_jiffies64: + return &bpf_jiffies64_proto; + default: + break; + } + + if (!perfmon_capable()) + return NULL; + + switch (func_id) { + case BPF_FUNC_get_current_task: + return &bpf_get_current_task_proto; + case BPF_FUNC_probe_read_user: + return &bpf_probe_read_user_proto; + case BPF_FUNC_probe_read_kernel: + return &bpf_probe_read_kernel_proto; + case BPF_FUNC_probe_read_user_str: + return &bpf_probe_read_user_str_proto; + case BPF_FUNC_probe_read_kernel_str: + return &bpf_probe_read_kernel_str_proto; + default: + return NULL; + } +} diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 95087d9f4ed3..fb878ba3f22f 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -358,8 +358,11 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg) { + struct bpf_link *link = arg; + return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops, - &bpffs_obj_fops); + bpf_link_is_iter(link) ? + &bpf_iter_fops : &bpffs_obj_fops); } static struct dentry * diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 33d01866bcc2..571bb351ed3b 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -13,6 +13,8 @@ DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STO #ifdef CONFIG_CGROUP_BPF +#include "../cgroup/cgroup-internal.h" + #define LOCAL_STORAGE_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) @@ -20,7 +22,6 @@ struct bpf_cgroup_storage_map { struct bpf_map map; spinlock_t lock; - struct bpf_prog_aux *aux; struct rb_root root; struct list_head list; }; @@ -30,24 +31,41 @@ static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map) return container_of(map, struct bpf_cgroup_storage_map, map); } -static int bpf_cgroup_storage_key_cmp( - const struct bpf_cgroup_storage_key *key1, - const struct bpf_cgroup_storage_key *key2) +static bool attach_type_isolated(const struct bpf_map *map) { - if (key1->cgroup_inode_id < key2->cgroup_inode_id) - return -1; - else if (key1->cgroup_inode_id > key2->cgroup_inode_id) - return 1; - else if (key1->attach_type < key2->attach_type) - return -1; - else if (key1->attach_type > key2->attach_type) - return 1; + return map->key_size == sizeof(struct bpf_cgroup_storage_key); +} + +static int bpf_cgroup_storage_key_cmp(const struct bpf_cgroup_storage_map *map, + const void *_key1, const void *_key2) +{ + if (attach_type_isolated(&map->map)) { + const struct bpf_cgroup_storage_key *key1 = _key1; + const struct bpf_cgroup_storage_key *key2 = _key2; + + if (key1->cgroup_inode_id < key2->cgroup_inode_id) + return -1; + else if (key1->cgroup_inode_id > key2->cgroup_inode_id) + return 1; + else if (key1->attach_type < key2->attach_type) + return -1; + else if (key1->attach_type > key2->attach_type) + return 1; + } else { + const __u64 *cgroup_inode_id1 = _key1; + const __u64 *cgroup_inode_id2 = _key2; + + if (*cgroup_inode_id1 < *cgroup_inode_id2) + return -1; + else if (*cgroup_inode_id1 > *cgroup_inode_id2) + return 1; + } return 0; } -static struct bpf_cgroup_storage *cgroup_storage_lookup( - struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage_key *key, - bool locked) +struct bpf_cgroup_storage * +cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, + void *key, bool locked) { struct rb_root *root = &map->root; struct rb_node *node; @@ -61,7 +79,7 @@ static struct bpf_cgroup_storage *cgroup_storage_lookup( storage = container_of(node, struct bpf_cgroup_storage, node); - switch (bpf_cgroup_storage_key_cmp(key, &storage->key)) { + switch (bpf_cgroup_storage_key_cmp(map, key, &storage->key)) { case -1: node = node->rb_left; break; @@ -93,7 +111,7 @@ static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, this = container_of(*new, struct bpf_cgroup_storage, node); parent = *new; - switch (bpf_cgroup_storage_key_cmp(&storage->key, &this->key)) { + switch (bpf_cgroup_storage_key_cmp(map, &storage->key, &this->key)) { case -1: new = &((*new)->rb_left); break; @@ -111,10 +129,9 @@ static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, return 0; } -static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key) +static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; storage = cgroup_storage_lookup(map, key, false); @@ -124,17 +141,13 @@ static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key) return &READ_ONCE(storage->buf)->data[0]; } -static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, +static int cgroup_storage_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; - if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST))) - return -EINVAL; - - if (unlikely(flags & BPF_NOEXIST)) + if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST))) return -EINVAL; if (unlikely((flags & BPF_F_LOCK) && @@ -167,11 +180,10 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, return 0; } -int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key, +int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key, void *value) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; int cpu, off = 0; u32 size; @@ -197,11 +209,10 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key, return 0; } -int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key, +int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key, void *value, u64 map_flags) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; int cpu, off = 0; u32 size; @@ -232,12 +243,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key, return 0; } -static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, +static int cgroup_storage_get_next_key(struct bpf_map *_map, void *key, void *_next_key) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; - struct bpf_cgroup_storage_key *next = _next_key; struct bpf_cgroup_storage *storage; spin_lock_bh(&map->lock); @@ -250,17 +259,23 @@ static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, if (!storage) goto enoent; - storage = list_next_entry(storage, list); + storage = list_next_entry(storage, list_map); if (!storage) goto enoent; } else { storage = list_first_entry(&map->list, - struct bpf_cgroup_storage, list); + struct bpf_cgroup_storage, list_map); } spin_unlock_bh(&map->lock); - next->attach_type = storage->key.attach_type; - next->cgroup_inode_id = storage->key.cgroup_inode_id; + + if (attach_type_isolated(&map->map)) { + struct bpf_cgroup_storage_key *next = _next_key; + *next = storage->key; + } else { + __u64 *next = _next_key; + *next = storage->key.cgroup_inode_id; + } return 0; enoent: @@ -275,7 +290,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) struct bpf_map_memory mem; int ret; - if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) + if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) && + attr->key_size != sizeof(__u64)) return ERR_PTR(-EINVAL); if (attr->value_size == 0) @@ -318,6 +334,17 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) static void cgroup_storage_map_free(struct bpf_map *_map) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct list_head *storages = &map->list; + struct bpf_cgroup_storage *storage, *stmp; + + mutex_lock(&cgroup_mutex); + + list_for_each_entry_safe(storage, stmp, storages, list_map) { + bpf_cgroup_storage_unlink(storage); + bpf_cgroup_storage_free(storage); + } + + mutex_unlock(&cgroup_mutex); WARN_ON(!RB_EMPTY_ROOT(&map->root)); WARN_ON(!list_empty(&map->list)); @@ -335,49 +362,63 @@ static int cgroup_storage_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) { - struct btf_member *m; - u32 offset, size; - - /* Key is expected to be of struct bpf_cgroup_storage_key type, - * which is: - * struct bpf_cgroup_storage_key { - * __u64 cgroup_inode_id; - * __u32 attach_type; - * }; - */ + if (attach_type_isolated(map)) { + struct btf_member *m; + u32 offset, size; + + /* Key is expected to be of struct bpf_cgroup_storage_key type, + * which is: + * struct bpf_cgroup_storage_key { + * __u64 cgroup_inode_id; + * __u32 attach_type; + * }; + */ + + /* + * Key_type must be a structure with two fields. + */ + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || + BTF_INFO_VLEN(key_type->info) != 2) + return -EINVAL; + + /* + * The first field must be a 64 bit integer at 0 offset. + */ + m = (struct btf_member *)(key_type + 1); + size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id); + if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) + return -EINVAL; + + /* + * The second field must be a 32 bit integer at 64 bit offset. + */ + m++; + offset = offsetof(struct bpf_cgroup_storage_key, attach_type); + size = sizeof_field(struct bpf_cgroup_storage_key, attach_type); + if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) + return -EINVAL; + } else { + u32 int_data; - /* - * Key_type must be a structure with two fields. - */ - if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || - BTF_INFO_VLEN(key_type->info) != 2) - return -EINVAL; + /* + * Key is expected to be u64, which stores the cgroup_inode_id + */ - /* - * The first field must be a 64 bit integer at 0 offset. - */ - m = (struct btf_member *)(key_type + 1); - size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id); - if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) - return -EINVAL; + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + return -EINVAL; - /* - * The second field must be a 32 bit integer at 64 bit offset. - */ - m++; - offset = offsetof(struct bpf_cgroup_storage_key, attach_type); - size = sizeof_field(struct bpf_cgroup_storage_key, attach_type); - if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) - return -EINVAL; + int_data = *(u32 *)(key_type + 1); + if (BTF_INT_BITS(int_data) != 64 || BTF_INT_OFFSET(int_data)) + return -EINVAL; + } return 0; } -static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key, +static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; int cpu; @@ -409,6 +450,7 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key, rcu_read_unlock(); } +static int cgroup_storage_map_btf_id; const struct bpf_map_ops cgroup_storage_map_ops = { .map_alloc = cgroup_storage_map_alloc, .map_free = cgroup_storage_map_free, @@ -418,43 +460,20 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_delete_elem = cgroup_storage_delete_elem, .map_check_btf = cgroup_storage_check_btf, .map_seq_show_elem = cgroup_storage_seq_show_elem, + .map_btf_name = "bpf_cgroup_storage_map", + .map_btf_id = &cgroup_storage_map_btf_id, }; int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) { enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); - struct bpf_cgroup_storage_map *map = map_to_storage(_map); - int ret = -EBUSY; - - spin_lock_bh(&map->lock); - if (map->aux && map->aux != aux) - goto unlock; if (aux->cgroup_storage[stype] && aux->cgroup_storage[stype] != _map) - goto unlock; + return -EBUSY; - map->aux = aux; aux->cgroup_storage[stype] = _map; - ret = 0; -unlock: - spin_unlock_bh(&map->lock); - - return ret; -} - -void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *_map) -{ - enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); - struct bpf_cgroup_storage_map *map = map_to_storage(_map); - - spin_lock_bh(&map->lock); - if (map->aux == aux) { - WARN_ON(aux->cgroup_storage[stype] != _map); - map->aux = NULL; - aux->cgroup_storage[stype] = NULL; - } - spin_unlock_bh(&map->lock); + return 0; } static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) @@ -575,7 +594,8 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, spin_lock_bh(&map->lock); WARN_ON(cgroup_storage_insert(map, storage)); - list_add(&storage->list, &map->list); + list_add(&storage->list_map, &map->list); + list_add(&storage->list_cg, &cgroup->bpf.storages); spin_unlock_bh(&map->lock); } @@ -593,7 +613,8 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage) root = &map->root; rb_erase(&storage->node, root); - list_del(&storage->list); + list_del(&storage->list_map); + list_del(&storage->list_cg); spin_unlock_bh(&map->lock); } diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 65c236cf341e..44474bf3ab7a 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -543,7 +543,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) u64 cost = sizeof(*trie), cost_per_node; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); /* check sanity of attributes */ @@ -589,11 +589,6 @@ static void trie_free(struct bpf_map *map) struct lpm_trie_node __rcu **slot; struct lpm_trie_node *node; - /* Wait for outstanding programs to complete - * update/lookup/delete/get_next_key and free the trie. - */ - synchronize_rcu(); - /* Always start at the root and walk down to a node that has no * children. Then free that node, nullify its reference in the parent * and start over. @@ -735,6 +730,7 @@ static int trie_check_btf(const struct bpf_map *map, -EINVAL : 0; } +static int trie_map_btf_id; const struct bpf_map_ops trie_map_ops = { .map_alloc = trie_alloc, .map_free = trie_free, @@ -743,4 +739,6 @@ const struct bpf_map_ops trie_map_ops = { .map_update_elem = trie_update_elem, .map_delete_elem = trie_delete_elem, .map_check_btf = trie_check_btf, + .map_btf_name = "lpm_trie", + .map_btf_id = &trie_map_btf_id, }; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index b3c48d1533cb..17738c93bec8 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -60,7 +60,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; if (inner_map->ops == &array_map_ops) { - inner_map_meta->unpriv_array = inner_map->unpriv_array; + inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1; container_of(inner_map_meta, struct bpf_array, map)->index_mask = container_of(inner_map, struct bpf_array, map)->index_mask; } diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c new file mode 100644 index 000000000000..fbe1f557cb88 --- /dev/null +++ b/kernel/bpf/map_iter.c @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ +#include <linux/bpf.h> +#include <linux/fs.h> +#include <linux/filter.h> +#include <linux/kernel.h> +#include <linux/btf_ids.h> + +struct bpf_iter_seq_map_info { + u32 map_id; +}; + +static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_map_info *info = seq->private; + struct bpf_map *map; + + map = bpf_map_get_curr_or_next(&info->map_id); + if (!map) + return NULL; + + if (*pos == 0) + ++*pos; + return map; +} + +static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_map_info *info = seq->private; + + ++*pos; + ++info->map_id; + bpf_map_put((struct bpf_map *)v); + return bpf_map_get_curr_or_next(&info->map_id); +} + +struct bpf_iter__bpf_map { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_map *, map); +}; + +DEFINE_BPF_ITER_FUNC(bpf_map, struct bpf_iter_meta *meta, struct bpf_map *map) + +static int __bpf_map_seq_show(struct seq_file *seq, void *v, bool in_stop) +{ + struct bpf_iter__bpf_map ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + ctx.meta = &meta; + ctx.map = v; + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + return ret; +} + +static int bpf_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_map_seq_show(seq, v, false); +} + +static void bpf_map_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__bpf_map_seq_show(seq, v, true); + else + bpf_map_put((struct bpf_map *)v); +} + +static const struct seq_operations bpf_map_seq_ops = { + .start = bpf_map_seq_start, + .next = bpf_map_seq_next, + .stop = bpf_map_seq_stop, + .show = bpf_map_seq_show, +}; + +BTF_ID_LIST(btf_bpf_map_id) +BTF_ID(struct, bpf_map) + +static const struct bpf_iter_seq_info bpf_map_seq_info = { + .seq_ops = &bpf_map_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), +}; + +static struct bpf_iter_reg bpf_map_reg_info = { + .target = "bpf_map", + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_map, map), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &bpf_map_seq_info, +}; + +static int bpf_iter_check_map(struct bpf_prog *prog, + struct bpf_iter_aux_info *aux) +{ + u32 key_acc_size, value_acc_size, key_size, value_size; + struct bpf_map *map = aux->map; + bool is_percpu = false; + + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + is_percpu = true; + else if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) + return -EINVAL; + + key_acc_size = prog->aux->max_rdonly_access; + value_acc_size = prog->aux->max_rdwr_access; + key_size = map->key_size; + if (!is_percpu) + value_size = map->value_size; + else + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + + if (key_acc_size > key_size || value_acc_size > value_size) + return -EACCES; + + return 0; +} + +DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, + struct bpf_map *map, void *key, void *value) + +static const struct bpf_iter_reg bpf_map_elem_reg_info = { + .target = "bpf_map_elem", + .check_target = bpf_iter_check_map, + .req_linfo = BPF_ITER_LINK_MAP_FD, + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_map_elem, key), + PTR_TO_RDONLY_BUF_OR_NULL }, + { offsetof(struct bpf_iter__bpf_map_elem, value), + PTR_TO_RDWR_BUF_OR_NULL }, + }, +}; + +static int __init bpf_map_iter_init(void) +{ + int ret; + + bpf_map_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_map_id; + ret = bpf_iter_reg_target(&bpf_map_reg_info); + if (ret) + return ret; + + return bpf_iter_reg_target(&bpf_map_elem_reg_info); +} + +late_initcall(bpf_map_iter_init); diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c new file mode 100644 index 000000000000..542f275bf252 --- /dev/null +++ b/kernel/bpf/net_namespace.c @@ -0,0 +1,566 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> +#include <linux/filter.h> +#include <net/net_namespace.h> + +/* + * Functions to manage BPF programs attached to netns + */ + +struct bpf_netns_link { + struct bpf_link link; + enum bpf_attach_type type; + enum netns_bpf_attach_type netns_type; + + /* We don't hold a ref to net in order to auto-detach the link + * when netns is going away. Instead we rely on pernet + * pre_exit callback to clear this pointer. Must be accessed + * with netns_bpf_mutex held. + */ + struct net *net; + struct list_head node; /* node in list of links attached to net */ +}; + +/* Protects updates to netns_bpf */ +DEFINE_MUTEX(netns_bpf_mutex); + +static void netns_bpf_attach_type_unneed(enum netns_bpf_attach_type type) +{ + switch (type) { +#ifdef CONFIG_INET + case NETNS_BPF_SK_LOOKUP: + static_branch_dec(&bpf_sk_lookup_enabled); + break; +#endif + default: + break; + } +} + +static void netns_bpf_attach_type_need(enum netns_bpf_attach_type type) +{ + switch (type) { +#ifdef CONFIG_INET + case NETNS_BPF_SK_LOOKUP: + static_branch_inc(&bpf_sk_lookup_enabled); + break; +#endif + default: + break; + } +} + +/* Must be called with netns_bpf_mutex held. */ +static void netns_bpf_run_array_detach(struct net *net, + enum netns_bpf_attach_type type) +{ + struct bpf_prog_array *run_array; + + run_array = rcu_replace_pointer(net->bpf.run_array[type], NULL, + lockdep_is_held(&netns_bpf_mutex)); + bpf_prog_array_free(run_array); +} + +static int link_index(struct net *net, enum netns_bpf_attach_type type, + struct bpf_netns_link *link) +{ + struct bpf_netns_link *pos; + int i = 0; + + list_for_each_entry(pos, &net->bpf.links[type], node) { + if (pos == link) + return i; + i++; + } + return -ENOENT; +} + +static int link_count(struct net *net, enum netns_bpf_attach_type type) +{ + struct list_head *pos; + int i = 0; + + list_for_each(pos, &net->bpf.links[type]) + i++; + return i; +} + +static void fill_prog_array(struct net *net, enum netns_bpf_attach_type type, + struct bpf_prog_array *prog_array) +{ + struct bpf_netns_link *pos; + unsigned int i = 0; + + list_for_each_entry(pos, &net->bpf.links[type], node) { + prog_array->items[i].prog = pos->link.prog; + i++; + } +} + +static void bpf_netns_link_release(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + enum netns_bpf_attach_type type = net_link->netns_type; + struct bpf_prog_array *old_array, *new_array; + struct net *net; + int cnt, idx; + + mutex_lock(&netns_bpf_mutex); + + /* We can race with cleanup_net, but if we see a non-NULL + * struct net pointer, pre_exit has not run yet and wait for + * netns_bpf_mutex. + */ + net = net_link->net; + if (!net) + goto out_unlock; + + /* Mark attach point as unused */ + netns_bpf_attach_type_unneed(type); + + /* Remember link position in case of safe delete */ + idx = link_index(net, type, net_link); + list_del(&net_link->node); + + cnt = link_count(net, type); + if (!cnt) { + netns_bpf_run_array_detach(net, type); + goto out_unlock; + } + + old_array = rcu_dereference_protected(net->bpf.run_array[type], + lockdep_is_held(&netns_bpf_mutex)); + new_array = bpf_prog_array_alloc(cnt, GFP_KERNEL); + if (!new_array) { + WARN_ON(bpf_prog_array_delete_safe_at(old_array, idx)); + goto out_unlock; + } + fill_prog_array(net, type, new_array); + rcu_assign_pointer(net->bpf.run_array[type], new_array); + bpf_prog_array_free(old_array); + +out_unlock: + net_link->net = NULL; + mutex_unlock(&netns_bpf_mutex); +} + +static int bpf_netns_link_detach(struct bpf_link *link) +{ + bpf_netns_link_release(link); + return 0; +} + +static void bpf_netns_link_dealloc(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + + kfree(net_link); +} + +static int bpf_netns_link_update_prog(struct bpf_link *link, + struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + enum netns_bpf_attach_type type = net_link->netns_type; + struct bpf_prog_array *run_array; + struct net *net; + int idx, ret; + + if (old_prog && old_prog != link->prog) + return -EPERM; + if (new_prog->type != link->prog->type) + return -EINVAL; + + mutex_lock(&netns_bpf_mutex); + + net = net_link->net; + if (!net || !check_net(net)) { + /* Link auto-detached or netns dying */ + ret = -ENOLINK; + goto out_unlock; + } + + run_array = rcu_dereference_protected(net->bpf.run_array[type], + lockdep_is_held(&netns_bpf_mutex)); + idx = link_index(net, type, net_link); + ret = bpf_prog_array_update_at(run_array, idx, new_prog); + if (ret) + goto out_unlock; + + old_prog = xchg(&link->prog, new_prog); + bpf_prog_put(old_prog); + +out_unlock: + mutex_unlock(&netns_bpf_mutex); + return ret; +} + +static int bpf_netns_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + unsigned int inum = 0; + struct net *net; + + mutex_lock(&netns_bpf_mutex); + net = net_link->net; + if (net && check_net(net)) + inum = net->ns.inum; + mutex_unlock(&netns_bpf_mutex); + + info->netns.netns_ino = inum; + info->netns.attach_type = net_link->type; + return 0; +} + +static void bpf_netns_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_link_info info = {}; + + bpf_netns_link_fill_info(link, &info); + seq_printf(seq, + "netns_ino:\t%u\n" + "attach_type:\t%u\n", + info.netns.netns_ino, + info.netns.attach_type); +} + +static const struct bpf_link_ops bpf_netns_link_ops = { + .release = bpf_netns_link_release, + .dealloc = bpf_netns_link_dealloc, + .detach = bpf_netns_link_detach, + .update_prog = bpf_netns_link_update_prog, + .fill_link_info = bpf_netns_link_fill_info, + .show_fdinfo = bpf_netns_link_show_fdinfo, +}; + +/* Must be called with netns_bpf_mutex held. */ +static int __netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr, + struct net *net, + enum netns_bpf_attach_type type) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + struct bpf_prog_array *run_array; + u32 prog_cnt = 0, flags = 0; + + run_array = rcu_dereference_protected(net->bpf.run_array[type], + lockdep_is_held(&netns_bpf_mutex)); + if (run_array) + prog_cnt = bpf_prog_array_length(run_array); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) + return -EFAULT; + if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) + return 0; + + return bpf_prog_array_copy_to_user(run_array, prog_ids, + attr->query.prog_cnt); +} + +int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + enum netns_bpf_attach_type type; + struct net *net; + int ret; + + if (attr->query.query_flags) + return -EINVAL; + + type = to_netns_bpf_attach_type(attr->query.attach_type); + if (type < 0) + return -EINVAL; + + net = get_net_ns_by_fd(attr->query.target_fd); + if (IS_ERR(net)) + return PTR_ERR(net); + + mutex_lock(&netns_bpf_mutex); + ret = __netns_bpf_prog_query(attr, uattr, net, type); + mutex_unlock(&netns_bpf_mutex); + + put_net(net); + return ret; +} + +int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_prog_array *run_array; + enum netns_bpf_attach_type type; + struct bpf_prog *attached; + struct net *net; + int ret; + + if (attr->target_fd || attr->attach_flags || attr->replace_bpf_fd) + return -EINVAL; + + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + net = current->nsproxy->net_ns; + mutex_lock(&netns_bpf_mutex); + + /* Attaching prog directly is not compatible with links */ + if (!list_empty(&net->bpf.links[type])) { + ret = -EEXIST; + goto out_unlock; + } + + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + ret = flow_dissector_bpf_prog_attach_check(net, prog); + break; + default: + ret = -EINVAL; + break; + } + if (ret) + goto out_unlock; + + attached = net->bpf.progs[type]; + if (attached == prog) { + /* The same program cannot be attached twice */ + ret = -EINVAL; + goto out_unlock; + } + + run_array = rcu_dereference_protected(net->bpf.run_array[type], + lockdep_is_held(&netns_bpf_mutex)); + if (run_array) { + WRITE_ONCE(run_array->items[0].prog, prog); + } else { + run_array = bpf_prog_array_alloc(1, GFP_KERNEL); + if (!run_array) { + ret = -ENOMEM; + goto out_unlock; + } + run_array->items[0].prog = prog; + rcu_assign_pointer(net->bpf.run_array[type], run_array); + } + + net->bpf.progs[type] = prog; + if (attached) + bpf_prog_put(attached); + +out_unlock: + mutex_unlock(&netns_bpf_mutex); + + return ret; +} + +/* Must be called with netns_bpf_mutex held. */ +static int __netns_bpf_prog_detach(struct net *net, + enum netns_bpf_attach_type type, + struct bpf_prog *old) +{ + struct bpf_prog *attached; + + /* Progs attached via links cannot be detached */ + if (!list_empty(&net->bpf.links[type])) + return -EINVAL; + + attached = net->bpf.progs[type]; + if (!attached || attached != old) + return -ENOENT; + netns_bpf_run_array_detach(net, type); + net->bpf.progs[type] = NULL; + bpf_prog_put(attached); + return 0; +} + +int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) +{ + enum netns_bpf_attach_type type; + struct bpf_prog *prog; + int ret; + + if (attr->target_fd) + return -EINVAL; + + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + mutex_lock(&netns_bpf_mutex); + ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type, prog); + mutex_unlock(&netns_bpf_mutex); + + bpf_prog_put(prog); + + return ret; +} + +static int netns_bpf_max_progs(enum netns_bpf_attach_type type) +{ + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + return 1; + case NETNS_BPF_SK_LOOKUP: + return 64; + default: + return 0; + } +} + +static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, + enum netns_bpf_attach_type type) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + struct bpf_prog_array *run_array; + int cnt, err; + + mutex_lock(&netns_bpf_mutex); + + cnt = link_count(net, type); + if (cnt >= netns_bpf_max_progs(type)) { + err = -E2BIG; + goto out_unlock; + } + /* Links are not compatible with attaching prog directly */ + if (net->bpf.progs[type]) { + err = -EEXIST; + goto out_unlock; + } + + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + err = flow_dissector_bpf_prog_attach_check(net, link->prog); + break; + case NETNS_BPF_SK_LOOKUP: + err = 0; /* nothing to check */ + break; + default: + err = -EINVAL; + break; + } + if (err) + goto out_unlock; + + run_array = bpf_prog_array_alloc(cnt + 1, GFP_KERNEL); + if (!run_array) { + err = -ENOMEM; + goto out_unlock; + } + + list_add_tail(&net_link->node, &net->bpf.links[type]); + + fill_prog_array(net, type, run_array); + run_array = rcu_replace_pointer(net->bpf.run_array[type], run_array, + lockdep_is_held(&netns_bpf_mutex)); + bpf_prog_array_free(run_array); + + /* Mark attach point as used */ + netns_bpf_attach_type_need(type); + +out_unlock: + mutex_unlock(&netns_bpf_mutex); + return err; +} + +int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ + enum netns_bpf_attach_type netns_type; + struct bpf_link_primer link_primer; + struct bpf_netns_link *net_link; + enum bpf_attach_type type; + struct net *net; + int err; + + if (attr->link_create.flags) + return -EINVAL; + + type = attr->link_create.attach_type; + netns_type = to_netns_bpf_attach_type(type); + if (netns_type < 0) + return -EINVAL; + + net = get_net_ns_by_fd(attr->link_create.target_fd); + if (IS_ERR(net)) + return PTR_ERR(net); + + net_link = kzalloc(sizeof(*net_link), GFP_USER); + if (!net_link) { + err = -ENOMEM; + goto out_put_net; + } + bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS, + &bpf_netns_link_ops, prog); + net_link->net = net; + net_link->type = type; + net_link->netns_type = netns_type; + + err = bpf_link_prime(&net_link->link, &link_primer); + if (err) { + kfree(net_link); + goto out_put_net; + } + + err = netns_bpf_link_attach(net, &net_link->link, netns_type); + if (err) { + bpf_link_cleanup(&link_primer); + goto out_put_net; + } + + put_net(net); + return bpf_link_settle(&link_primer); + +out_put_net: + put_net(net); + return err; +} + +static int __net_init netns_bpf_pernet_init(struct net *net) +{ + int type; + + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) + INIT_LIST_HEAD(&net->bpf.links[type]); + + return 0; +} + +static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) +{ + enum netns_bpf_attach_type type; + struct bpf_netns_link *net_link; + + mutex_lock(&netns_bpf_mutex); + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { + netns_bpf_run_array_detach(net, type); + list_for_each_entry(net_link, &net->bpf.links[type], node) { + net_link->net = NULL; /* auto-detach link */ + netns_bpf_attach_type_unneed(type); + } + if (net->bpf.progs[type]) + bpf_prog_put(net->bpf.progs[type]); + } + mutex_unlock(&netns_bpf_mutex); +} + +static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { + .init = netns_bpf_pernet_init, + .pre_exit = netns_bpf_pernet_pre_exit, +}; + +static int __init netns_bpf_init(void) +{ + return register_pernet_subsys(&netns_bpf_pernet_ops); +} + +subsys_initcall(netns_bpf_init); diff --git a/kernel/bpf/prog_iter.c b/kernel/bpf/prog_iter.c new file mode 100644 index 000000000000..53a73c841c13 --- /dev/null +++ b/kernel/bpf/prog_iter.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ +#include <linux/bpf.h> +#include <linux/fs.h> +#include <linux/filter.h> +#include <linux/kernel.h> +#include <linux/btf_ids.h> + +struct bpf_iter_seq_prog_info { + u32 prog_id; +}; + +static void *bpf_prog_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_prog_info *info = seq->private; + struct bpf_prog *prog; + + prog = bpf_prog_get_curr_or_next(&info->prog_id); + if (!prog) + return NULL; + + if (*pos == 0) + ++*pos; + return prog; +} + +static void *bpf_prog_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_prog_info *info = seq->private; + + ++*pos; + ++info->prog_id; + bpf_prog_put((struct bpf_prog *)v); + return bpf_prog_get_curr_or_next(&info->prog_id); +} + +struct bpf_iter__bpf_prog { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_prog *, prog); +}; + +DEFINE_BPF_ITER_FUNC(bpf_prog, struct bpf_iter_meta *meta, struct bpf_prog *prog) + +static int __bpf_prog_seq_show(struct seq_file *seq, void *v, bool in_stop) +{ + struct bpf_iter__bpf_prog ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + ctx.meta = &meta; + ctx.prog = v; + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + return ret; +} + +static int bpf_prog_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_prog_seq_show(seq, v, false); +} + +static void bpf_prog_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__bpf_prog_seq_show(seq, v, true); + else + bpf_prog_put((struct bpf_prog *)v); +} + +static const struct seq_operations bpf_prog_seq_ops = { + .start = bpf_prog_seq_start, + .next = bpf_prog_seq_next, + .stop = bpf_prog_seq_stop, + .show = bpf_prog_seq_show, +}; + +BTF_ID_LIST(btf_bpf_prog_id) +BTF_ID(struct, bpf_prog) + +static const struct bpf_iter_seq_info bpf_prog_seq_info = { + .seq_ops = &bpf_prog_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_prog_info), +}; + +static struct bpf_iter_reg bpf_prog_reg_info = { + .target = "bpf_prog", + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_prog, prog), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &bpf_prog_seq_info, +}; + +static int __init bpf_prog_iter_init(void) +{ + bpf_prog_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_prog_id; + return bpf_iter_reg_target(&bpf_prog_reg_info); +} + +late_initcall(bpf_prog_iter_init); diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index f697647ceb54..44184f82916a 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -19,7 +19,7 @@ struct bpf_queue_stack { u32 head, tail; u32 size; /* max_entries + 1 */ - char elements[0] __aligned(8); + char elements[] __aligned(8); }; static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map) @@ -45,7 +45,7 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; /* check sanity of attributes */ @@ -101,13 +101,6 @@ static void queue_stack_map_free(struct bpf_map *map) { struct bpf_queue_stack *qs = bpf_queue_stack(map); - /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, - * so the programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding critical sections in - * these programs to complete - */ - synchronize_rcu(); - bpf_map_area_free(qs); } @@ -262,6 +255,7 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, return -EINVAL; } +static int queue_map_btf_id; const struct bpf_map_ops queue_map_ops = { .map_alloc_check = queue_stack_map_alloc_check, .map_alloc = queue_stack_map_alloc, @@ -273,8 +267,11 @@ const struct bpf_map_ops queue_map_ops = { .map_pop_elem = queue_map_pop_elem, .map_peek_elem = queue_map_peek_elem, .map_get_next_key = queue_stack_map_get_next_key, + .map_btf_name = "bpf_queue_stack", + .map_btf_id = &queue_map_btf_id, }; +static int stack_map_btf_id; const struct bpf_map_ops stack_map_ops = { .map_alloc_check = queue_stack_map_alloc_check, .map_alloc = queue_stack_map_alloc, @@ -286,4 +283,6 @@ const struct bpf_map_ops stack_map_ops = { .map_pop_elem = stack_map_pop_elem, .map_peek_elem = stack_map_peek_elem, .map_get_next_key = queue_stack_map_get_next_key, + .map_btf_name = "bpf_queue_stack", + .map_btf_id = &stack_map_btf_id, }; diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 01badd3eda7a..90b29c5b1da7 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -20,11 +20,14 @@ static struct reuseport_array *reuseport_array(struct bpf_map *map) /* The caller must hold the reuseport_lock */ void bpf_sk_reuseport_detach(struct sock *sk) { - struct sock __rcu **socks; + uintptr_t sk_user_data; write_lock_bh(&sk->sk_callback_lock); - socks = sk->sk_user_data; - if (socks) { + sk_user_data = (uintptr_t)sk->sk_user_data; + if (sk_user_data & SK_USER_DATA_BPF) { + struct sock __rcu **socks; + + socks = (void *)(sk_user_data & SK_USER_DATA_PTRMASK); WRITE_ONCE(sk->sk_user_data, NULL); /* * Do not move this NULL assignment outside of @@ -96,8 +99,6 @@ static void reuseport_array_free(struct bpf_map *map) struct sock *sk; u32 i; - synchronize_rcu(); - /* * ops->map_*_elem() will not be able to access this * array now. Hence, this function only races with @@ -154,7 +155,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) struct bpf_map_memory mem; u64 array_size; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); array_size = sizeof(*array); @@ -252,6 +253,7 @@ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, struct sock *free_osk = NULL, *osk, *nsk; struct sock_reuseport *reuse; u32 index = *(u32 *)key; + uintptr_t sk_user_data; struct socket *socket; int err, fd; @@ -305,7 +307,9 @@ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, if (err) goto put_file_unlock; - WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]); + sk_user_data = (uintptr_t)&array->ptrs[index] | SK_USER_DATA_NOCOPY | + SK_USER_DATA_BPF; + WRITE_ONCE(nsk->sk_user_data, (void *)sk_user_data); rcu_assign_pointer(array->ptrs[index], nsk); free_osk = osk; err = 0; @@ -345,6 +349,7 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key, return 0; } +static int reuseport_array_map_btf_id; const struct bpf_map_ops reuseport_array_ops = { .map_alloc_check = reuseport_array_alloc_check, .map_alloc = reuseport_array_alloc, @@ -352,4 +357,6 @@ const struct bpf_map_ops reuseport_array_ops = { .map_lookup_elem = reuseport_array_lookup_elem, .map_get_next_key = reuseport_array_get_next_key, .map_delete_elem = reuseport_array_delete_elem, + .map_btf_name = "reuseport_array", + .map_btf_id = &reuseport_array_map_btf_id, }; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c new file mode 100644 index 000000000000..002f8a5c9e51 --- /dev/null +++ b/kernel/bpf/ringbuf.c @@ -0,0 +1,495 @@ +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/err.h> +#include <linux/irq_work.h> +#include <linux/slab.h> +#include <linux/filter.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <linux/wait.h> +#include <linux/poll.h> +#include <uapi/linux/btf.h> + +#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) + +/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */ +#define RINGBUF_PGOFF \ + (offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT) +/* consumer page and producer page */ +#define RINGBUF_POS_PAGES 2 + +#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4) + +/* Maximum size of ring buffer area is limited by 32-bit page offset within + * record header, counted in pages. Reserve 8 bits for extensibility, and take + * into account few extra pages for consumer/producer pages and + * non-mmap()'able parts. This gives 64GB limit, which seems plenty for single + * ring buffer. + */ +#define RINGBUF_MAX_DATA_SZ \ + (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE) + +struct bpf_ringbuf { + wait_queue_head_t waitq; + struct irq_work work; + u64 mask; + struct page **pages; + int nr_pages; + spinlock_t spinlock ____cacheline_aligned_in_smp; + /* Consumer and producer counters are put into separate pages to allow + * mapping consumer page as r/w, but restrict producer page to r/o. + * This protects producer position from being modified by user-space + * application and ruining in-kernel position tracking. + */ + unsigned long consumer_pos __aligned(PAGE_SIZE); + unsigned long producer_pos __aligned(PAGE_SIZE); + char data[] __aligned(PAGE_SIZE); +}; + +struct bpf_ringbuf_map { + struct bpf_map map; + struct bpf_map_memory memory; + struct bpf_ringbuf *rb; +}; + +/* 8-byte ring buffer record header structure */ +struct bpf_ringbuf_hdr { + u32 len; + u32 pg_off; +}; + +static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) +{ + const gfp_t flags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | + __GFP_ZERO; + int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES; + int nr_data_pages = data_sz >> PAGE_SHIFT; + int nr_pages = nr_meta_pages + nr_data_pages; + struct page **pages, *page; + struct bpf_ringbuf *rb; + size_t array_size; + int i; + + /* Each data page is mapped twice to allow "virtual" + * continuous read of samples wrapping around the end of ring + * buffer area: + * ------------------------------------------------------ + * | meta pages | real data pages | same data pages | + * ------------------------------------------------------ + * | | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 | + * ------------------------------------------------------ + * | | TA DA | TA DA | + * ------------------------------------------------------ + * ^^^^^^^ + * | + * Here, no need to worry about special handling of wrapped-around + * data due to double-mapped data pages. This works both in kernel and + * when mmap()'ed in user-space, simplifying both kernel and + * user-space implementations significantly. + */ + array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages); + if (array_size > PAGE_SIZE) + pages = vmalloc_node(array_size, numa_node); + else + pages = kmalloc_node(array_size, flags, numa_node); + if (!pages) + return NULL; + + for (i = 0; i < nr_pages; i++) { + page = alloc_pages_node(numa_node, flags, 0); + if (!page) { + nr_pages = i; + goto err_free_pages; + } + pages[i] = page; + if (i >= nr_meta_pages) + pages[nr_data_pages + i] = page; + } + + rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, + VM_ALLOC | VM_USERMAP, PAGE_KERNEL); + if (rb) { + rb->pages = pages; + rb->nr_pages = nr_pages; + return rb; + } + +err_free_pages: + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kvfree(pages); + return NULL; +} + +static void bpf_ringbuf_notify(struct irq_work *work) +{ + struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work); + + wake_up_all(&rb->waitq); +} + +static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) +{ + struct bpf_ringbuf *rb; + + rb = bpf_ringbuf_area_alloc(data_sz, numa_node); + if (!rb) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&rb->spinlock); + init_waitqueue_head(&rb->waitq); + init_irq_work(&rb->work, bpf_ringbuf_notify); + + rb->mask = data_sz - 1; + rb->consumer_pos = 0; + rb->producer_pos = 0; + + return rb; +} + +static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) +{ + struct bpf_ringbuf_map *rb_map; + u64 cost; + int err; + + if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + + if (attr->key_size || attr->value_size || + !is_power_of_2(attr->max_entries) || + !PAGE_ALIGNED(attr->max_entries)) + return ERR_PTR(-EINVAL); + +#ifdef CONFIG_64BIT + /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */ + if (attr->max_entries > RINGBUF_MAX_DATA_SZ) + return ERR_PTR(-E2BIG); +#endif + + rb_map = kzalloc(sizeof(*rb_map), GFP_USER); + if (!rb_map) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&rb_map->map, attr); + + cost = sizeof(struct bpf_ringbuf_map) + + sizeof(struct bpf_ringbuf) + + attr->max_entries; + err = bpf_map_charge_init(&rb_map->map.memory, cost); + if (err) + goto err_free_map; + + rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); + if (IS_ERR(rb_map->rb)) { + err = PTR_ERR(rb_map->rb); + goto err_uncharge; + } + + return &rb_map->map; + +err_uncharge: + bpf_map_charge_finish(&rb_map->map.memory); +err_free_map: + kfree(rb_map); + return ERR_PTR(err); +} + +static void bpf_ringbuf_free(struct bpf_ringbuf *rb) +{ + /* copy pages pointer and nr_pages to local variable, as we are going + * to unmap rb itself with vunmap() below + */ + struct page **pages = rb->pages; + int i, nr_pages = rb->nr_pages; + + vunmap(rb); + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kvfree(pages); +} + +static void ringbuf_map_free(struct bpf_map *map) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + bpf_ringbuf_free(rb_map->rb); + kfree(rb_map); +} + +static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key) +{ + return ERR_PTR(-ENOTSUPP); +} + +static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 flags) +{ + return -ENOTSUPP; +} + +static int ringbuf_map_delete_elem(struct bpf_map *map, void *key) +{ + return -ENOTSUPP; +} + +static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -ENOTSUPP; +} + +static size_t bpf_ringbuf_mmap_page_cnt(const struct bpf_ringbuf *rb) +{ + size_t data_pages = (rb->mask + 1) >> PAGE_SHIFT; + + /* consumer page + producer page + 2 x data pages */ + return RINGBUF_POS_PAGES + 2 * data_pages; +} + +static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_ringbuf_map *rb_map; + size_t mmap_sz; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + mmap_sz = bpf_ringbuf_mmap_page_cnt(rb_map->rb) << PAGE_SHIFT; + + if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > mmap_sz) + return -EINVAL; + + return remap_vmalloc_range(vma, rb_map->rb, + vma->vm_pgoff + RINGBUF_PGOFF); +} + +static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) +{ + unsigned long cons_pos, prod_pos; + + cons_pos = smp_load_acquire(&rb->consumer_pos); + prod_pos = smp_load_acquire(&rb->producer_pos); + return prod_pos - cons_pos; +} + +static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + poll_wait(filp, &rb_map->rb->waitq, pts); + + if (ringbuf_avail_data_sz(rb_map->rb)) + return EPOLLIN | EPOLLRDNORM; + return 0; +} + +static int ringbuf_map_btf_id; +const struct bpf_map_ops ringbuf_map_ops = { + .map_alloc = ringbuf_map_alloc, + .map_free = ringbuf_map_free, + .map_mmap = ringbuf_map_mmap, + .map_poll = ringbuf_map_poll, + .map_lookup_elem = ringbuf_map_lookup_elem, + .map_update_elem = ringbuf_map_update_elem, + .map_delete_elem = ringbuf_map_delete_elem, + .map_get_next_key = ringbuf_map_get_next_key, + .map_btf_name = "bpf_ringbuf_map", + .map_btf_id = &ringbuf_map_btf_id, +}; + +/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, + * calculate offset from record metadata to ring buffer in pages, rounded + * down. This page offset is stored as part of record metadata and allows to + * restore struct bpf_ringbuf * from record pointer. This page offset is + * stored at offset 4 of record metadata header. + */ +static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb, + struct bpf_ringbuf_hdr *hdr) +{ + return ((void *)hdr - (void *)rb) >> PAGE_SHIFT; +} + +/* Given pointer to ring buffer record header, restore pointer to struct + * bpf_ringbuf itself by using page offset stored at offset 4 + */ +static struct bpf_ringbuf * +bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) +{ + unsigned long addr = (unsigned long)(void *)hdr; + unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT; + + return (void*)((addr & PAGE_MASK) - off); +} + +static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) +{ + unsigned long cons_pos, prod_pos, new_prod_pos, flags; + u32 len, pg_off; + struct bpf_ringbuf_hdr *hdr; + + if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) + return NULL; + + len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); + cons_pos = smp_load_acquire(&rb->consumer_pos); + + if (in_nmi()) { + if (!spin_trylock_irqsave(&rb->spinlock, flags)) + return NULL; + } else { + spin_lock_irqsave(&rb->spinlock, flags); + } + + prod_pos = rb->producer_pos; + new_prod_pos = prod_pos + len; + + /* check for out of ringbuf space by ensuring producer position + * doesn't advance more than (ringbuf_size - 1) ahead + */ + if (new_prod_pos - cons_pos > rb->mask) { + spin_unlock_irqrestore(&rb->spinlock, flags); + return NULL; + } + + hdr = (void *)rb->data + (prod_pos & rb->mask); + pg_off = bpf_ringbuf_rec_pg_off(rb, hdr); + hdr->len = size | BPF_RINGBUF_BUSY_BIT; + hdr->pg_off = pg_off; + + /* pairs with consumer's smp_load_acquire() */ + smp_store_release(&rb->producer_pos, new_prod_pos); + + spin_unlock_irqrestore(&rb->spinlock, flags); + + return (void *)hdr + BPF_RINGBUF_HDR_SZ; +} + +BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags) +{ + struct bpf_ringbuf_map *rb_map; + + if (unlikely(flags)) + return 0; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size); +} + +const struct bpf_func_proto bpf_ringbuf_reserve_proto = { + .func = bpf_ringbuf_reserve, + .ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard) +{ + unsigned long rec_pos, cons_pos; + struct bpf_ringbuf_hdr *hdr; + struct bpf_ringbuf *rb; + u32 new_len; + + hdr = sample - BPF_RINGBUF_HDR_SZ; + rb = bpf_ringbuf_restore_from_rec(hdr); + new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT; + if (discard) + new_len |= BPF_RINGBUF_DISCARD_BIT; + + /* update record header with correct final size prefix */ + xchg(&hdr->len, new_len); + + /* if consumer caught up and is waiting for our record, notify about + * new data availability + */ + rec_pos = (void *)hdr - (void *)rb->data; + cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask; + + if (flags & BPF_RB_FORCE_WAKEUP) + irq_work_queue(&rb->work); + else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP)) + irq_work_queue(&rb->work); +} + +BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags) +{ + bpf_ringbuf_commit(sample, flags, false /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_submit_proto = { + .func = bpf_ringbuf_submit, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags) +{ + bpf_ringbuf_commit(sample, flags, true /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_discard_proto = { + .func = bpf_ringbuf_discard, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size, + u64, flags) +{ + struct bpf_ringbuf_map *rb_map; + void *rec; + + if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP))) + return -EINVAL; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + rec = __bpf_ringbuf_reserve(rb_map->rb, size); + if (!rec) + return -EAGAIN; + + memcpy(rec, data, size); + bpf_ringbuf_commit(rec, flags, false /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_output_proto = { + .func = bpf_ringbuf_output, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) +{ + struct bpf_ringbuf *rb; + + rb = container_of(map, struct bpf_ringbuf_map, map)->rb; + + switch (flags) { + case BPF_RB_AVAIL_DATA: + return ringbuf_avail_data_sz(rb); + case BPF_RB_RING_SIZE: + return rb->mask + 1; + case BPF_RB_CONS_POS: + return smp_load_acquire(&rb->consumer_pos); + case BPF_RB_PROD_POS: + return smp_load_acquire(&rb->producer_pos); + default: + return 0; + } +} + +const struct bpf_func_proto bpf_ringbuf_query_proto = { + .func = bpf_ringbuf_query, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index db76339fe358..4fd830a62be2 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -4,11 +4,13 @@ #include <linux/bpf.h> #include <linux/jhash.h> #include <linux/filter.h> +#include <linux/kernel.h> #include <linux/stacktrace.h> #include <linux/perf_event.h> #include <linux/elf.h> #include <linux/pagemap.h> #include <linux/irq_work.h> +#include <linux/btf_ids.h> #include "percpu_freelist.h" #define STACK_CREATE_FLAG_MASK \ @@ -33,7 +35,7 @@ struct bpf_stack_map { /* irq_work to run up_read() for build_id lookup in nmi context */ struct stack_map_irq_work { struct irq_work irq_work; - struct rw_semaphore *sem; + struct mm_struct *mm; }; static void do_up_read(struct irq_work *entry) @@ -44,8 +46,7 @@ static void do_up_read(struct irq_work *entry) return; work = container_of(entry, struct stack_map_irq_work, irq_work); - up_read_non_owner(work->sem); - work->sem = NULL; + mmap_read_unlock_non_owner(work->mm); } static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work); @@ -93,7 +94,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) u64 cost, n_buckets; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) @@ -317,7 +318,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, * with build_id. */ if (!user || !current || !current->mm || irq_work_busy || - down_read_trylock(¤t->mm->mmap_sem) == 0) { + !mmap_read_trylock_non_owner(current->mm)) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; @@ -342,24 +343,55 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } if (!work) { - up_read(¤t->mm->mmap_sem); + mmap_read_unlock_non_owner(current->mm); } else { - work->sem = ¤t->mm->mmap_sem; + work->mm = current->mm; irq_work_queue(&work->irq_work); - /* - * The irq_work will release the mmap_sem with - * up_read_non_owner(). The rwsem_release() is called - * here to release the lock from lockdep's perspective. - */ - rwsem_release(¤t->mm->mmap_sem.dep_map, _RET_IP_); } } -BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, - u64, flags) +static struct perf_callchain_entry * +get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) +{ +#ifdef CONFIG_STACKTRACE + struct perf_callchain_entry *entry; + int rctx; + + entry = get_callchain_entry(&rctx); + + if (!entry) + return NULL; + + entry->nr = init_nr + + stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr), + sysctl_perf_event_max_stack - init_nr, 0); + + /* stack_trace_save_tsk() works on unsigned long array, while + * perf_callchain_entry uses u64 array. For 32-bit systems, it is + * necessary to fix this mismatch. + */ + if (__BITS_PER_LONG != 64) { + unsigned long *from = (unsigned long *) entry->ip; + u64 *to = entry->ip; + int i; + + /* copy data from the end to avoid using extra buffer */ + for (i = entry->nr - 1; i >= (int)init_nr; i--) + to[i] = (u64)(from[i]); + } + + put_callchain_entry(rctx); + + return entry; +#else /* CONFIG_STACKTRACE */ + return NULL; +#endif +} + +static long __bpf_get_stackid(struct bpf_map *map, + struct perf_callchain_entry *trace, u64 flags) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); - struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; u32 max_depth = map->value_size / stack_map_data_size(map); /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ @@ -367,21 +399,9 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u32 skip = flags & BPF_F_SKIP_FIELD_MASK; u32 hash, id, trace_nr, trace_len; bool user = flags & BPF_F_USER_STACK; - bool kernel = !user; u64 *ips; bool hash_matches; - if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | - BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) - return -EINVAL; - - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, false, false); - - if (unlikely(!trace)) - /* couldn't fetch the stack trace */ - return -EFAULT; - /* get_perf_callchain() guarantees that trace->nr >= init_nr * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth */ @@ -446,6 +466,30 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, return id; } +BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags) +{ + u32 max_depth = map->value_size / stack_map_data_size(map); + /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ + u32 init_nr = sysctl_perf_event_max_stack - max_depth; + bool user = flags & BPF_F_USER_STACK; + struct perf_callchain_entry *trace; + bool kernel = !user; + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) + return -EINVAL; + + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, false, false); + + if (unlikely(!trace)) + /* couldn't fetch the stack trace */ + return -EFAULT; + + return __bpf_get_stackid(map, trace, flags); +} + const struct bpf_func_proto bpf_get_stackid_proto = { .func = bpf_get_stackid, .gpl_only = true, @@ -455,8 +499,78 @@ const struct bpf_func_proto bpf_get_stackid_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, - u64, flags) +static __u64 count_kernel_ip(struct perf_callchain_entry *trace) +{ + __u64 nr_kernel = 0; + + while (nr_kernel < trace->nr) { + if (trace->ip[nr_kernel] == PERF_CONTEXT_USER) + break; + nr_kernel++; + } + return nr_kernel; +} + +BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, + struct bpf_map *, map, u64, flags) +{ + struct perf_event *event = ctx->event; + struct perf_callchain_entry *trace; + bool kernel, user; + __u64 nr_kernel; + int ret; + + /* perf_sample_data doesn't have callchain, use bpf_get_stackid */ + if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + return bpf_get_stackid((unsigned long)(ctx->regs), + (unsigned long) map, flags, 0, 0); + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) + return -EINVAL; + + user = flags & BPF_F_USER_STACK; + kernel = !user; + + trace = ctx->data->callchain; + if (unlikely(!trace)) + return -EFAULT; + + nr_kernel = count_kernel_ip(trace); + + if (kernel) { + __u64 nr = trace->nr; + + trace->nr = nr_kernel; + ret = __bpf_get_stackid(map, trace, flags); + + /* restore nr */ + trace->nr = nr; + } else { /* user */ + u64 skip = flags & BPF_F_SKIP_FIELD_MASK; + + skip += nr_kernel; + if (skip > BPF_F_SKIP_FIELD_MASK) + return -EFAULT; + + flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; + ret = __bpf_get_stackid(map, trace, flags); + } + return ret; +} + +const struct bpf_func_proto bpf_get_stackid_proto_pe = { + .func = bpf_get_stackid_pe, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, + struct perf_callchain_entry *trace_in, + void *buf, u32 size, u64 flags) { u32 init_nr, trace_nr, copy_len, elem_size, num_elem; bool user_build_id = flags & BPF_F_USER_BUILD_ID; @@ -478,13 +592,24 @@ BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, if (unlikely(size % elem_size)) goto clear; + /* cannot get valid user stack for task without user_mode regs */ + if (task && user && !user_mode(regs)) + goto err_fault; + num_elem = size / elem_size; if (sysctl_perf_event_max_stack < num_elem) init_nr = 0; else init_nr = sysctl_perf_event_max_stack - num_elem; - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, false, false); + + if (trace_in) + trace = trace_in; + else if (kernel && task) + trace = get_callchain_entry_for_task(task, init_nr); + else + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, + false, false); if (unlikely(!trace)) goto err_fault; @@ -512,6 +637,12 @@ clear: return err; } +BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, + u64, flags) +{ + return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); +} + const struct bpf_func_proto bpf_get_stack_proto = { .func = bpf_get_stack, .gpl_only = true, @@ -522,6 +653,91 @@ const struct bpf_func_proto bpf_get_stack_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, + u32, size, u64, flags) +{ + struct pt_regs *regs = task_pt_regs(task); + + return __bpf_get_stack(regs, task, NULL, buf, size, flags); +} + +BTF_ID_LIST(bpf_get_task_stack_btf_ids) +BTF_ID(struct, task_struct) + +const struct bpf_func_proto bpf_get_task_stack_proto = { + .func = bpf_get_task_stack, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .btf_id = bpf_get_task_stack_btf_ids, +}; + +BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, + void *, buf, u32, size, u64, flags) +{ + struct pt_regs *regs = (struct pt_regs *)(ctx->regs); + struct perf_event *event = ctx->event; + struct perf_callchain_entry *trace; + bool kernel, user; + int err = -EINVAL; + __u64 nr_kernel; + + if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_USER_BUILD_ID))) + goto clear; + + user = flags & BPF_F_USER_STACK; + kernel = !user; + + err = -EFAULT; + trace = ctx->data->callchain; + if (unlikely(!trace)) + goto clear; + + nr_kernel = count_kernel_ip(trace); + + if (kernel) { + __u64 nr = trace->nr; + + trace->nr = nr_kernel; + err = __bpf_get_stack(regs, NULL, trace, buf, size, flags); + + /* restore nr */ + trace->nr = nr; + } else { /* user */ + u64 skip = flags & BPF_F_SKIP_FIELD_MASK; + + skip += nr_kernel; + if (skip > BPF_F_SKIP_FIELD_MASK) + goto clear; + + flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; + err = __bpf_get_stack(regs, NULL, trace, buf, size, flags); + } + return err; + +clear: + memset(buf, 0, size); + return err; + +} + +const struct bpf_func_proto bpf_get_stack_proto_pe = { + .func = bpf_get_stack_pe, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { @@ -611,15 +827,13 @@ static void stack_map_free(struct bpf_map *map) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); - /* wait for bpf programs to complete before freeing stack map */ - synchronize_rcu(); - bpf_map_area_free(smap->elems); pcpu_freelist_destroy(&smap->freelist); bpf_map_area_free(smap); put_callchain_buffers(); } +static int stack_trace_map_btf_id; const struct bpf_map_ops stack_trace_map_ops = { .map_alloc = stack_map_alloc, .map_free = stack_map_free, @@ -628,6 +842,8 @@ const struct bpf_map_ops stack_trace_map_ops = { .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_stack_map", + .map_btf_id = &stack_trace_map_btf_id, }; static int __init stack_map_init(void) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4e6dee19a668..2f343ce15747 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -25,7 +25,10 @@ #include <linux/nospec.h> #include <linux/audit.h> #include <uapi/linux/btf.h> +#include <linux/pgtable.h> #include <linux/bpf_lsm.h> +#include <linux/poll.h> +#include <linux/bpf-netns.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -42,6 +45,8 @@ static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); static DEFINE_IDR(map_idr); static DEFINE_SPINLOCK(map_idr_lock); +static DEFINE_IDR(link_idr); +static DEFINE_SPINLOCK(link_idr_lock); int sysctl_unprivileged_bpf_disabled __read_mostly; @@ -49,9 +54,11 @@ static const struct bpf_map_ops * const bpf_map_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) #define BPF_MAP_TYPE(_id, _ops) \ [_id] = &_ops, +#define BPF_LINK_TYPE(_id, _name) #include <linux/bpf_types.h> #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; /* @@ -67,32 +74,19 @@ int bpf_check_uarg_tail_zero(void __user *uaddr, size_t expected_size, size_t actual_size) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - int err; + unsigned char __user *addr = uaddr + expected_size; + int res; if (unlikely(actual_size > PAGE_SIZE)) /* silly large */ return -E2BIG; - if (unlikely(!access_ok(uaddr, actual_size))) - return -EFAULT; - if (actual_size <= expected_size) return 0; - addr = uaddr + expected_size; - end = uaddr + actual_size; - - for (; addr < end; addr++) { - err = get_user(val, addr); - if (err) - return err; - if (val) - return -E2BIG; - } - - return 0; + res = check_zeroed_user(addr, actual_size - expected_size); + if (res < 0) + return res; + return res ? 0 : -E2BIG; } const struct bpf_map_ops bpf_map_offload_ops = { @@ -281,27 +275,29 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) * __GFP_RETRY_MAYFAIL to avoid such situations. */ - const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; + const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO; + unsigned int flags = 0; + unsigned long align = 1; void *area; if (size >= SIZE_MAX) return NULL; /* kmalloc()'ed memory can't be mmap()'ed */ - if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, + if (mmapable) { + BUG_ON(!PAGE_ALIGNED(size)); + align = SHMLBA; + flags = VM_USERMAP; + } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY, numa_node); if (area != NULL) return area; } - if (mmapable) { - BUG_ON(!PAGE_ALIGNED(size)); - return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL | - __GFP_RETRY_MAYFAIL | flags); - } - return __vmalloc_node_flags_caller(size, numa_node, - GFP_KERNEL | __GFP_RETRY_MAYFAIL | - flags, __builtin_return_address(0)); + + return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, + gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL, + flags, numa_node, __builtin_return_address(0)); } void *bpf_map_area_alloc(u64 size, int numa_node) @@ -658,6 +654,16 @@ out: return err; } +static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts) +{ + struct bpf_map *map = filp->private_data; + + if (map->ops->map_poll) + return map->ops->map_poll(map, filp, pts); + + return EPOLLERR; +} + const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, @@ -666,6 +672,7 @@ const struct file_operations bpf_map_fops = { .read = bpf_dummy_read, .write = bpf_dummy_write, .mmap = bpf_map_mmap, + .poll = bpf_map_poll, }; int bpf_map_new_fd(struct bpf_map *map, int flags) @@ -1383,7 +1390,7 @@ int generic_map_lookup_batch(struct bpf_map *map, buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); if (!buf) { - kvfree(buf_prevkey); + kfree(buf_prevkey); return -ENOMEM; } @@ -1468,7 +1475,8 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) || + !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -1543,7 +1551,7 @@ static int map_freeze(const union bpf_attr *attr) err = -EBUSY; goto err_put; } - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { err = -EPERM; goto err_put; } @@ -1559,9 +1567,11 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _prog_ops, #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) #include <linux/bpf_types.h> #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) @@ -1971,6 +1981,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_PROG_TYPE_CGROUP_SOCK: switch (expected_attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: return 0; @@ -1983,6 +1994,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2007,6 +2022,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_SK_LOOKUP: + if (expected_attach_type == BPF_SK_LOOKUP) + return 0; + return -EINVAL; case BPF_PROG_TYPE_EXT: if (expected_attach_type) return -EINVAL; @@ -2016,6 +2035,55 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, } } +static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_LIRC_MODE2: + case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_EXT: /* extends any prog */ + return true; + case BPF_PROG_TYPE_CGROUP_SKB: + /* always unpriv */ + case BPF_PROG_TYPE_SK_REUSEPORT: + /* equivalent to SOCKET_FILTER. need CAP_BPF only */ + default: + return false; + } +} + +static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ + case BPF_PROG_TYPE_EXT: /* extends any prog */ + return true; + default: + return false; + } +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd @@ -2038,7 +2106,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) return -EPERM; /* copy eBPF program license from user space */ @@ -2051,11 +2119,16 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) is_gpl = license_is_gpl_compatible(license); if (attr->insn_cnt == 0 || - attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) + attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) return -E2BIG; if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) + return -EPERM; + + if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; bpf_prog_load_fixup_attach_type(attr); @@ -2194,25 +2267,39 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, - struct bpf_prog *prog) +void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog) { atomic64_set(&link->refcnt, 1); + link->type = type; + link->id = 0; link->ops = ops; link->prog = prog; } +static void bpf_link_free_id(int id) +{ + if (!id) + return; + + spin_lock_bh(&link_idr_lock); + idr_remove(&link_idr, id); + spin_unlock_bh(&link_idr_lock); +} + /* Clean up bpf_link and corresponding anon_inode file and FD. After * anon_inode is created, bpf_link can't be just kfree()'d due to deferred - * anon_inode's release() call. This helper manages marking bpf_link as - * defunct, releases anon_inode file and puts reserved FD. + * anon_inode's release() call. This helper marksbpf_link as + * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt + * is not decremented, it's the responsibility of a calling code that failed + * to complete bpf_link initialization. */ -void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, - int link_fd) +void bpf_link_cleanup(struct bpf_link_primer *primer) { - link->prog = NULL; - fput(link_file); - put_unused_fd(link_fd); + primer->link->prog = NULL; + bpf_link_free_id(primer->id); + fput(primer->file); + put_unused_fd(primer->fd); } void bpf_link_inc(struct bpf_link *link) @@ -2223,6 +2310,7 @@ void bpf_link_inc(struct bpf_link *link) /* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) { + bpf_link_free_id(link->id); if (link->prog) { /* detach BPF program, clean up used resources */ link->ops->release(link); @@ -2264,35 +2352,35 @@ static int bpf_link_release(struct inode *inode, struct file *filp) } #ifdef CONFIG_PROC_FS -static const struct bpf_link_ops bpf_raw_tp_lops; -static const struct bpf_link_ops bpf_tracing_link_lops; +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) +#define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) [_id] = #_name, +static const char *bpf_link_type_strs[] = { + [BPF_LINK_TYPE_UNSPEC] = "<invalid>", +#include <linux/bpf_types.h> +}; +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_link *link = filp->private_data; const struct bpf_prog *prog = link->prog; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; - const char *link_type; - - if (link->ops == &bpf_raw_tp_lops) - link_type = "raw_tracepoint"; - else if (link->ops == &bpf_tracing_link_lops) - link_type = "tracing"; -#ifdef CONFIG_CGROUP_BPF - else if (link->ops == &bpf_cgroup_link_lops) - link_type = "cgroup"; -#endif - else - link_type = "unknown"; bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "link_type:\t%s\n" + "link_id:\t%u\n" "prog_tag:\t%s\n" "prog_id:\t%u\n", - link_type, + bpf_link_type_strs[link->type], + link->id, prog_tag, prog->aux->id); + if (link->ops->show_fdinfo) + link->ops->show_fdinfo(link, m); } #endif @@ -2305,36 +2393,77 @@ static const struct file_operations bpf_link_fops = { .write = bpf_dummy_write, }; -int bpf_link_new_fd(struct bpf_link *link) +static int bpf_link_alloc_id(struct bpf_link *link) { - return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); -} + int id; + + idr_preload(GFP_KERNEL); + spin_lock_bh(&link_idr_lock); + id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC); + spin_unlock_bh(&link_idr_lock); + idr_preload_end(); -/* Similar to bpf_link_new_fd, create anon_inode for given bpf_link, but - * instead of immediately installing fd in fdtable, just reserve it and - * return. Caller then need to either install it with fd_install(fd, file) or - * release with put_unused_fd(fd). - * This is useful for cases when bpf_link attachment/detachment are - * complicated and expensive operations and should be delayed until all the fd - * reservation and anon_inode creation succeeds. + return id; +} + +/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file, + * reserving unused FD and allocating ID from link_idr. This is to be paired + * with bpf_link_settle() to install FD and ID and expose bpf_link to + * user-space, if bpf_link is successfully attached. If not, bpf_link and + * pre-allocated resources are to be freed with bpf_cleanup() call. All the + * transient state is passed around in struct bpf_link_primer. + * This is preferred way to create and initialize bpf_link, especially when + * there are complicated and expensive operations inbetween creating bpf_link + * itself and attaching it to BPF hook. By using bpf_link_prime() and + * bpf_link_settle() kernel code using bpf_link doesn't have to perform + * expensive (and potentially failing) roll back operations in a rare case + * that file, FD, or ID can't be allocated. */ -struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd) +int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) { struct file *file; - int fd; + int fd, id; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) - return ERR_PTR(fd); + return fd; + + + id = bpf_link_alloc_id(link); + if (id < 0) { + put_unused_fd(fd); + return id; + } file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); if (IS_ERR(file)) { + bpf_link_free_id(id); put_unused_fd(fd); - return file; + return PTR_ERR(file); } - *reserved_fd = fd; - return file; + primer->link = link; + primer->file = file; + primer->fd = fd; + primer->id = id; + return 0; +} + +int bpf_link_settle(struct bpf_link_primer *primer) +{ + /* make bpf_link fetchable by ID */ + spin_lock_bh(&link_idr_lock); + primer->link->id = primer->id; + spin_unlock_bh(&link_idr_lock); + /* make bpf_link fetchable by FD */ + fd_install(primer->fd, primer->file); + /* pass through installed FD */ + return primer->fd; +} + +int bpf_link_new_fd(struct bpf_link *link) +{ + return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); } struct bpf_link *bpf_link_get_from_fd(u32 ufd) @@ -2358,6 +2487,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) struct bpf_tracing_link { struct bpf_link link; + enum bpf_attach_type attach_type; }; static void bpf_tracing_link_release(struct bpf_link *link) @@ -2373,16 +2503,40 @@ static void bpf_tracing_link_dealloc(struct bpf_link *link) kfree(tr_link); } +static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + seq_printf(seq, + "attach_type:\t%d\n", + tr_link->attach_type); +} + +static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + info->tracing.attach_type = tr_link->attach_type; + + return 0; +} + static const struct bpf_link_ops bpf_tracing_link_lops = { .release = bpf_tracing_link_release, .dealloc = bpf_tracing_link_dealloc, + .show_fdinfo = bpf_tracing_link_show_fdinfo, + .fill_link_info = bpf_tracing_link_fill_link_info, }; static int bpf_tracing_prog_attach(struct bpf_prog *prog) { + struct bpf_link_primer link_primer; struct bpf_tracing_link *link; - struct file *link_file; - int link_fd, err; + int err; switch (prog->type) { case BPF_PROG_TYPE_TRACING: @@ -2415,24 +2569,23 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) err = -ENOMEM; goto out_put_prog; } - bpf_link_init(&link->link, &bpf_tracing_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog); + link->attach_type = prog->expected_attach_type; - link_file = bpf_link_new_file(&link->link, &link_fd); - if (IS_ERR(link_file)) { + err = bpf_link_prime(&link->link, &link_primer); + if (err) { kfree(link); - err = PTR_ERR(link_file); goto out_put_prog; } err = bpf_trampoline_link_prog(prog); if (err) { - bpf_link_cleanup(&link->link, link_file, link_fd); + bpf_link_cleanup(&link_primer); goto out_put_prog; } - fd_install(link_fd, link_file); - return link_fd; - + return bpf_link_settle(&link_primer); out_put_prog: bpf_prog_put(prog); return err; @@ -2460,22 +2613,69 @@ static void bpf_raw_tp_link_dealloc(struct bpf_link *link) kfree(raw_tp); } -static const struct bpf_link_ops bpf_raw_tp_lops = { +static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_raw_tp_link *raw_tp_link = + container_of(link, struct bpf_raw_tp_link, link); + + seq_printf(seq, + "tp_name:\t%s\n", + raw_tp_link->btp->tp->name); +} + +static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_raw_tp_link *raw_tp_link = + container_of(link, struct bpf_raw_tp_link, link); + char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name); + const char *tp_name = raw_tp_link->btp->tp->name; + u32 ulen = info->raw_tracepoint.tp_name_len; + size_t tp_len = strlen(tp_name); + + if (ulen && !ubuf) + return -EINVAL; + + info->raw_tracepoint.tp_name_len = tp_len + 1; + + if (!ubuf) + return 0; + + if (ulen >= tp_len + 1) { + if (copy_to_user(ubuf, tp_name, tp_len + 1)) + return -EFAULT; + } else { + char zero = '\0'; + + if (copy_to_user(ubuf, tp_name, ulen - 1)) + return -EFAULT; + if (put_user(zero, ubuf + ulen - 1)) + return -EFAULT; + return -ENOSPC; + } + + return 0; +} + +static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release, .dealloc = bpf_raw_tp_link_dealloc, + .show_fdinfo = bpf_raw_tp_link_show_fdinfo, + .fill_link_info = bpf_raw_tp_link_fill_link_info, }; #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd static int bpf_raw_tracepoint_open(const union bpf_attr *attr) { + struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; struct bpf_raw_event_map *btp; - struct file *link_file; struct bpf_prog *prog; const char *tp_name; char buf[128]; - int link_fd, err; + int err; if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) return -EINVAL; @@ -2528,24 +2728,23 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) err = -ENOMEM; goto out_put_btp; } - bpf_link_init(&link->link, &bpf_raw_tp_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, + &bpf_raw_tp_link_lops, prog); link->btp = btp; - link_file = bpf_link_new_file(&link->link, &link_fd); - if (IS_ERR(link_file)) { + err = bpf_link_prime(&link->link, &link_primer); + if (err) { kfree(link); - err = PTR_ERR(link_file); goto out_put_btp; } err = bpf_probe_register(link->btp, prog); if (err) { - bpf_link_cleanup(&link->link, link_file, link_fd); + bpf_link_cleanup(&link_primer); goto out_put_btp; } - fd_install(link_fd, link_file); - return link_fd; + return bpf_link_settle(&link_primer); out_put_btp: bpf_put_raw_tracepoint(btp); @@ -2561,8 +2760,14 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_SK_LOOKUP: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: + if (!capable(CAP_NET_ADMIN)) + /* cg-skb progs can be loaded by unpriv user. + * check permissions at attach time. + */ + return -EPERM; return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? -EINVAL : 0; @@ -2580,6 +2785,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: return BPF_PROG_TYPE_CGROUP_SOCK; @@ -2587,6 +2793,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2610,6 +2820,12 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: return BPF_PROG_TYPE_CGROUP_SOCKOPT; + case BPF_TRACE_ITER: + return BPF_PROG_TYPE_TRACING; + case BPF_SK_LOOKUP: + return BPF_PROG_TYPE_SK_LOOKUP; + case BPF_XDP: + return BPF_PROG_TYPE_XDP; default: return BPF_PROG_TYPE_UNSPEC; } @@ -2626,9 +2842,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) struct bpf_prog *prog; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; @@ -2657,7 +2870,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) ret = lirc_prog_attach(attr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: - ret = skb_flow_dissector_bpf_prog_attach(attr, prog); + ret = netns_bpf_prog_attach(attr, prog); break; case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: @@ -2683,9 +2896,6 @@ static int bpf_prog_detach(const union bpf_attr *attr) { enum bpf_prog_type ptype; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_DETACH)) return -EINVAL; @@ -2694,11 +2904,11 @@ static int bpf_prog_detach(const union bpf_attr *attr) switch (ptype) { case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_SK_SKB: - return sock_map_get_from_fd(attr, NULL); + return sock_map_prog_detach(attr, ptype); case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_PROG_TYPE_FLOW_DISSECTOR: - return skb_flow_dissector_bpf_prog_detach(attr); + return netns_bpf_prog_detach(attr, ptype); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: @@ -2728,12 +2938,17 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2747,7 +2962,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: - return skb_flow_dissector_prog_query(attr, uattr); + case BPF_SK_LOOKUP: + return netns_bpf_prog_query(attr, uattr); default: return -EINVAL; } @@ -2761,8 +2977,6 @@ static int bpf_prog_test_run(const union bpf_attr *attr, struct bpf_prog *prog; int ret = -ENOTSUPP; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; @@ -2813,6 +3027,44 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr, return err; } +struct bpf_map *bpf_map_get_curr_or_next(u32 *id) +{ + struct bpf_map *map; + + spin_lock_bh(&map_idr_lock); +again: + map = idr_get_next(&map_idr, id); + if (map) { + map = __bpf_map_inc_not_zero(map, false); + if (IS_ERR(map)) { + (*id)++; + goto again; + } + } + spin_unlock_bh(&map_idr_lock); + + return map; +} + +struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id) +{ + struct bpf_prog *prog; + + spin_lock_bh(&prog_idr_lock); +again: + prog = idr_get_next(&prog_idr, id); + if (prog) { + prog = bpf_prog_inc_not_zero(prog); + if (IS_ERR(prog)) { + (*id)++; + goto again; + } + } + spin_unlock_bh(&prog_idr_lock); + + return prog; +} + #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id struct bpf_prog *bpf_prog_by_id(u32 id) @@ -2917,12 +3169,14 @@ static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, return NULL; } -static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) +static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, + const struct cred *f_cred) { const struct bpf_map *map; struct bpf_insn *insns; u32 off, type; u64 imm; + u8 code; int i; insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), @@ -2931,21 +3185,27 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) return insns; for (i = 0; i < prog->len; i++) { - if (insns[i].code == (BPF_JMP | BPF_TAIL_CALL)) { + code = insns[i].code; + + if (code == (BPF_JMP | BPF_TAIL_CALL)) { insns[i].code = BPF_JMP | BPF_CALL; insns[i].imm = BPF_FUNC_tail_call; /* fall-through */ } - if (insns[i].code == (BPF_JMP | BPF_CALL) || - insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) { - if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) + if (code == (BPF_JMP | BPF_CALL) || + code == (BPF_JMP | BPF_CALL_ARGS)) { + if (code == (BPF_JMP | BPF_CALL_ARGS)) insns[i].code = BPF_JMP | BPF_CALL; - if (!bpf_dump_raw_ok()) + if (!bpf_dump_raw_ok(f_cred)) insns[i].imm = 0; continue; } + if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) { + insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM; + continue; + } - if (insns[i].code != (BPF_LD | BPF_IMM | BPF_DW)) + if (code != (BPF_LD | BPF_IMM | BPF_DW)) continue; imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; @@ -2992,7 +3252,8 @@ static int set_info_rec_size(struct bpf_prog_info *info) return 0; } -static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, +static int bpf_prog_get_info_by_fd(struct file *file, + struct bpf_prog *prog, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3044,7 +3305,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.run_time_ns = stats.nsecs; info.run_cnt = stats.cnt; - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { info.jited_prog_len = 0; info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; @@ -3061,11 +3322,11 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, struct bpf_insn *insns_sanitized; bool fault; - if (prog->blinded && !bpf_dump_raw_ok()) { + if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) { info.xlated_prog_insns = 0; goto done; } - insns_sanitized = bpf_insn_prepare_dump(prog); + insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); if (!insns_sanitized) return -ENOMEM; uinsns = u64_to_user_ptr(info.xlated_prog_insns); @@ -3099,7 +3360,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } if (info.jited_prog_len && ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { uinsns = u64_to_user_ptr(info.jited_prog_insns); ulen = min_t(u32, info.jited_prog_len, ulen); @@ -3134,7 +3395,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, ulen = info.nr_jited_ksyms; info.nr_jited_ksyms = prog->aux->func_cnt ? : 1; if (ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { unsigned long ksym_addr; u64 __user *user_ksyms; u32 i; @@ -3165,7 +3426,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, ulen = info.nr_jited_func_lens; info.nr_jited_func_lens = prog->aux->func_cnt ? : 1; if (ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { u32 __user *user_lens; u32 func_len, i; @@ -3222,7 +3483,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, else info.nr_jited_line_info = 0; if (info.nr_jited_line_info && ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { __u64 __user *user_linfo; u32 i; @@ -3268,7 +3529,8 @@ done: return 0; } -static int bpf_map_get_info_by_fd(struct bpf_map *map, +static int bpf_map_get_info_by_fd(struct file *file, + struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3311,7 +3573,8 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, return 0; } -static int bpf_btf_get_info_by_fd(struct btf *btf, +static int bpf_btf_get_info_by_fd(struct file *file, + struct btf *btf, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3326,6 +3589,43 @@ static int bpf_btf_get_info_by_fd(struct btf *btf, return btf_get_info_by_fd(btf, attr, uattr); } +static int bpf_link_get_info_by_fd(struct file *file, + struct bpf_link *link, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_link_info info; + u32 info_len = attr->info.info_len; + int err; + + err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); + if (err) + return err; + info_len = min_t(u32, sizeof(info), info_len); + + memset(&info, 0, sizeof(info)); + if (copy_from_user(&info, uinfo, info_len)) + return -EFAULT; + + info.type = link->type; + info.id = link->id; + info.prog_id = link->prog->aux->id; + + if (link->ops->fill_link_info) { + err = link->ops->fill_link_info(link, &info); + if (err) + return err; + } + + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + + #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, @@ -3343,13 +3643,16 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, return -EBADFD; if (f.file->f_op == &bpf_prog_fops) - err = bpf_prog_get_info_by_fd(f.file->private_data, attr, + err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &bpf_map_fops) - err = bpf_map_get_info_by_fd(f.file->private_data, attr, + err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &btf_fops) - err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr); + err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr); + else if (f.file->f_op == &bpf_link_fops) + err = bpf_link_get_info_by_fd(f.file, f.file->private_data, + attr, uattr); else err = -EINVAL; @@ -3364,7 +3667,7 @@ static int bpf_btf_load(const union bpf_attr *attr) if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; return btf_new_fd(attr); @@ -3477,7 +3780,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr, if (file->f_op == &bpf_link_fops) { struct bpf_link *link = file->private_data; - if (link->ops == &bpf_raw_tp_lops) { + if (link->ops == &bpf_raw_tp_link_lops) { struct bpf_raw_tp_link *raw_tp = container_of(link, struct bpf_raw_tp_link, link); struct bpf_raw_event_map *btp = raw_tp->btp; @@ -3571,6 +3874,15 @@ err_put: return err; } +static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + if (attr->link_create.attach_type == BPF_TRACE_ITER && + prog->expected_attach_type == BPF_TRACE_ITER) + return bpf_iter_link_attach(attr, prog); + + return -EINVAL; +} + #define BPF_LINK_CREATE_LAST_FIELD link_create.flags static int link_create(union bpf_attr *attr) { @@ -3578,9 +3890,6 @@ static int link_create(union bpf_attr *attr) struct bpf_prog *prog; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; @@ -3607,6 +3916,18 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: ret = cgroup_bpf_link_attach(attr, prog); break; + case BPF_PROG_TYPE_TRACING: + ret = tracing_bpf_link_attach(attr, prog); + break; + case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_SK_LOOKUP: + ret = netns_bpf_link_create(attr, prog); + break; +#ifdef CONFIG_NET + case BPF_PROG_TYPE_XDP: + ret = bpf_xdp_link_attach(attr, prog); + break; +#endif default: ret = -EINVAL; } @@ -3626,9 +3947,6 @@ static int link_update(union bpf_attr *attr) u32 flags; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_LINK_UPDATE)) return -EINVAL; @@ -3658,13 +3976,10 @@ static int link_update(union bpf_attr *attr) goto out_put_progs; } -#ifdef CONFIG_CGROUP_BPF - if (link->ops == &bpf_cgroup_link_lops) { - ret = cgroup_bpf_replace(link, old_prog, new_prog); - goto out_put_progs; - } -#endif - ret = -EINVAL; + if (link->ops->update_prog) + ret = link->ops->update_prog(link, new_prog, old_prog); + else + ret = -EINVAL; out_put_progs: if (old_prog) @@ -3676,12 +3991,154 @@ out_put_link: return ret; } +#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd + +static int link_detach(union bpf_attr *attr) +{ + struct bpf_link *link; + int ret; + + if (CHECK_ATTR(BPF_LINK_DETACH)) + return -EINVAL; + + link = bpf_link_get_from_fd(attr->link_detach.link_fd); + if (IS_ERR(link)) + return PTR_ERR(link); + + if (link->ops->detach) + ret = link->ops->detach(link); + else + ret = -EOPNOTSUPP; + + bpf_link_put(link); + return ret; +} + +static int bpf_link_inc_not_zero(struct bpf_link *link) +{ + return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT; +} + +#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id + +static int bpf_link_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_link *link; + u32 id = attr->link_id; + int fd, err; + + if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + spin_lock_bh(&link_idr_lock); + link = idr_find(&link_idr, id); + /* before link is "settled", ID is 0, pretend it doesn't exist yet */ + if (link) { + if (link->id) + err = bpf_link_inc_not_zero(link); + else + err = -EAGAIN; + } else { + err = -ENOENT; + } + spin_unlock_bh(&link_idr_lock); + + if (err) + return err; + + fd = bpf_link_new_fd(link); + if (fd < 0) + bpf_link_put(link); + + return fd; +} + +DEFINE_MUTEX(bpf_stats_enabled_mutex); + +static int bpf_stats_release(struct inode *inode, struct file *file) +{ + mutex_lock(&bpf_stats_enabled_mutex); + static_key_slow_dec(&bpf_stats_enabled_key.key); + mutex_unlock(&bpf_stats_enabled_mutex); + return 0; +} + +static const struct file_operations bpf_stats_fops = { + .release = bpf_stats_release, +}; + +static int bpf_enable_runtime_stats(void) +{ + int fd; + + mutex_lock(&bpf_stats_enabled_mutex); + + /* Set a very high limit to avoid overflow */ + if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) { + mutex_unlock(&bpf_stats_enabled_mutex); + return -EBUSY; + } + + fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC); + if (fd >= 0) + static_key_slow_inc(&bpf_stats_enabled_key.key); + + mutex_unlock(&bpf_stats_enabled_mutex); + return fd; +} + +#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type + +static int bpf_enable_stats(union bpf_attr *attr) +{ + + if (CHECK_ATTR(BPF_ENABLE_STATS)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (attr->enable_stats.type) { + case BPF_STATS_RUN_TIME: + return bpf_enable_runtime_stats(); + default: + break; + } + return -EINVAL; +} + +#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags + +static int bpf_iter_create(union bpf_attr *attr) +{ + struct bpf_link *link; + int err; + + if (CHECK_ATTR(BPF_ITER_CREATE)) + return -EINVAL; + + if (attr->iter_create.flags) + return -EINVAL; + + link = bpf_link_get_from_fd(attr->iter_create.link_fd); + if (IS_ERR(link)) + return PTR_ERR(link); + + err = bpf_iter_new_fd(link); + bpf_link_put(link); + + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; int err; - if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) + if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) return -EPERM; err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); @@ -3793,6 +4250,22 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_LINK_UPDATE: err = link_update(&attr); break; + case BPF_LINK_GET_FD_BY_ID: + err = bpf_link_get_fd_by_id(&attr); + break; + case BPF_LINK_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &link_idr, &link_idr_lock); + break; + case BPF_ENABLE_STATS: + err = bpf_enable_stats(&attr); + break; + case BPF_ITER_CREATE: + err = bpf_iter_create(&attr); + break; + case BPF_LINK_DETACH: + err = link_detach(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c new file mode 100644 index 000000000000..232df29793e9 --- /dev/null +++ b/kernel/bpf/task_iter.c @@ -0,0 +1,371 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ + +#include <linux/init.h> +#include <linux/namei.h> +#include <linux/pid_namespace.h> +#include <linux/fs.h> +#include <linux/fdtable.h> +#include <linux/filter.h> +#include <linux/btf_ids.h> + +struct bpf_iter_seq_task_common { + struct pid_namespace *ns; +}; + +struct bpf_iter_seq_task_info { + /* The first field must be struct bpf_iter_seq_task_common. + * this is assumed by {init, fini}_seq_pidns() callback functions. + */ + struct bpf_iter_seq_task_common common; + u32 tid; +}; + +static struct task_struct *task_seq_get_next(struct pid_namespace *ns, + u32 *tid) +{ + struct task_struct *task = NULL; + struct pid *pid; + + rcu_read_lock(); +retry: + pid = idr_get_next(&ns->idr, tid); + if (pid) { + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) { + ++*tid; + goto retry; + } + } + rcu_read_unlock(); + + return task; +} + +static void *task_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_task_info *info = seq->private; + struct task_struct *task; + + task = task_seq_get_next(info->common.ns, &info->tid); + if (!task) + return NULL; + + if (*pos == 0) + ++*pos; + return task; +} + +static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_task_info *info = seq->private; + struct task_struct *task; + + ++*pos; + ++info->tid; + put_task_struct((struct task_struct *)v); + task = task_seq_get_next(info->common.ns, &info->tid); + if (!task) + return NULL; + + return task; +} + +struct bpf_iter__task { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct task_struct *, task); +}; + +DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task) + +static int __task_seq_show(struct seq_file *seq, struct task_struct *task, + bool in_stop) +{ + struct bpf_iter_meta meta; + struct bpf_iter__task ctx; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (!prog) + return 0; + + meta.seq = seq; + ctx.meta = &meta; + ctx.task = task; + return bpf_iter_run_prog(prog, &ctx); +} + +static int task_seq_show(struct seq_file *seq, void *v) +{ + return __task_seq_show(seq, v, false); +} + +static void task_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__task_seq_show(seq, v, true); + else + put_task_struct((struct task_struct *)v); +} + +static const struct seq_operations task_seq_ops = { + .start = task_seq_start, + .next = task_seq_next, + .stop = task_seq_stop, + .show = task_seq_show, +}; + +struct bpf_iter_seq_task_file_info { + /* The first field must be struct bpf_iter_seq_task_common. + * this is assumed by {init, fini}_seq_pidns() callback functions. + */ + struct bpf_iter_seq_task_common common; + struct task_struct *task; + struct files_struct *files; + u32 tid; + u32 fd; +}; + +static struct file * +task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info, + struct task_struct **task, struct files_struct **fstruct) +{ + struct pid_namespace *ns = info->common.ns; + u32 curr_tid = info->tid, max_fds; + struct files_struct *curr_files; + struct task_struct *curr_task; + int curr_fd = info->fd; + + /* If this function returns a non-NULL file object, + * it held a reference to the task/files_struct/file. + * Otherwise, it does not hold any reference. + */ +again: + if (*task) { + curr_task = *task; + curr_files = *fstruct; + curr_fd = info->fd; + } else { + curr_task = task_seq_get_next(ns, &curr_tid); + if (!curr_task) + return NULL; + + curr_files = get_files_struct(curr_task); + if (!curr_files) { + put_task_struct(curr_task); + curr_tid = ++(info->tid); + info->fd = 0; + goto again; + } + + /* set *fstruct, *task and info->tid */ + *fstruct = curr_files; + *task = curr_task; + if (curr_tid == info->tid) { + curr_fd = info->fd; + } else { + info->tid = curr_tid; + curr_fd = 0; + } + } + + rcu_read_lock(); + max_fds = files_fdtable(curr_files)->max_fds; + for (; curr_fd < max_fds; curr_fd++) { + struct file *f; + + f = fcheck_files(curr_files, curr_fd); + if (!f) + continue; + + /* set info->fd */ + info->fd = curr_fd; + get_file(f); + rcu_read_unlock(); + return f; + } + + /* the current task is done, go to the next task */ + rcu_read_unlock(); + put_files_struct(curr_files); + put_task_struct(curr_task); + *task = NULL; + *fstruct = NULL; + info->fd = 0; + curr_tid = ++(info->tid); + goto again; +} + +static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_task_file_info *info = seq->private; + struct files_struct *files = NULL; + struct task_struct *task = NULL; + struct file *file; + + file = task_file_seq_get_next(info, &task, &files); + if (!file) { + info->files = NULL; + info->task = NULL; + return NULL; + } + + if (*pos == 0) + ++*pos; + info->task = task; + info->files = files; + + return file; +} + +static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_task_file_info *info = seq->private; + struct files_struct *files = info->files; + struct task_struct *task = info->task; + struct file *file; + + ++*pos; + ++info->fd; + fput((struct file *)v); + file = task_file_seq_get_next(info, &task, &files); + if (!file) { + info->files = NULL; + info->task = NULL; + return NULL; + } + + info->task = task; + info->files = files; + + return file; +} + +struct bpf_iter__task_file { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct task_struct *, task); + u32 fd __aligned(8); + __bpf_md_ptr(struct file *, file); +}; + +DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta, + struct task_struct *task, u32 fd, + struct file *file) + +static int __task_file_seq_show(struct seq_file *seq, struct file *file, + bool in_stop) +{ + struct bpf_iter_seq_task_file_info *info = seq->private; + struct bpf_iter__task_file ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.task = info->task; + ctx.fd = info->fd; + ctx.file = file; + return bpf_iter_run_prog(prog, &ctx); +} + +static int task_file_seq_show(struct seq_file *seq, void *v) +{ + return __task_file_seq_show(seq, v, false); +} + +static void task_file_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_task_file_info *info = seq->private; + + if (!v) { + (void)__task_file_seq_show(seq, v, true); + } else { + fput((struct file *)v); + put_files_struct(info->files); + put_task_struct(info->task); + info->files = NULL; + info->task = NULL; + } +} + +static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux) +{ + struct bpf_iter_seq_task_common *common = priv_data; + + common->ns = get_pid_ns(task_active_pid_ns(current)); + return 0; +} + +static void fini_seq_pidns(void *priv_data) +{ + struct bpf_iter_seq_task_common *common = priv_data; + + put_pid_ns(common->ns); +} + +static const struct seq_operations task_file_seq_ops = { + .start = task_file_seq_start, + .next = task_file_seq_next, + .stop = task_file_seq_stop, + .show = task_file_seq_show, +}; + +BTF_ID_LIST(btf_task_file_ids) +BTF_ID(struct, task_struct) +BTF_ID(struct, file) + +static const struct bpf_iter_seq_info task_seq_info = { + .seq_ops = &task_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), +}; + +static struct bpf_iter_reg task_reg_info = { + .target = "task", + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__task, task), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &task_seq_info, +}; + +static const struct bpf_iter_seq_info task_file_seq_info = { + .seq_ops = &task_file_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), +}; + +static struct bpf_iter_reg task_file_reg_info = { + .target = "task_file", + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__task_file, task), + PTR_TO_BTF_ID_OR_NULL }, + { offsetof(struct bpf_iter__task_file, file), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &task_file_seq_info, +}; + +static int __init task_iter_init(void) +{ + int ret; + + task_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; + ret = bpf_iter_reg_target(&task_reg_info); + if (ret) + return ret; + + task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; + task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1]; + return bpf_iter_reg_target(&task_file_reg_info); +} +late_initcall(task_iter_init); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8d7ee40e2748..b6ccfce3bf4c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -28,9 +28,11 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _verifier_ops, #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) #include <linux/bpf_types.h> #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; /* bpf_check() is a static code analyzer that walks eBPF program @@ -168,6 +170,8 @@ struct bpf_verifier_stack_elem { int insn_idx; int prev_insn_idx; struct bpf_verifier_stack_elem *next; + /* length of verifier log at the time this state was pushed on stack */ + u32 log_pos; }; #define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192 @@ -229,6 +233,7 @@ struct bpf_call_arg_meta { bool pkt_access; int regno; int access_size; + int mem_size; u64 msize_max_value; int ref_obj_id; int func_id; @@ -283,6 +288,18 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, log->ubuf = NULL; } +static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos) +{ + char zero = 0; + + if (!bpf_verifier_log_needed(log)) + return; + + log->len_used = new_pos; + if (put_user(zero, log->ubuf + new_pos)) + log->ubuf = NULL; +} + /* log_level controls verbosity level of eBPF verifier. * bpf_verifier_log_write() is used to dump the verification trace to the log, * so the user can figure out what's wrong with the program @@ -377,12 +394,24 @@ static bool type_is_sk_pointer(enum bpf_reg_type type) type == PTR_TO_XDP_SOCK; } +static bool reg_type_not_null(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET || + type == PTR_TO_TCP_SOCK || + type == PTR_TO_MAP_VALUE || + type == PTR_TO_SOCK_COMMON; +} + static bool reg_type_may_be_null(enum bpf_reg_type type) { return type == PTR_TO_MAP_VALUE_OR_NULL || type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_SOCK_COMMON_OR_NULL || - type == PTR_TO_TCP_SOCK_OR_NULL; + type == PTR_TO_TCP_SOCK_OR_NULL || + type == PTR_TO_BTF_ID_OR_NULL || + type == PTR_TO_MEM_OR_NULL || + type == PTR_TO_RDONLY_BUF_OR_NULL || + type == PTR_TO_RDWR_BUF_OR_NULL; } static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) @@ -396,7 +425,9 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_TCP_SOCK || - type == PTR_TO_TCP_SOCK_OR_NULL; + type == PTR_TO_TCP_SOCK_OR_NULL || + type == PTR_TO_MEM || + type == PTR_TO_MEM_OR_NULL; } static bool arg_type_may_be_refcounted(enum bpf_arg_type type) @@ -410,14 +441,37 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) */ static bool is_release_function(enum bpf_func_id func_id) { - return func_id == BPF_FUNC_sk_release; + return func_id == BPF_FUNC_sk_release || + func_id == BPF_FUNC_ringbuf_submit || + func_id == BPF_FUNC_ringbuf_discard; } -static bool is_acquire_function(enum bpf_func_id func_id) +static bool may_be_acquire_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || - func_id == BPF_FUNC_skc_lookup_tcp; + func_id == BPF_FUNC_skc_lookup_tcp || + func_id == BPF_FUNC_map_lookup_elem || + func_id == BPF_FUNC_ringbuf_reserve; +} + +static bool is_acquire_function(enum bpf_func_id func_id, + const struct bpf_map *map) +{ + enum bpf_map_type map_type = map ? map->map_type : BPF_MAP_TYPE_UNSPEC; + + if (func_id == BPF_FUNC_sk_lookup_tcp || + func_id == BPF_FUNC_sk_lookup_udp || + func_id == BPF_FUNC_skc_lookup_tcp || + func_id == BPF_FUNC_ringbuf_reserve) + return true; + + if (func_id == BPF_FUNC_map_lookup_elem && + (map_type == BPF_MAP_TYPE_SOCKMAP || + map_type == BPF_MAP_TYPE_SOCKHASH)) + return true; + + return false; } static bool is_ptr_cast_function(enum bpf_func_id func_id) @@ -448,6 +502,13 @@ static const char * const reg_type_str[] = { [PTR_TO_TP_BUFFER] = "tp_buffer", [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", + [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", + [PTR_TO_MEM] = "mem", + [PTR_TO_MEM_OR_NULL] = "mem_or_null", + [PTR_TO_RDONLY_BUF] = "rdonly_buf", + [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null", + [PTR_TO_RDWR_BUF] = "rdwr_buf", + [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null", }; static char slot_type_char[] = { @@ -508,7 +569,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); } else { - if (t == PTR_TO_BTF_ID) + if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL) verbose(env, "%s", kernel_type_name(reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) @@ -846,7 +907,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi } static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, - int *insn_idx) + int *insn_idx, bool pop_log) { struct bpf_verifier_state *cur = env->cur_state; struct bpf_verifier_stack_elem *elem, *head = env->head; @@ -860,6 +921,8 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, if (err) return err; } + if (pop_log) + bpf_vlog_reset(&env->log, head->log_pos); if (insn_idx) *insn_idx = head->insn_idx; if (prev_insn_idx) @@ -887,6 +950,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; + elem->log_pos = env->log.len_used; env->head = elem; env->stack_size++; err = copy_verifier_state(&elem->st, cur); @@ -915,7 +979,7 @@ err: free_verifier_state(env->cur_state, true); env->cur_state = NULL; /* pop all elements and return */ - while (!pop_stack(env, NULL, NULL)); + while (!pop_stack(env, NULL, NULL, false)); return NULL; } @@ -1168,14 +1232,14 @@ static void __reg_assign_32_into_64(struct bpf_reg_state *reg) * but must be positive otherwise set to worse case bounds * and refine later from tnum. */ - if (reg->s32_min_value > 0) - reg->smin_value = reg->s32_min_value; - else - reg->smin_value = 0; - if (reg->s32_max_value > 0) + if (reg->s32_min_value >= 0 && reg->s32_max_value >= 0) reg->smax_value = reg->s32_max_value; else reg->smax_value = U32_MAX; + if (reg->s32_min_value >= 0) + reg->smin_value = reg->s32_min_value; + else + reg->smin_value = 0; } static void __reg_combine_32_into_64(struct bpf_reg_state *reg) @@ -1255,7 +1319,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, reg->type = SCALAR_VALUE; reg->var_off = tnum_unknown; reg->frameno = 0; - reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks; + reg->precise = env->subprog_cnt > 1 || !env->bpf_capable; __mark_reg_unbounded(reg); } @@ -1292,6 +1356,19 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, __mark_reg_not_init(env, regs + regno); } +static void mark_btf_ld_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno, + enum bpf_reg_type reg_type, u32 btf_id) +{ + if (reg_type == SCALAR_VALUE) { + mark_reg_unknown(env, regs, regno); + return; + } + mark_reg_known_zero(env, regs, regno); + regs[regno].type = PTR_TO_BTF_ID; + regs[regno].btf_id = btf_id; +} + #define DEF_NOT_SUBREG (0) static void init_reg_state(struct bpf_verifier_env *env, struct bpf_func_state *state) @@ -1387,8 +1464,9 @@ static int check_subprogs(struct bpf_verifier_env *env) continue; if (insn[i].src_reg != BPF_PSEUDO_CALL) continue; - if (!env->allow_ptr_leaks) { - verbose(env, "function calls to other bpf functions are allowed for root only\n"); + if (!env->bpf_capable) { + verbose(env, + "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); return -EPERM; } ret = add_subprog(env, i + insn[i].imm + 1); @@ -1922,8 +2000,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, bool new_marks = false; int i, err; - if (!env->allow_ptr_leaks) - /* backtracking is root only for now */ + if (!env->bpf_capable) return 0; func = st->frame[st->curframe]; @@ -2101,6 +2178,11 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID_OR_NULL: + case PTR_TO_RDONLY_BUF: + case PTR_TO_RDONLY_BUF_OR_NULL: + case PTR_TO_RDWR_BUF: + case PTR_TO_RDWR_BUF_OR_NULL: return true; default: return false; @@ -2170,7 +2252,7 @@ static int check_stack_write(struct bpf_verifier_env *env, reg = &cur->regs[value_regno]; if (reg && size == BPF_REG_SIZE && register_is_const(reg) && - !register_is_null(reg) && env->allow_ptr_leaks) { + !register_is_null(reg) && env->bpf_capable) { if (dst_reg != BPF_REG_FP) { /* The backtracking logic can only recognize explicit * stack slot address like [fp - 8]. Other spill of @@ -2196,7 +2278,7 @@ static int check_stack_write(struct bpf_verifier_env *env, return -EINVAL; } - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v4) { bool sanitize = false; if (state->stack[spi].slot_type[0] == STACK_SPILL && @@ -2418,32 +2500,49 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, return 0; } -/* check read/write into map element returned by bpf_map_lookup_elem() */ -static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, - int size, bool zero_size_allowed) +/* check read/write into memory region (e.g., map value, ringbuf sample, etc) */ +static int __check_mem_access(struct bpf_verifier_env *env, int regno, + int off, int size, u32 mem_size, + bool zero_size_allowed) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_map *map = regs[regno].map_ptr; + bool size_ok = size > 0 || (size == 0 && zero_size_allowed); + struct bpf_reg_state *reg; - if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || - off + size > map->value_size) { + if (off >= 0 && size_ok && (u64)off + size <= mem_size) + return 0; + + reg = &cur_regs(env)[regno]; + switch (reg->type) { + case PTR_TO_MAP_VALUE: verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", - map->value_size, off, size); - return -EACCES; + mem_size, off, size); + break; + case PTR_TO_PACKET: + case PTR_TO_PACKET_META: + case PTR_TO_PACKET_END: + verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + off, size, regno, reg->id, off, mem_size); + break; + case PTR_TO_MEM: + default: + verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n", + mem_size, off, size); } - return 0; + + return -EACCES; } -/* check read/write into a map element with possible variable offset */ -static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size, bool zero_size_allowed) +/* check read/write into a memory region with possible variable offset */ +static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, + int off, int size, u32 mem_size, + bool zero_size_allowed) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; int err; - /* We may have adjusted the register to this map value, so we + /* We may have adjusted the register pointing to memory region, so we * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ @@ -2464,10 +2563,10 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - err = __check_map_access(env, regno, reg->smin_value + off, size, - zero_size_allowed); + err = __check_mem_access(env, regno, reg->smin_value + off, size, + mem_size, zero_size_allowed); if (err) { - verbose(env, "R%d min value is outside of the array range\n", + verbose(env, "R%d min value is outside of the allowed memory range\n", regno); return err; } @@ -2477,18 +2576,38 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * If reg->umax_value + off could overflow, treat that as unbounded too. */ if (reg->umax_value >= BPF_MAX_VAR_OFF) { - verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n", + verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n", regno); return -EACCES; } - err = __check_map_access(env, regno, reg->umax_value + off, size, - zero_size_allowed); - if (err) - verbose(env, "R%d max value is outside of the array range\n", + err = __check_mem_access(env, regno, reg->umax_value + off, size, + mem_size, zero_size_allowed); + if (err) { + verbose(env, "R%d max value is outside of the allowed memory range\n", regno); + return err; + } - if (map_value_has_spin_lock(reg->map_ptr)) { - u32 lock = reg->map_ptr->spin_lock_off; + return 0; +} + +/* check read/write into a map element with possible variable offset */ +static int check_map_access(struct bpf_verifier_env *env, u32 regno, + int off, int size, bool zero_size_allowed) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *reg = &state->regs[regno]; + struct bpf_map *map = reg->map_ptr; + int err; + + err = check_mem_region_access(env, regno, off, size, map->value_size, + zero_size_allowed); + if (err) + return err; + + if (map_value_has_spin_lock(map)) { + u32 lock = map->spin_lock_off; /* if any part of struct bpf_spin_lock can be touched by * load/store reject this program. @@ -2546,21 +2665,6 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, } } -static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, - int off, int size, bool zero_size_allowed) -{ - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[regno]; - - if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || - (u64)off + size > reg->range) { - verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", - off, size, regno, reg->id, reg->off, reg->range); - return -EACCES; - } - return 0; -} - static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) { @@ -2581,16 +2685,17 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, regno); return -EACCES; } - err = __check_packet_access(env, regno, off, size, zero_size_allowed); + err = __check_mem_access(env, regno, off, size, reg->range, + zero_size_allowed); if (err) { verbose(env, "R%d offset is outside of the packet\n", regno); return err; } - /* __check_packet_access has made sure "off + size - 1" is within u16. + /* __check_mem_access has made sure "off + size - 1" is within u16. * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff, * otherwise find_good_pkt_pointers would have refused to set range info - * that __check_packet_access would have rejected this pkt access. + * that __check_mem_access would have rejected this pkt access. * Therefore, "off + reg->umax_value + size - 1" won't overflow u32. */ env->prog->aux->max_pkt_offset = @@ -2621,7 +2726,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - if (*reg_type == PTR_TO_BTF_ID) + if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) *btf_id = info.btf_id; else env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; @@ -2957,14 +3062,15 @@ int check_ctx_reg(struct bpf_verifier_env *env, return 0; } -static int check_tp_buffer_access(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, - int regno, int off, int size) +static int __check_buffer_access(struct bpf_verifier_env *env, + const char *buf_info, + const struct bpf_reg_state *reg, + int regno, int off, int size) { if (off < 0) { verbose(env, - "R%d invalid tracepoint buffer access: off=%d, size=%d", - regno, off, size); + "R%d invalid %s buffer access: off=%d, size=%d\n", + regno, buf_info, off, size); return -EACCES; } if (!tnum_is_const(reg->var_off) || reg->var_off.value) { @@ -2972,16 +3078,49 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env, tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, - "R%d invalid variable buffer offset: off=%d, var_off=%s", + "R%d invalid variable buffer offset: off=%d, var_off=%s\n", regno, off, tn_buf); return -EACCES; } + + return 0; +} + +static int check_tp_buffer_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int regno, int off, int size) +{ + int err; + + err = __check_buffer_access(env, "tracepoint", reg, regno, off, size); + if (err) + return err; + if (off + size > env->prog->aux->max_tp_access) env->prog->aux->max_tp_access = off + size; return 0; } +static int check_buffer_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int regno, int off, int size, + bool zero_size_allowed, + const char *buf_info, + u32 *max_access) +{ + int err; + + err = __check_buffer_access(env, buf_info, reg, regno, off, size); + if (err) + return err; + + if (off + size > *max_access) + *max_access = off + size; + + return 0; +} + /* BPF architecture zero extends alu32 ops into 64-bit registesr */ static void zext_32_to_64(struct bpf_reg_state *reg) { @@ -3099,19 +3238,68 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (ret < 0) return ret; - if (atype == BPF_READ && value_regno >= 0) { - if (ret == SCALAR_VALUE) { - mark_reg_unknown(env, regs, value_regno); - return 0; - } - mark_reg_known_zero(env, regs, value_regno); - regs[value_regno].type = PTR_TO_BTF_ID; - regs[value_regno].btf_id = btf_id; + if (atype == BPF_READ && value_regno >= 0) + mark_btf_ld_reg(env, regs, value_regno, ret, btf_id); + + return 0; +} + +static int check_ptr_to_map_access(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, + int regno, int off, int size, + enum bpf_access_type atype, + int value_regno) +{ + struct bpf_reg_state *reg = regs + regno; + struct bpf_map *map = reg->map_ptr; + const struct btf_type *t; + const char *tname; + u32 btf_id; + int ret; + + if (!btf_vmlinux) { + verbose(env, "map_ptr access not supported without CONFIG_DEBUG_INFO_BTF\n"); + return -ENOTSUPP; + } + + if (!map->ops->map_btf_id || !*map->ops->map_btf_id) { + verbose(env, "map_ptr access not supported for map type %d\n", + map->map_type); + return -ENOTSUPP; + } + + t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id); + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + + if (!env->allow_ptr_to_map_access) { + verbose(env, + "%s access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", + tname); + return -EPERM; + } + + if (off < 0) { + verbose(env, "R%d is %s invalid negative access: off=%d\n", + regno, tname, off); + return -EACCES; + } + + if (atype != BPF_READ) { + verbose(env, "only read from %s is supported\n", tname); + return -EACCES; } + ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id); + if (ret < 0) + return ret; + + if (value_regno >= 0) + mark_btf_ld_reg(env, regs, value_regno, ret, btf_id); + return 0; } + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -3170,6 +3358,16 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } } + } else if (reg->type == PTR_TO_MEM) { + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose(env, "R%d leaks addr into mem\n", value_regno); + return -EACCES; + } + err = check_mem_region_access(env, regno, off, size, + reg->mem_size, false); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; u32 btf_id = 0; @@ -3205,7 +3403,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * a sub-register. */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; - if (reg_type == PTR_TO_BTF_ID) + if (reg_type == PTR_TO_BTF_ID || + reg_type == PTR_TO_BTF_ID_OR_NULL) regs[value_regno].btf_id = btf_id; } regs[value_regno].type = reg_type; @@ -3269,6 +3468,26 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == PTR_TO_BTF_ID) { err = check_ptr_to_btf_access(env, regs, regno, off, size, t, value_regno); + } else if (reg->type == CONST_PTR_TO_MAP) { + err = check_ptr_to_map_access(env, regs, regno, off, size, t, + value_regno); + } else if (reg->type == PTR_TO_RDONLY_BUF) { + if (t == BPF_WRITE) { + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str[reg->type]); + return -EACCES; + } + err = check_buffer_access(env, reg, regno, off, size, false, + "rdonly", + &env->prog->aux->max_rdonly_access); + if (!err && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_RDWR_BUF) { + err = check_buffer_access(env, reg, regno, off, size, false, + "rdwr", + &env->prog->aux->max_rdwr_access); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -3390,7 +3609,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, * Spectre masking for stack ALU. * See also retrieve_ptr_limit(). */ - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v1) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); @@ -3452,6 +3671,11 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, *stype = STACK_MISC; goto mark; } + + if (state->stack[spi].slot_type[0] == STACK_SPILL && + state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID) + goto mark; + if (state->stack[spi].slot_type[0] == STACK_SPILL && state->stack[spi].spilled_ptr.type == SCALAR_VALUE) { __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); @@ -3501,6 +3725,22 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return -EACCES; return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); + case PTR_TO_MEM: + return check_mem_region_access(env, regno, reg->off, + access_size, reg->mem_size, + zero_size_allowed); + case PTR_TO_RDONLY_BUF: + if (meta && meta->raw_mode) + return -EACCES; + return check_buffer_access(env, reg, regno, reg->off, + access_size, zero_size_allowed, + "rdonly", + &env->prog->aux->max_rdonly_access); + case PTR_TO_RDWR_BUF: + return check_buffer_access(env, reg, regno, reg->off, + access_size, zero_size_allowed, + "rdwr", + &env->prog->aux->max_rdwr_access); default: /* scalar_value|ptr_to_stack or invalid ptr */ return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); @@ -3605,6 +3845,17 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } +static bool arg_type_is_alloc_mem_ptr(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_ALLOC_MEM || + type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; +} + +static bool arg_type_is_alloc_size(enum bpf_arg_type type) +{ + return type == ARG_CONST_ALLOC_SIZE_OR_ZERO; +} + static bool arg_type_is_int_ptr(enum bpf_arg_type type) { return type == ARG_PTR_TO_INT || @@ -3621,12 +3872,14 @@ static int int_ptr_type_to_size(enum bpf_arg_type type) return -EINVAL; } -static int check_func_arg(struct bpf_verifier_env *env, u32 regno, - enum bpf_arg_type arg_type, - struct bpf_call_arg_meta *meta) +static int check_func_arg(struct bpf_verifier_env *env, u32 arg, + struct bpf_call_arg_meta *meta, + const struct bpf_func_proto *fn) { + u32 regno = BPF_REG_1 + arg; struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_reg_type expected_type, type = reg->type; + enum bpf_arg_type arg_type = fn->arg_type[arg]; int err = 0; if (arg_type == ARG_DONTCARE) @@ -3664,7 +3917,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || - arg_type == ARG_CONST_SIZE_OR_ZERO) { + arg_type == ARG_CONST_SIZE_OR_ZERO || + arg_type == ARG_CONST_ALLOC_SIZE_OR_ZERO) { expected_type = SCALAR_VALUE; if (type != expected_type) goto err_type; @@ -3697,17 +3951,28 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } meta->ref_obj_id = reg->ref_obj_id; } - } else if (arg_type == ARG_PTR_TO_SOCKET) { + } else if (arg_type == ARG_PTR_TO_SOCKET || + arg_type == ARG_PTR_TO_SOCKET_OR_NULL) { expected_type = PTR_TO_SOCKET; - if (type != expected_type) - goto err_type; + if (!(register_is_null(reg) && + arg_type == ARG_PTR_TO_SOCKET_OR_NULL)) { + if (type != expected_type) + goto err_type; + } } else if (arg_type == ARG_PTR_TO_BTF_ID) { expected_type = PTR_TO_BTF_ID; if (type != expected_type) goto err_type; - if (reg->btf_id != meta->btf_id) { - verbose(env, "Helper has type %s got %s in R%d\n", - kernel_type_name(meta->btf_id), + if (!fn->check_btf_id) { + if (reg->btf_id != meta->btf_id) { + verbose(env, "Helper has type %s got %s in R%d\n", + kernel_type_name(meta->btf_id), + kernel_type_name(reg->btf_id), regno); + + return -EACCES; + } + } else if (!fn->check_btf_id(reg->btf_id, arg)) { + verbose(env, "Helper does not support %s in R%d\n", kernel_type_name(reg->btf_id), regno); return -EACCES; @@ -3735,13 +4000,31 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, * happens during stack boundary checking. */ if (register_is_null(reg) && - arg_type == ARG_PTR_TO_MEM_OR_NULL) + (arg_type == ARG_PTR_TO_MEM_OR_NULL || + arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL)) /* final test in check_stack_boundary() */; else if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && + type != PTR_TO_MEM && + type != PTR_TO_RDONLY_BUF && + type != PTR_TO_RDWR_BUF && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; + } else if (arg_type_is_alloc_mem_ptr(arg_type)) { + expected_type = PTR_TO_MEM; + if (register_is_null(reg) && + arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL) + /* final test in check_stack_boundary() */; + else if (type != expected_type) + goto err_type; + if (meta->ref_obj_id) { + verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, + meta->ref_obj_id); + return -EFAULT; + } + meta->ref_obj_id = reg->ref_obj_id; } else if (arg_type_is_int_ptr(arg_type)) { expected_type = PTR_TO_STACK; if (!type_is_pkt_pointer(type) && @@ -3837,6 +4120,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, zero_size_allowed, meta); if (!err) err = mark_chain_precision(env, regno); + } else if (arg_type_is_alloc_size(arg_type)) { + if (!tnum_is_const(reg->var_off)) { + verbose(env, "R%d unbounded size, use 'var &= const' or 'if (var < const)'\n", + regno); + return -EACCES; + } + meta->mem_size = reg->var_off.value; } else if (arg_type_is_int_ptr(arg_type)) { int size = int_ptr_type_to_size(arg_type); @@ -3873,6 +4163,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_xdp_output) goto error; break; + case BPF_MAP_TYPE_RINGBUF: + if (func_id != BPF_FUNC_ringbuf_output && + func_id != BPF_FUNC_ringbuf_reserve && + func_id != BPF_FUNC_ringbuf_submit && + func_id != BPF_FUNC_ringbuf_discard && + func_id != BPF_FUNC_ringbuf_query) + goto error; + break; case BPF_MAP_TYPE_STACK_TRACE: if (func_id != BPF_FUNC_get_stackid) goto error; @@ -3915,7 +4213,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_sock_map_update && func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_map && - func_id != BPF_FUNC_sk_select_reuseport) + func_id != BPF_FUNC_sk_select_reuseport && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; case BPF_MAP_TYPE_SOCKHASH: @@ -3923,7 +4222,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_sock_hash_update && func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_hash && - func_id != BPF_FUNC_sk_select_reuseport) + func_id != BPF_FUNC_sk_select_reuseport && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: @@ -4093,7 +4393,7 @@ static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id) /* A reference acquiring function cannot acquire * another refcounted ptr. */ - if (is_acquire_function(func_id) && count) + if (may_be_acquire_function(func_id) && count) return false; /* We only support one arg being unreferenced at the moment, @@ -4388,10 +4688,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, if (!BPF_MAP_PTR(aux->map_ptr_state)) bpf_map_ptr_store(aux, meta->map_ptr, - meta->map_ptr->unpriv_array); + !meta->map_ptr->bypass_spec_v1); else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr) bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, - meta->map_ptr->unpriv_array); + !meta->map_ptr->bypass_spec_v1); return 0; } @@ -4496,10 +4796,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ for (i = 0; i < 5; i++) { - err = btf_resolve_helper_id(&env->log, fn, i); - if (err > 0) - meta.btf_id = err; - err = check_func_arg(env, BPF_REG_1 + i, fn->arg_type[i], &meta); + if (!fn->check_btf_id) { + err = btf_resolve_helper_id(&env->log, fn, i); + if (err > 0) + meta.btf_id = err; + } + err = check_func_arg(env, i, &meta, fn); if (err) return err; } @@ -4597,6 +4899,23 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; + } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; + regs[BPF_REG_0].mem_size = meta.mem_size; + } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { + int ret_btf_id; + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + ret_btf_id = *fn->ret_btf_id; + if (ret_btf_id == 0) { + verbose(env, "invalid return type %d of func %s#%d\n", + fn->ret_type, func_id_name(func_id), func_id); + return -EINVAL; + } + regs[BPF_REG_0].btf_id = ret_btf_id; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -4606,7 +4925,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (is_ptr_cast_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; - } else if (is_acquire_function(func_id)) { + } else if (is_acquire_function(func_id, meta.map_ptr)) { int id = acquire_reference_state(env, insn_idx); if (id < 0) @@ -4623,7 +4942,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (err) return err; - if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) { + if ((func_id == BPF_FUNC_get_stack || + func_id == BPF_FUNC_get_task_stack) && + !env->prog->has_callchain_buf) { const char *err_str; #ifdef CONFIG_PERF_EVENTS @@ -4641,6 +4962,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn env->prog->has_callchain_buf = true; } + if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack) + env->prog->call_get_stack = true; + if (changes_data) clear_all_pkt_pointers(env); return 0; @@ -4760,7 +5084,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, const struct bpf_insn *insn) { - return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K; + return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K; } static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, @@ -4878,6 +5202,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops on pointers produce (meaningless) scalars */ + if (opcode == BPF_SUB && env->allow_ptr_leaks) { + __mark_reg_unknown(env, dst_reg); + return 0; + } + verbose(env, "R%d 32-bit pointer arithmetic prohibited\n", dst); @@ -5070,7 +5399,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* For unprivileged we require that resulting offset must be in bounds * in order to be able to sanitize access later on. */ - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v1) { if (dst_reg->type == PTR_TO_MAP_VALUE && check_map_access(env, dst, dst_reg->off, 1, false)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " @@ -5611,7 +5940,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, { struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); - bool src_known, dst_known; + bool src_known; s64 smin_val, smax_val; u64 umin_val, umax_val; s32 s32_min_val, s32_max_val; @@ -5633,7 +5962,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, if (alu32) { src_known = tnum_subreg_is_const(src_reg.var_off); - dst_known = tnum_subreg_is_const(dst_reg->var_off); if ((src_known && (s32_min_val != s32_max_val || u32_min_val != u32_max_val)) || s32_min_val > s32_max_val || u32_min_val > u32_max_val) { @@ -5645,7 +5973,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, } } else { src_known = tnum_is_const(src_reg.var_off); - dst_known = tnum_is_const(dst_reg->var_off); if ((src_known && (smin_val != smax_val || umin_val != umax_val)) || smin_val > smax_val || umin_val > umax_val) { @@ -6261,8 +6588,25 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, bool is_jmp32) { - if (__is_pointer_value(false, reg)) - return -1; + if (__is_pointer_value(false, reg)) { + if (!reg_type_not_null(reg->type)) + return -1; + + /* If pointer is valid tests against zero will fail so we can + * use this to direct branch taken. + */ + if (val != 0) + return -1; + + switch (opcode) { + case BPF_JEQ: + return 0; + case BPF_JNE: + return 1; + default: + return -1; + } + } if (is_jmp32) return is_branch32_taken(reg, val, opcode); @@ -6517,12 +6861,16 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, if (is_null) { reg->type = SCALAR_VALUE; } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - if (reg->map_ptr->inner_map_meta) { + const struct bpf_map *map = reg->map_ptr; + + if (map->inner_map_meta) { reg->type = CONST_PTR_TO_MAP; - reg->map_ptr = reg->map_ptr->inner_map_meta; - } else if (reg->map_ptr->map_type == - BPF_MAP_TYPE_XSKMAP) { + reg->map_ptr = map->inner_map_meta; + } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; + } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH) { + reg->type = PTR_TO_SOCKET; } else { reg->type = PTR_TO_MAP_VALUE; } @@ -6532,6 +6880,14 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, reg->type = PTR_TO_SOCK_COMMON; } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { reg->type = PTR_TO_TCP_SOCK; + } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { + reg->type = PTR_TO_BTF_ID; + } else if (reg->type == PTR_TO_MEM_OR_NULL) { + reg->type = PTR_TO_MEM; + } else if (reg->type == PTR_TO_RDONLY_BUF_OR_NULL) { + reg->type = PTR_TO_RDONLY_BUF; + } else if (reg->type == PTR_TO_RDWR_BUF_OR_NULL) { + reg->type = PTR_TO_RDWR_BUF; } if (is_null) { /* We don't need id and ref_obj_id from this point @@ -6755,7 +7111,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } if (pred >= 0) { - err = mark_chain_precision(env, insn->dst_reg); + /* If we get here with a dst_reg pointer type it is because + * above is_branch_taken() special cased the 0 comparison. + */ + if (!__is_pointer_value(false, dst_reg)) + err = mark_chain_precision(env, insn->dst_reg); if (BPF_SRC(insn->code) == BPF_X && !err) err = mark_chain_precision(env, insn->src_reg); if (err) @@ -7041,7 +7401,11 @@ static int check_return_code(struct bpf_verifier_env *env) switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) + env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) range = tnum_range(1, 1); break; case BPF_PROG_TYPE_CGROUP_SKB: @@ -7070,10 +7434,15 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_TRACE_RAW_TP: case BPF_MODIFY_RETURN: return 0; + case BPF_TRACE_ITER: + break; default: return -ENOTSUPP; } break; + case BPF_PROG_TYPE_SK_LOOKUP: + range = tnum_range(SK_DROP, SK_PASS); + break; case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value * depends on the to-be-replaced kernel func or bpf program. @@ -7206,7 +7575,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, insn_stack[env->cfg.cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - if (loop_ok && env->allow_ptr_leaks) + if (loop_ok && env->bpf_capable) return 0; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); @@ -7366,7 +7735,7 @@ static int check_btf_func(struct bpf_verifier_env *env, const struct btf *btf; void __user *urecord; u32 prev_offset = 0; - int ret = 0; + int ret = -ENOMEM; nfuncs = attr->func_info_cnt; if (!nfuncs) @@ -8315,7 +8684,7 @@ next: if (env->max_states_per_insn < states_cnt) env->max_states_per_insn = states_cnt; - if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) + if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) return push_jmp_history(env, cur); if (!add_new_state) @@ -8402,6 +8771,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID_OR_NULL: return false; default: return true; @@ -8428,6 +8798,7 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) static int do_check(struct bpf_verifier_env *env) { + bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state = env->cur_state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; @@ -8704,7 +9075,7 @@ static int do_check(struct bpf_verifier_env *env) process_bpf_exit: update_branch_counts(env, env->cur_state); err = pop_stack(env, &prev_insn_idx, - &env->insn_idx); + &env->insn_idx, pop_log); if (err < 0) { if (err != -ENOENT) return err; @@ -9613,7 +9984,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) int i, j, subprog_start, subprog_end = 0, len, subprog; struct bpf_insn *insn; void *old_bpf_func; - int err; + int err, num_exentries; if (env->subprog_cnt <= 1) return 0; @@ -9688,6 +10059,14 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->nr_linfo = prog->aux->nr_linfo; func[i]->aux->jited_linfo = prog->aux->jited_linfo; func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx; + num_exentries = 0; + insn = func[i]->insnsi; + for (j = 0; j < func[i]->len; j++, insn++) { + if (BPF_CLASS(insn->code) == BPF_LDX && + BPF_MODE(insn->code) == BPF_PROBE_MEM) + num_exentries++; + } + func[i]->aux->num_exentries = num_exentries; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { err = -ENOTSUPP; @@ -9974,7 +10353,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; - if (env->allow_ptr_leaks && !expect_blinding && + if (env->bpf_capable && !expect_blinding && prog->jit_requested && !bpf_map_key_poisoned(aux) && !bpf_map_ptr_poisoned(aux) && @@ -10227,6 +10606,7 @@ static void sanitize_insn_aux_data(struct bpf_verifier_env *env) static int do_check_common(struct bpf_verifier_env *env, int subprog) { + bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state; struct bpf_reg_state *regs; int ret, i; @@ -10289,7 +10669,9 @@ out: free_verifier_state(env->cur_state, true); env->cur_state = NULL; } - while (!pop_stack(env, NULL, NULL)); + while (!pop_stack(env, NULL, NULL, false)); + if (!ret && pop_log) + bpf_vlog_reset(&env->log, 0); free_states(env); if (ret) /* clean aux data in case subprog was rejected */ @@ -10428,22 +10810,13 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) } #define SECURITY_PREFIX "security_" -static int check_attach_modify_return(struct bpf_verifier_env *env) +static int check_attach_modify_return(struct bpf_prog *prog, unsigned long addr) { - struct bpf_prog *prog = env->prog; - unsigned long addr = (unsigned long) prog->aux->trampoline->func.addr; - - /* This is expected to be cleaned up in the future with the KRSI effort - * introducing the LSM_HOOK macro for cleaning up lsm_hooks.h. - */ if (within_error_injection_list(addr) || !strncmp(SECURITY_PREFIX, prog->aux->attach_func_name, sizeof(SECURITY_PREFIX) - 1)) return 0; - verbose(env, "fmod_ret attach_btf_id %u (%s) is not modifiable\n", - prog->aux->attach_btf_id, prog->aux->attach_func_name); - return -EINVAL; } @@ -10454,6 +10827,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) struct bpf_prog *tgt_prog = prog->aux->linked_prog; u32 btf_id = prog->aux->attach_btf_id; const char prefix[] = "btf_trace_"; + struct btf_func_model fmodel; int ret = 0, subprog = -1, i; struct bpf_trampoline *tr; const struct btf_type *t; @@ -10595,6 +10969,22 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; prog->aux->attach_btf_trace = true; return 0; + case BPF_TRACE_ITER: + if (!btf_type_is_func(t)) { + verbose(env, "attach_btf_id %u is not a function\n", + btf_id); + return -EINVAL; + } + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_func_proto(t)) + return -EINVAL; + prog->aux->attach_func_name = tname; + prog->aux->attach_func_proto = t; + if (!bpf_iter_prog_supported(prog)) + return -EINVAL; + ret = btf_distill_func_proto(&env->log, btf, t, + tname, &fmodel); + return ret; default: if (!prog_extension) return -EINVAL; @@ -10654,11 +11044,18 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) goto out; } } + + if (prog->expected_attach_type == BPF_MODIFY_RETURN) { + ret = check_attach_modify_return(prog, addr); + if (ret) + verbose(env, "%s() is not modifiable\n", + prog->aux->attach_func_name); + } + + if (ret) + goto out; tr->func.addr = (void *)addr; prog->aux->trampoline = tr; - - if (prog->expected_attach_type == BPF_MODIFY_RETURN) - ret = check_attach_modify_return(env); out: mutex_unlock(&tr->mutex); if (ret) @@ -10698,7 +11095,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->insn_aux_data[i].orig_idx = i; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; - is_priv = capable(CAP_SYS_ADMIN); + is_priv = bpf_capable(); if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { mutex_lock(&bpf_verifier_lock); @@ -10739,7 +11136,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; - env->allow_ptr_leaks = is_priv; + env->allow_ptr_leaks = bpf_allow_ptr_leaks(); + env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access(); + env->bypass_spec_v1 = bpf_bypass_spec_v1(); + env->bypass_spec_v4 = bpf_bypass_spec_v4(); + env->bpf_capable = bpf_capable(); if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c deleted file mode 100644 index 2cc5c8f4c800..000000000000 --- a/kernel/bpf/xskmap.c +++ /dev/null @@ -1,265 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* XSKMAP used for AF_XDP sockets - * Copyright(c) 2018 Intel Corporation. - */ - -#include <linux/bpf.h> -#include <linux/capability.h> -#include <net/xdp_sock.h> -#include <linux/slab.h> -#include <linux/sched.h> - -int xsk_map_inc(struct xsk_map *map) -{ - bpf_map_inc(&map->map); - return 0; -} - -void xsk_map_put(struct xsk_map *map) -{ - bpf_map_put(&map->map); -} - -static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, - struct xdp_sock **map_entry) -{ - struct xsk_map_node *node; - int err; - - node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); - if (!node) - return ERR_PTR(-ENOMEM); - - err = xsk_map_inc(map); - if (err) { - kfree(node); - return ERR_PTR(err); - } - - node->map = map; - node->map_entry = map_entry; - return node; -} - -static void xsk_map_node_free(struct xsk_map_node *node) -{ - xsk_map_put(node->map); - kfree(node); -} - -static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) -{ - spin_lock_bh(&xs->map_list_lock); - list_add_tail(&node->node, &xs->map_list); - spin_unlock_bh(&xs->map_list_lock); -} - -static void xsk_map_sock_delete(struct xdp_sock *xs, - struct xdp_sock **map_entry) -{ - struct xsk_map_node *n, *tmp; - - spin_lock_bh(&xs->map_list_lock); - list_for_each_entry_safe(n, tmp, &xs->map_list, node) { - if (map_entry == n->map_entry) { - list_del(&n->node); - xsk_map_node_free(n); - } - } - spin_unlock_bh(&xs->map_list_lock); -} - -static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) -{ - struct bpf_map_memory mem; - int err, numa_node; - struct xsk_map *m; - u64 size; - - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - - if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || - attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)) - return ERR_PTR(-EINVAL); - - numa_node = bpf_map_attr_numa_node(attr); - size = struct_size(m, xsk_map, attr->max_entries); - - err = bpf_map_charge_init(&mem, size); - if (err < 0) - return ERR_PTR(err); - - m = bpf_map_area_alloc(size, numa_node); - if (!m) { - bpf_map_charge_finish(&mem); - return ERR_PTR(-ENOMEM); - } - - bpf_map_init_from_attr(&m->map, attr); - bpf_map_charge_move(&m->map.memory, &mem); - spin_lock_init(&m->lock); - - return &m->map; -} - -static void xsk_map_free(struct bpf_map *map) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - - bpf_clear_redirect_map(map); - synchronize_net(); - bpf_map_area_free(m); -} - -static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - u32 index = key ? *(u32 *)key : U32_MAX; - u32 *next = next_key; - - if (index >= m->map.max_entries) { - *next = 0; - return 0; - } - - if (index == m->map.max_entries - 1) - return -ENOENT; - *next = index + 1; - return 0; -} - -static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) -{ - const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; - struct bpf_insn *insn = insn_buf; - - *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); - *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *))); - *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map)); - *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0); - *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); - *insn++ = BPF_MOV64_IMM(ret, 0); - return insn - insn_buf; -} - -static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return __xsk_map_lookup_elem(map, *(u32 *)key); -} - -static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) -{ - return ERR_PTR(-EOPNOTSUPP); -} - -static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *xs, *old_xs, **map_entry; - u32 i = *(u32 *)key, fd = *(u32 *)value; - struct xsk_map_node *node; - struct socket *sock; - int err; - - if (unlikely(map_flags > BPF_EXIST)) - return -EINVAL; - if (unlikely(i >= m->map.max_entries)) - return -E2BIG; - - sock = sockfd_lookup(fd, &err); - if (!sock) - return err; - - if (sock->sk->sk_family != PF_XDP) { - sockfd_put(sock); - return -EOPNOTSUPP; - } - - xs = (struct xdp_sock *)sock->sk; - - if (!xsk_is_setup_for_bpf_map(xs)) { - sockfd_put(sock); - return -EOPNOTSUPP; - } - - map_entry = &m->xsk_map[i]; - node = xsk_map_node_alloc(m, map_entry); - if (IS_ERR(node)) { - sockfd_put(sock); - return PTR_ERR(node); - } - - spin_lock_bh(&m->lock); - old_xs = READ_ONCE(*map_entry); - if (old_xs == xs) { - err = 0; - goto out; - } else if (old_xs && map_flags == BPF_NOEXIST) { - err = -EEXIST; - goto out; - } else if (!old_xs && map_flags == BPF_EXIST) { - err = -ENOENT; - goto out; - } - xsk_map_sock_add(xs, node); - WRITE_ONCE(*map_entry, xs); - if (old_xs) - xsk_map_sock_delete(old_xs, map_entry); - spin_unlock_bh(&m->lock); - sockfd_put(sock); - return 0; - -out: - spin_unlock_bh(&m->lock); - sockfd_put(sock); - xsk_map_node_free(node); - return err; -} - -static int xsk_map_delete_elem(struct bpf_map *map, void *key) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *old_xs, **map_entry; - int k = *(u32 *)key; - - if (k >= map->max_entries) - return -EINVAL; - - spin_lock_bh(&m->lock); - map_entry = &m->xsk_map[k]; - old_xs = xchg(map_entry, NULL); - if (old_xs) - xsk_map_sock_delete(old_xs, map_entry); - spin_unlock_bh(&m->lock); - - return 0; -} - -void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, - struct xdp_sock **map_entry) -{ - spin_lock_bh(&map->lock); - if (READ_ONCE(*map_entry) == xs) { - WRITE_ONCE(*map_entry, NULL); - xsk_map_sock_delete(xs, map_entry); - } - spin_unlock_bh(&map->lock); -} - -const struct bpf_map_ops xsk_map_ops = { - .map_alloc = xsk_map_alloc, - .map_free = xsk_map_free, - .map_get_next_key = xsk_map_get_next_key, - .map_lookup_elem = xsk_map_lookup_elem, - .map_gen_lookup = xsk_map_gen_lookup, - .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, - .map_update_elem = xsk_map_update_elem, - .map_delete_elem = xsk_map_delete_elem, - .map_check_btf = map_check_no_btf, -}; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 06b5ea9d899d..dd247747ec14 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -153,11 +153,7 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = { static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu); -/* - * The default hierarchy, reserved for the subsystems that are otherwise - * unattached - it never has more than a single cgroup, and all tasks are - * part of that cgroup. - */ +/* the default hierarchy */ struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); @@ -251,9 +247,6 @@ bool cgroup_ssid_enabled(int ssid) * cases where a subsystem should behave differnetly depending on the * interface version. * - * The set of behaviors which change on the default hierarchy are still - * being determined and the mount option is prefixed with __DEVEL__. - * * List of changed behaviors: * * - Mount options "noprefix", "xattr", "clone_children", "release_agent" @@ -4881,7 +4874,6 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cpu.stat", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_stat_show, }, #ifdef CONFIG_PSI @@ -6447,18 +6439,8 @@ void cgroup_sk_alloc_disable(void) void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - if (cgroup_sk_alloc_disabled) - return; - - /* Socket clone path */ - if (skcd->val) { - /* - * We might be cloning a socket which is left in an empty - * cgroup and the cgroup might have already been rmdir'd. - * Don't use cgroup_get_live(). - */ - cgroup_get(sock_cgroup_ptr(skcd)); - cgroup_bpf_get(sock_cgroup_ptr(skcd)); + if (cgroup_sk_alloc_disabled) { + skcd->no_refcnt = 1; return; } @@ -6483,10 +6465,27 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) rcu_read_unlock(); } +void cgroup_sk_clone(struct sock_cgroup_data *skcd) +{ + if (skcd->val) { + if (skcd->no_refcnt) + return; + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). + */ + cgroup_get(sock_cgroup_ptr(skcd)); + cgroup_bpf_get(sock_cgroup_ptr(skcd)); + } +} + void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); + if (skcd->no_refcnt) + return; cgroup_bpf_put(cgrp); cgroup_put(cgrp); } @@ -6508,33 +6507,6 @@ int cgroup_bpf_attach(struct cgroup *cgrp, return ret; } -int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *old_prog, - struct bpf_prog *new_prog) -{ - struct bpf_cgroup_link *cg_link; - int ret; - - if (link->ops != &bpf_cgroup_link_lops) - return -EINVAL; - - cg_link = container_of(link, struct bpf_cgroup_link, link); - - mutex_lock(&cgroup_mutex); - /* link might have been auto-released by dying cgroup, so fail */ - if (!cg_link->cgroup) { - ret = -EINVAL; - goto out_unlock; - } - if (old_prog && link->prog != old_prog) { - ret = -EPERM; - goto out_unlock; - } - ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog); -out_unlock: - mutex_unlock(&cgroup_mutex); - return ret; -} - int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type) { diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 729d3a5c772e..642415b8c3c9 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1655,7 +1655,7 @@ static void update_tasks_nodemask(struct cpuset *cs) guarantee_online_mems(cs, &newmems); /* - * The mpol_rebind_mm() call takes mmap_sem, which we couldn't + * The mpol_rebind_mm() call takes mmap_lock, which we couldn't * take while holding tasklist_lock. Forks can happen - the * mpol_dup() cpuset_being_rebound check will catch such forks, * and rebind their vma mempolicies too. Because we still hold @@ -1760,7 +1760,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * * Call with cpuset_mutex held. May take callback_lock during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, - * lock each such tasks mm->mmap_sem, scan its vma's and rebind + * lock each such tasks mm->mmap_lock, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. */ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index b05f1dd58a62..812a61afd538 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -95,11 +95,12 @@ static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) return container_of(ns, struct cgroup_namespace, ns); } -static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { + struct nsproxy *nsproxy = nsset->nsproxy; struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); - if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || + if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN) || !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 6f87352f8219..d51175cedfca 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -33,12 +33,9 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) return; /* - * Paired with the one in cgroup_rstat_cpu_pop_updated(). Either we - * see NULL updated_next or they see our updated stat. - */ - smp_mb(); - - /* + * Speculative already-on-list test. This may race leading to + * temporary inaccuracies, which is fine. + * * Because @parent's updated_children is terminated with @parent * instead of NULL, we can tell whether @cgrp is on the list by * testing the next pointer for NULL. @@ -67,7 +64,6 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) raw_spin_unlock_irqrestore(cpu_lock, flags); } -EXPORT_SYMBOL_GPL(cgroup_rstat_updated); /** * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree @@ -134,13 +130,6 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, *nextp = rstatc->updated_next; rstatc->updated_next = NULL; - /* - * Paired with the one in cgroup_rstat_cpu_updated(). - * Either they see NULL updated_next or we see their - * updated stat. - */ - smp_mb(); - return pos; } @@ -399,18 +388,62 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp, cgroup_base_stat_cputime_account_end(cgrp, rstatc); } +/* + * compute the cputime for the root cgroup by getting the per cpu data + * at a global level, then categorizing the fields in a manner consistent + * with how it is done by __cgroup_account_cputime_field for each bit of + * cpu time attributed to a cgroup. + */ +static void root_cgroup_cputime(struct task_cputime *cputime) +{ + int i; + + cputime->stime = 0; + cputime->utime = 0; + cputime->sum_exec_runtime = 0; + for_each_possible_cpu(i) { + struct kernel_cpustat kcpustat; + u64 *cpustat = kcpustat.cpustat; + u64 user = 0; + u64 sys = 0; + + kcpustat_cpu_fetch(&kcpustat, i); + + user += cpustat[CPUTIME_USER]; + user += cpustat[CPUTIME_NICE]; + cputime->utime += user; + + sys += cpustat[CPUTIME_SYSTEM]; + sys += cpustat[CPUTIME_IRQ]; + sys += cpustat[CPUTIME_SOFTIRQ]; + cputime->stime += sys; + + cputime->sum_exec_runtime += user; + cputime->sum_exec_runtime += sys; + cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL]; + cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST]; + cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE]; + } +} + void cgroup_base_stat_cputime_show(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; u64 usage, utime, stime; - - if (!cgroup_parent(cgrp)) - return; - - cgroup_rstat_flush_hold(cgrp); - usage = cgrp->bstat.cputime.sum_exec_runtime; - cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime); - cgroup_rstat_flush_release(); + struct task_cputime cputime; + + if (cgroup_parent(cgrp)) { + cgroup_rstat_flush_hold(cgrp); + usage = cgrp->bstat.cputime.sum_exec_runtime; + cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, + &utime, &stime); + cgroup_rstat_flush_release(); + } else { + root_cgroup_cputime(&cputime); + usage = cputime.sum_exec_runtime; + utime = cputime.utime; + stime = cputime.stime; + } do_div(usage, NSEC_PER_USEC); do_div(utime, NSEC_PER_USEC); diff --git a/kernel/compat.c b/kernel/compat.c index 843dd17e6078..b8d2800bb4b7 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -199,7 +199,7 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - if (!user_access_begin(umask, bitmap_size / 8)) + if (!user_read_access_begin(umask, bitmap_size / 8)) return -EFAULT; while (nr_compat_longs > 1) { @@ -211,11 +211,11 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, } if (nr_compat_longs) unsafe_get_user(*mask, umask++, Efault); - user_access_end(); + user_read_access_end(); return 0; Efault: - user_access_end(); + user_read_access_end(); return -EFAULT; } @@ -228,7 +228,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - if (!user_access_begin(umask, bitmap_size / 8)) + if (!user_write_access_begin(umask, bitmap_size / 8)) return -EFAULT; while (nr_compat_longs > 1) { @@ -239,10 +239,10 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, } if (nr_compat_longs) unsafe_put_user((compat_ulong_t)*mask, umask++, Efault); - user_access_end(); + user_write_access_end(); return 0; Efault: - user_access_end(); + user_write_access_end(); return -EFAULT; } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index ce430885c26c..36a98c48aedc 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(context_tracking_key); DEFINE_PER_CPU(struct context_tracking, context_tracking); EXPORT_SYMBOL_GPL(context_tracking); -static bool context_tracking_recursion_enter(void) +static noinstr bool context_tracking_recursion_enter(void) { int recursion; @@ -45,7 +45,7 @@ static bool context_tracking_recursion_enter(void) return false; } -static void context_tracking_recursion_exit(void) +static __always_inline void context_tracking_recursion_exit(void) { __this_cpu_dec(context_tracking.recursion); } @@ -59,7 +59,7 @@ static void context_tracking_recursion_exit(void) * instructions to execute won't use any RCU read side critical section * because this function sets RCU in extended quiescent state. */ -void __context_tracking_enter(enum ctx_state state) +void noinstr __context_tracking_enter(enum ctx_state state) { /* Kernel threads aren't supposed to go to userspace */ WARN_ON_ONCE(!current->mm); @@ -77,8 +77,10 @@ void __context_tracking_enter(enum ctx_state state) * on the tick. */ if (state == CONTEXT_USER) { + instrumentation_begin(); trace_user_enter(0); vtime_user_enter(current); + instrumentation_end(); } rcu_user_enter(); } @@ -99,7 +101,6 @@ void __context_tracking_enter(enum ctx_state state) } context_tracking_recursion_exit(); } -NOKPROBE_SYMBOL(__context_tracking_enter); EXPORT_SYMBOL_GPL(__context_tracking_enter); void context_tracking_enter(enum ctx_state state) @@ -142,7 +143,7 @@ NOKPROBE_SYMBOL(context_tracking_user_enter); * This call supports re-entrancy. This way it can be called from any exception * handler without needing to know if we came from userspace or not. */ -void __context_tracking_exit(enum ctx_state state) +void noinstr __context_tracking_exit(enum ctx_state state) { if (!context_tracking_recursion_enter()) return; @@ -155,15 +156,16 @@ void __context_tracking_exit(enum ctx_state state) */ rcu_user_exit(); if (state == CONTEXT_USER) { + instrumentation_begin(); vtime_user_exit(current); trace_user_exit(0); + instrumentation_end(); } } __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); } context_tracking_recursion_exit(); } -NOKPROBE_SYMBOL(__context_tracking_exit); EXPORT_SYMBOL_GPL(__context_tracking_exit); void context_tracking_exit(enum ctx_state state) diff --git a/kernel/cpu.c b/kernel/cpu.c index 2371292f30b0..6ff2578ecf17 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -3,6 +3,7 @@ * * This code is licenced under the GPL. */ +#include <linux/sched/mm.h> #include <linux/proc_fs.h> #include <linux/smp.h> #include <linux/init.h> @@ -432,7 +433,7 @@ static inline bool cpu_smt_allowed(unsigned int cpu) /* * On x86 it's required to boot all logical CPUs at least once so * that the init code can get a chance to set CR4.MCE on each - * CPU. Otherwise, a broadacasted MCE observing CR4.MCE=0b on any + * CPU. Otherwise, a broadcasted MCE observing CR4.MCE=0b on any * core will shutdown the machine. */ return !cpumask_test_cpu(cpu, &cpus_booted_once_mask); @@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu) return bringup_wait_for_ap(cpu); } +static int finish_cpu(unsigned int cpu) +{ + struct task_struct *idle = idle_thread_get(cpu); + struct mm_struct *mm = idle->active_mm; + + /* + * idle_task_exit() will have switched to &init_mm, now + * clean up any remaining active_mm state. + */ + if (mm != &init_mm) + idle->active_mm = &init_mm; + mmdrop(mm); + return 0; +} + /* * Hotplug state machine related functions */ @@ -1327,7 +1343,7 @@ void bringup_nonboot_cpus(unsigned int setup_max_cpus) #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; -int __freeze_secondary_cpus(int primary, bool suspend) +int freeze_secondary_cpus(int primary) { int cpu, error = 0; @@ -1352,7 +1368,7 @@ int __freeze_secondary_cpus(int primary, bool suspend) if (cpu == primary) continue; - if (suspend && pm_wakeup_pending()) { + if (pm_wakeup_pending()) { pr_info("Wakeup pending. Abort CPU freeze\n"); error = -EBUSY; break; @@ -1376,8 +1392,8 @@ int __freeze_secondary_cpus(int primary, bool suspend) /* * Make sure the CPUs won't be enabled by someone else. We need to do - * this even in case of failure as all disable_nonboot_cpus() users are - * supposed to do enable_nonboot_cpus() on the failure path. + * this even in case of failure as all freeze_secondary_cpus() users are + * supposed to do thaw_secondary_cpus() on the failure path. */ cpu_hotplug_disabled++; @@ -1385,15 +1401,15 @@ int __freeze_secondary_cpus(int primary, bool suspend) return error; } -void __weak arch_enable_nonboot_cpus_begin(void) +void __weak arch_thaw_secondary_cpus_begin(void) { } -void __weak arch_enable_nonboot_cpus_end(void) +void __weak arch_thaw_secondary_cpus_end(void) { } -void enable_nonboot_cpus(void) +void thaw_secondary_cpus(void) { int cpu, error; @@ -1405,7 +1421,7 @@ void enable_nonboot_cpus(void) pr_info("Enabling non-boot CPUs ...\n"); - arch_enable_nonboot_cpus_begin(); + arch_thaw_secondary_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { trace_suspend_resume(TPS("CPU_ON"), cpu, true); @@ -1418,7 +1434,7 @@ void enable_nonboot_cpus(void) pr_warn("Error taking CPU%d up: %d\n", cpu, error); } - arch_enable_nonboot_cpus_end(); + arch_thaw_secondary_cpus_end(); cpumask_clear(frozen_cpus); out: @@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup.single = bringup_cpu, - .teardown.single = NULL, + .teardown.single = finish_cpu, .cant_stop = true, }, /* Final state before CPU kills itself */ diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index cbca6879ab7d..44a259338e33 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); */ int cpu_pm_enter(void) { - int nr_calls; + int nr_calls = 0; int ret = 0; ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls); @@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit); */ int cpu_cluster_pm_enter(void) { - int nr_calls; + int nr_calls = 0; int ret = 0; ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 9f1557b98468..18175687133a 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -413,6 +413,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); VMCOREINFO_STRUCT_SIZE(mem_section); VMCOREINFO_OFFSET(mem_section, section_mem_map); + VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS); #endif VMCOREINFO_STRUCT_SIZE(page); VMCOREINFO_STRUCT_SIZE(pglist_data); diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index 9c23ae074b40..92da32275af5 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -6,12 +6,6 @@ #include <linux/export.h> /* - * If we have booted due to a crash, max_pfn will be a very low value. We need - * to know the amount of memory that the previous kernel used. - */ -unsigned long saved_max_pfn; - -/* * stores the physical address of elf header of crash image * * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by diff --git a/kernel/cred.c b/kernel/cred.c index 71a792616917..421b1149c651 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -315,6 +315,9 @@ struct cred *prepare_exec_creds(void) new->process_keyring = NULL; #endif + new->suid = new->fsuid = new->euid; + new->sgid = new->fsgid = new->egid; + return new; } diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 2b7c9b67931d..b16dbc1bf056 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -67,9 +67,7 @@ static int kgdb_break_asap; struct debuggerinfo_struct kgdb_info[NR_CPUS]; -/** - * kgdb_connected - Is a host GDB connected to us? - */ +/* kgdb_connected - Is a host GDB connected to us? */ int kgdb_connected; EXPORT_SYMBOL_GPL(kgdb_connected); @@ -171,18 +169,18 @@ int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) { int err; - err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, + err = copy_from_kernel_nofault(bpt->saved_instr, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE); if (err) return err; - err = probe_kernel_write((char *)bpt->bpt_addr, + err = copy_to_kernel_nofault((char *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); return err; } int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) { - return probe_kernel_write((char *)bpt->bpt_addr, + return copy_to_kernel_nofault((char *)bpt->bpt_addr, (char *)bpt->saved_instr, BREAK_INSTR_SIZE); } @@ -417,6 +415,18 @@ int kgdb_isremovedbreak(unsigned long addr) return 0; } +int kgdb_has_hit_break(unsigned long addr) +{ + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state == BP_ACTIVE && + kgdb_break[i].bpt_addr == addr) + return 1; + } + return 0; +} + int dbg_remove_all_break(void) { int error; @@ -532,6 +542,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) if (exception_level > 1) { dump_stack(); + kgdb_io_module_registered = false; panic("Recursive entry to debugger"); } @@ -576,6 +587,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, arch_kgdb_ops.disable_hw_break(regs); acquirelock: + rcu_read_lock(); /* * Interrupts will be restored by the 'trap return' code, except when * single stepping. @@ -635,6 +647,7 @@ return_normal: atomic_dec(&slaves_in_kgdb); dbg_touch_watchdogs(); local_irq_restore(flags); + rcu_read_unlock(); return 0; } cpu_relax(); @@ -653,6 +666,7 @@ return_normal: raw_spin_unlock(&dbg_master_lock); dbg_touch_watchdogs(); local_irq_restore(flags); + rcu_read_unlock(); goto acquirelock; } @@ -668,6 +682,8 @@ return_normal: if (kgdb_skipexception(ks->ex_vector, ks->linux_regs)) goto kgdb_restore; + atomic_inc(&ignore_console_lock_warning); + /* Call the I/O driver's pre_exception routine */ if (dbg_io_ops->pre_exception) dbg_io_ops->pre_exception(); @@ -740,6 +756,8 @@ cpu_master_loop: if (dbg_io_ops->post_exception) dbg_io_ops->post_exception(); + atomic_dec(&ignore_console_lock_warning); + if (!kgdb_single_step) { raw_spin_unlock(&dbg_slave_lock); /* Wait till all the CPUs have quit from the debugger. */ @@ -772,6 +790,7 @@ kgdb_restore: raw_spin_unlock(&dbg_master_lock); dbg_touch_watchdogs(); local_irq_restore(flags); + rcu_read_unlock(); return kgdb_info[cpu].ret_state; } @@ -920,7 +939,7 @@ static void sysrq_handle_dbg(int key) kgdb_breakpoint(); } -static struct sysrq_key_op sysrq_dbg_op = { +static const struct sysrq_key_op sysrq_dbg_op = { .handler = sysrq_handle_dbg, .help_msg = "debug(g)", .action_msg = "DEBUG", @@ -946,6 +965,14 @@ void kgdb_panic(const char *msg) kgdb_breakpoint(); } +static void kgdb_initial_breakpoint(void) +{ + kgdb_break_asap = 0; + + pr_crit("Waiting for connection from remote gdb...\n"); + kgdb_breakpoint(); +} + void __weak kgdb_arch_late(void) { } @@ -956,6 +983,9 @@ void __init dbg_late_init(void) if (kgdb_io_module_registered) kgdb_arch_late(); kdb_init(KDB_INIT_FULL); + + if (kgdb_io_module_registered && kgdb_break_asap) + kgdb_initial_breakpoint(); } static int @@ -1038,7 +1068,7 @@ static void kgdb_tasklet_bpt(unsigned long ing) atomic_set(&kgdb_break_tasklet_var, 0); } -static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0); +static DECLARE_TASKLET_OLD(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt); void kgdb_schedule_breakpoint(void) { @@ -1051,14 +1081,6 @@ void kgdb_schedule_breakpoint(void) } EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint); -static void kgdb_initial_breakpoint(void) -{ - kgdb_break_asap = 0; - - pr_crit("Waiting for connection from remote gdb...\n"); - kgdb_breakpoint(); -} - /** * kgdb_register_io_module - register KGDB IO module * @new_dbg_io_ops: the io ops vector @@ -1067,15 +1089,22 @@ static void kgdb_initial_breakpoint(void) */ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) { + struct kgdb_io *old_dbg_io_ops; int err; spin_lock(&kgdb_registration_lock); - if (dbg_io_ops) { - spin_unlock(&kgdb_registration_lock); + old_dbg_io_ops = dbg_io_ops; + if (old_dbg_io_ops) { + if (!old_dbg_io_ops->deinit) { + spin_unlock(&kgdb_registration_lock); - pr_err("Another I/O driver is already registered with KGDB\n"); - return -EBUSY; + pr_err("KGDB I/O driver %s can't replace %s.\n", + new_dbg_io_ops->name, old_dbg_io_ops->name); + return -EBUSY; + } + pr_info("Replacing I/O driver %s with %s\n", + old_dbg_io_ops->name, new_dbg_io_ops->name); } if (new_dbg_io_ops->init) { @@ -1090,12 +1119,18 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) spin_unlock(&kgdb_registration_lock); + if (old_dbg_io_ops) { + old_dbg_io_ops->deinit(); + return 0; + } + pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name); /* Arm KGDB now. */ kgdb_register_callbacks(); - if (kgdb_break_asap) + if (kgdb_break_asap && + (!dbg_is_early || IS_ENABLED(CONFIG_ARCH_HAS_EARLY_DEBUG))) kgdb_initial_breakpoint(); return 0; @@ -1125,6 +1160,9 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops) spin_unlock(&kgdb_registration_lock); + if (old_dbg_io_ops->deinit) + old_dbg_io_ops->deinit(); + pr_info("Unregistered I/O driver %s, debugger disabled\n", old_dbg_io_ops->name); } @@ -1165,7 +1203,8 @@ static int __init opt_kgdb_wait(char *str) kgdb_break_asap = 1; kdb_init(KDB_INIT_EARLY); - if (kgdb_io_module_registered) + if (kgdb_io_module_registered && + IS_ENABLED(CONFIG_ARCH_HAS_EARLY_DEBUG)) kgdb_initial_breakpoint(); return 0; diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 4b280fc7dd67..a790026e42d0 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -247,7 +247,7 @@ char *kgdb_mem2hex(char *mem, char *buf, int count) */ tmp = buf + count; - err = probe_kernel_read(tmp, mem, count); + err = copy_from_kernel_nofault(tmp, mem, count); if (err) return NULL; while (count > 0) { @@ -283,7 +283,7 @@ int kgdb_hex2mem(char *buf, char *mem, int count) *tmp_raw |= hex_to_bin(*tmp_hex--) << 4; } - return probe_kernel_write(mem, tmp_raw, count); + return copy_to_kernel_nofault(mem, tmp_raw, count); } /* @@ -335,7 +335,7 @@ static int kgdb_ebin2mem(char *buf, char *mem, int count) size++; } - return probe_kernel_write(mem, c, size); + return copy_to_kernel_nofault(mem, c, size); } #if DBG_MAX_REG_NUM > 0 @@ -792,6 +792,19 @@ static void gdb_cmd_query(struct kgdb_state *ks) } break; #endif +#ifdef CONFIG_HAVE_ARCH_KGDB_QXFER_PKT + case 'S': + if (!strncmp(remcom_in_buffer, "qSupported:", 11)) + strcpy(remcom_out_buffer, kgdb_arch_gdb_stub_feature); + break; + case 'X': + if (!strncmp(remcom_in_buffer, "qXfer:", 6)) + kgdb_arch_handle_qxfer_pkt(remcom_in_buffer, + remcom_out_buffer); + break; +#endif + default: + break; } } diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 3de0cc780c16..18e03aba2cfc 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -21,17 +21,18 @@ static void kdb_show_stack(struct task_struct *p, void *addr) { - int old_lvl = console_loglevel; - - console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_trap_printk++; - if (!addr && kdb_task_has_cpu(p)) + if (!addr && kdb_task_has_cpu(p)) { + int old_lvl = console_loglevel; + + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_dump_stack_on_cpu(kdb_process_cpu(p)); - else - show_stack(p, addr); + console_loglevel = old_lvl; + } else { + show_stack(p, addr, KERN_EMERG); + } - console_loglevel = old_lvl; kdb_trap_printk--; } diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 924bc9298a42..9d847ab851db 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -542,6 +542,44 @@ static int kdb_search_string(char *searched, char *searchfor) return 0; } +static void kdb_msg_write(const char *msg, int msg_len) +{ + struct console *c; + + if (msg_len == 0) + return; + + if (dbg_io_ops) { + const char *cp = msg; + int len = msg_len; + + while (len--) { + dbg_io_ops->write_char(*cp); + cp++; + } + } + + for_each_console(c) { + if (!(c->flags & CON_ENABLED)) + continue; + if (c == dbg_io_ops->cons) + continue; + /* + * Set oops_in_progress to encourage the console drivers to + * disregard their internal spin locks: in the current calling + * context the risk of deadlock is a bigger problem than risks + * due to re-entering the console driver. We operate directly on + * oops_in_progress rather than using bust_spinlocks() because + * the calls bust_spinlocks() makes on exit are not appropriate + * for this calling context. + */ + ++oops_in_progress; + c->write(c, msg, msg_len); + --oops_in_progress; + touch_nmi_watchdog(); + } +} + int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) { int diag; @@ -553,8 +591,7 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) int this_cpu, old_cpu; char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; char *moreprompt = "more> "; - struct console *c; - unsigned long uninitialized_var(flags); + unsigned long flags; /* Serialize kdb_printf if multiple cpus try to write at once. * But if any cpu goes recursive in kdb, just print the output, @@ -687,22 +724,11 @@ kdb_printit: */ retlen = strlen(kdb_buffer); cp = (char *) printk_skip_headers(kdb_buffer); - if (!dbg_kdb_mode && kgdb_connected) { + if (!dbg_kdb_mode && kgdb_connected) gdbstub_msg_write(cp, retlen - (cp - kdb_buffer)); - } else { - if (dbg_io_ops && !dbg_io_ops->is_console) { - len = retlen - (cp - kdb_buffer); - cp2 = cp; - while (len--) { - dbg_io_ops->write_char(*cp2); - cp2++; - } - } - for_each_console(c) { - c->write(c, cp, retlen - (cp - kdb_buffer)); - touch_nmi_watchdog(); - } - } + else + kdb_msg_write(cp, retlen - (cp - kdb_buffer)); + if (logging) { saved_loglevel = console_loglevel; console_loglevel = CONSOLE_LOGLEVEL_SILENT; @@ -751,19 +777,7 @@ kdb_printit: moreprompt = "more> "; kdb_input_flush(); - - if (dbg_io_ops && !dbg_io_ops->is_console) { - len = strlen(moreprompt); - cp = moreprompt; - while (len--) { - dbg_io_ops->write_char(*cp); - cp++; - } - } - for_each_console(c) { - c->write(c, moreprompt, strlen(moreprompt)); - touch_nmi_watchdog(); - } + kdb_msg_write(moreprompt, strlen(moreprompt)); if (logging) printk("%s", moreprompt); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 515379cbf209..5c7949061671 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -62,7 +62,7 @@ int kdb_grep_trailing; /* * Kernel debugger state flags */ -int kdb_flags; +unsigned int kdb_flags; /* * kdb_lock protects updates to kdb_initial_cpu. Used to @@ -418,8 +418,7 @@ int kdb_set(int argc, const char **argv) argv[2]); return 0; } - kdb_flags = (kdb_flags & - ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT)) + kdb_flags = (kdb_flags & ~KDB_DEBUG(MASK)) | (debugflags << KDB_DEBUG_FLAG_SHIFT); return 0; @@ -1108,7 +1107,8 @@ static int handle_ctrl_cmd(char *cmd) switch (*cmd) { case CTRL_P: if (cmdptr != cmd_tail) - cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT; + cmdptr = (cmdptr + KDB_CMD_HISTORY_COUNT - 1) % + KDB_CMD_HISTORY_COUNT; strscpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN); return 1; case CTRL_N: @@ -2081,7 +2081,8 @@ static int kdb_env(int argc, const char **argv) } if (KDB_DEBUG(MASK)) - kdb_printf("KDBFLAGS=0x%x\n", kdb_flags); + kdb_printf("KDBDEBUG=0x%x\n", + (kdb_flags & KDB_DEBUG(MASK)) >> KDB_DEBUG_FLAG_SHIFT); return 0; } @@ -2325,7 +2326,8 @@ void kdb_ps1(const struct task_struct *p) int cpu; unsigned long tmp; - if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long))) + if (!p || + copy_from_kernel_nofault(&tmp, (char *)p, sizeof(unsigned long))) return; cpu = kdb_process_cpu(p); diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index b8e6306e7e13..004c5b6c87f8 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -325,7 +325,7 @@ char *kdb_strdup(const char *str, gfp_t type) */ int kdb_getarea_size(void *res, unsigned long addr, size_t size) { - int ret = probe_kernel_read((char *)res, (char *)addr, size); + int ret = copy_from_kernel_nofault((char *)res, (char *)addr, size); if (ret) { if (!KDB_STATE(SUPPRESS)) { kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr); @@ -350,7 +350,7 @@ int kdb_getarea_size(void *res, unsigned long addr, size_t size) */ int kdb_putarea_size(unsigned long addr, void *res, size_t size) { - int ret = probe_kernel_read((char *)addr, (char *)res, size); + int ret = copy_from_kernel_nofault((char *)addr, (char *)res, size); if (ret) { if (!KDB_STATE(SUPPRESS)) { kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr); @@ -624,7 +624,8 @@ char kdb_task_state_char (const struct task_struct *p) char state; unsigned long tmp; - if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long))) + if (!p || + copy_from_kernel_nofault(&tmp, (char *)p, sizeof(unsigned long))) return 'E'; cpu = kdb_process_cpu(p); diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 4c103a24e380..f4770fcfa62b 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -5,6 +5,17 @@ config HAS_DMA depends on !NO_DMA default y +config DMA_OPS + bool + +# +# IOMMU drivers that can bypass the IOMMU code and optionally use the direct +# mapping fast path should select this option and set the dma_ops_bypass +# flag in struct device where applicable +# +config DMA_OPS_BYPASS + bool + config NEED_SG_DMA_LENGTH bool @@ -60,6 +71,7 @@ config DMA_NONCOHERENT_CACHE_SYNC config DMA_VIRT_OPS bool depends on HAS_DMA + select DMA_OPS config SWIOTLB bool @@ -71,17 +83,22 @@ config SWIOTLB # in the pagetables # config DMA_NONCOHERENT_MMAP + default y if !MMU + bool + +config DMA_COHERENT_POOL + select GENERIC_ALLOCATOR bool config DMA_REMAP + bool depends on MMU - select GENERIC_ALLOCATOR select DMA_NONCOHERENT_MMAP - bool config DMA_DIRECT_REMAP bool select DMA_REMAP + select DMA_COHERENT_POOL config DMA_CMA bool "DMA Contiguous Memory Allocator" diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index d237cf3dc181..32c7c1942bbd 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -1,9 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_HAS_DMA) += mapping.o direct.o dummy.o +obj-$(CONFIG_HAS_DMA) += mapping.o direct.o +obj-$(CONFIG_DMA_OPS) += dummy.o obj-$(CONFIG_DMA_CMA) += contiguous.o obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o obj-$(CONFIG_DMA_VIRT_OPS) += virt.o obj-$(CONFIG_DMA_API_DEBUG) += debug.o obj-$(CONFIG_SWIOTLB) += swiotlb.o +obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o obj-$(CONFIG_DMA_REMAP) += remap.o diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 8bc6f2d670f9..cff7e60968b9 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -215,6 +215,13 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, return cma_release(dev_get_cma_area(dev), pages, count); } +static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp) +{ + unsigned int align = min(get_order(size), CONFIG_CMA_ALIGNMENT); + + return cma_alloc(cma, size >> PAGE_SHIFT, align, gfp & __GFP_NOWARN); +} + /** * dma_alloc_contiguous() - allocate contiguous pages * @dev: Pointer to device for which the allocation is performed. @@ -222,8 +229,8 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, * @gfp: Allocation flags. * * This function allocates contiguous memory buffer for specified device. It - * first tries to use device specific contiguous memory area if available or - * the default global one, then tries a fallback allocation of normal pages. + * tries to use device specific contiguous memory area if available, or the + * default global one. * * Note that it byapss one-page size of allocations from the global area as * the addresses within one page are always contiguous, so there is no need @@ -231,24 +238,14 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, */ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) { - size_t count = size >> PAGE_SHIFT; - struct page *page = NULL; - struct cma *cma = NULL; - - if (dev && dev->cma_area) - cma = dev->cma_area; - else if (count > 1) - cma = dma_contiguous_default_area; - /* CMA can be used only in the context which permits sleeping */ - if (cma && gfpflags_allow_blocking(gfp)) { - size_t align = get_order(size); - size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT); - - page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN); - } - - return page; + if (!gfpflags_allow_blocking(gfp)) + return NULL; + if (dev->cma_area) + return cma_alloc_aligned(dev->cma_area, size, gfp); + if (size <= PAGE_SIZE || !dma_contiguous_default_area) + return NULL; + return cma_alloc_aligned(dma_contiguous_default_area, size, gfp); } /** diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 9e1777c81f55..f7f807fb6ebb 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -144,8 +144,12 @@ static const char *type2name[] = { [dma_debug_resource] = "resource", }; -static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE", - "DMA_FROM_DEVICE", "DMA_NONE" }; +static const char *dir2name[] = { + [DMA_BIDIRECTIONAL] = "DMA_BIDIRECTIONAL", + [DMA_TO_DEVICE] = "DMA_TO_DEVICE", + [DMA_FROM_DEVICE] = "DMA_FROM_DEVICE", + [DMA_NONE] = "DMA_NONE", +}; /* * The access to some variables in this macro is racy. We can't use atomic_t @@ -656,7 +660,7 @@ static struct dma_debug_entry *__dma_entry_alloc(void) return entry; } -void __dma_entry_alloc_check_leak(void) +static void __dma_entry_alloc_check_leak(void) { u32 tmp = nr_total_entries % nr_prealloc_entries; @@ -882,7 +886,7 @@ static int device_dma_allocations(struct device *dev, struct dma_debug_entry **o static int dma_debug_device_change(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; - struct dma_debug_entry *uninitialized_var(entry); + struct dma_debug_entry *entry; int count; if (dma_debug_disabled()) @@ -1071,7 +1075,7 @@ static void check_unmap(struct dma_debug_entry *ref) /* * Drivers should use dma_mapping_error() to check the returned * addresses of dma_map_single() and dma_map_page(). - * If not, print this warning message. See Documentation/DMA-API.txt. + * If not, print this warning message. See Documentation/core-api/dma-api.rst. */ if (entry->map_err_type == MAP_ERR_NOT_CHECKED) { err_printk(ref->dev, entry, diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8f4bbdaf965e..bb0041e99659 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -10,11 +10,9 @@ #include <linux/dma-direct.h> #include <linux/scatterlist.h> #include <linux/dma-contiguous.h> -#include <linux/dma-noncoherent.h> #include <linux/pfn.h> #include <linux/vmalloc.h> #include <linux/set_memory.h> -#include <linux/swiotlb.h> /* * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it @@ -45,8 +43,8 @@ u64 dma_direct_get_required_mask(struct device *dev) return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } -static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, - u64 *phys_limit) +gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, + u64 *phys_limit) { u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit); @@ -70,35 +68,69 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, return 0; } -static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) +bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { return phys_to_dma_direct(dev, phys) + size - 1 <= min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } -struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, +/* + * Decrypting memory is allowed to block, so if this device requires + * unencrypted memory it must come from atomic pools. + */ +static inline bool dma_should_alloc_from_pool(struct device *dev, gfp_t gfp, + unsigned long attrs) +{ + if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) + return false; + if (gfpflags_allow_blocking(gfp)) + return false; + if (force_dma_unencrypted(dev)) + return true; + if (!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP)) + return false; + if (dma_alloc_need_uncached(dev, attrs)) + return true; + return false; +} + +static inline bool dma_should_free_from_pool(struct device *dev, + unsigned long attrs) +{ + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) + return true; + if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && + !force_dma_unencrypted(dev)) + return false; + if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP)) + return true; + return false; +} + +static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp, unsigned long attrs) { - size_t alloc_size = PAGE_ALIGN(size); int node = dev_to_node(dev); struct page *page = NULL; u64 phys_limit; + WARN_ON_ONCE(!PAGE_ALIGNED(size)); + if (attrs & DMA_ATTR_NO_WARN) gfp |= __GFP_NOWARN; /* we always manually zero the memory once we are done: */ gfp &= ~__GFP_ZERO; - gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, - &phys_limit); - page = dma_alloc_contiguous(dev, alloc_size, gfp); + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_limit); + page = dma_alloc_contiguous(dev, size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { - dma_free_contiguous(dev, page, alloc_size); + dma_free_contiguous(dev, page, size); page = NULL; } again: if (!page) - page = alloc_pages_node(node, gfp, get_order(alloc_size)); + page = alloc_pages_node(node, gfp, get_order(size)); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, size); page = NULL; @@ -124,11 +156,12 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, { struct page *page; void *ret; + int err; + + size = PAGE_ALIGN(size); - if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_alloc_need_uncached(dev, attrs) && - !gfpflags_allow_blocking(gfp)) { - ret = dma_alloc_from_pool(PAGE_ALIGN(size), &page, gfp); + if (dma_should_alloc_from_pool(dev, gfp, attrs)) { + ret = dma_alloc_from_pool(dev, size, &page, gfp); if (!ret) return NULL; goto done; @@ -152,14 +185,20 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, dma_alloc_need_uncached(dev, attrs)) || (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) { /* remove any dirty cache lines on the kernel alias */ - arch_dma_prep_coherent(page, PAGE_ALIGN(size)); + arch_dma_prep_coherent(page, size); /* create a coherent mapping */ - ret = dma_common_contiguous_remap(page, PAGE_ALIGN(size), + ret = dma_common_contiguous_remap(page, size, dma_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); if (!ret) goto out_free_pages; + if (force_dma_unencrypted(dev)) { + err = set_memory_decrypted((unsigned long)ret, + 1 << get_order(size)); + if (err) + goto out_free_pages; + } memset(ret, 0, size); goto done; } @@ -176,8 +215,12 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, } ret = page_address(page); - if (force_dma_unencrypted(dev)) - set_memory_decrypted((unsigned long)ret, 1 << get_order(size)); + if (force_dma_unencrypted(dev)) { + err = set_memory_decrypted((unsigned long)ret, + 1 << get_order(size)); + if (err) + goto out_free_pages; + } memset(ret, 0, size); @@ -186,7 +229,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, arch_dma_prep_coherent(page, size); ret = arch_dma_set_uncached(ret, size); if (IS_ERR(ret)) - goto out_free_pages; + goto out_encrypt_pages; } done: if (force_dma_unencrypted(dev)) @@ -194,6 +237,15 @@ done: else *dma_handle = phys_to_dma(dev, page_to_phys(page)); return ret; + +out_encrypt_pages: + if (force_dma_unencrypted(dev)) { + err = set_memory_encrypted((unsigned long)page_address(page), + 1 << get_order(size)); + /* If memory cannot be re-encrypted, it must be leaked */ + if (err) + return NULL; + } out_free_pages: dma_free_contiguous(dev, page, size); return NULL; @@ -204,6 +256,11 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, { unsigned int page_order = get_order(size); + /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ + if (dma_should_free_from_pool(dev, attrs) && + dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) + return; + if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev)) { /* cpu_addr is a struct page cookie, not a kernel address */ @@ -211,10 +268,6 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, return; } - if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_free_from_pool(cpu_addr, PAGE_ALIGN(size))) - return; - if (force_dma_unencrypted(dev)) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); @@ -249,19 +302,6 @@ void dma_direct_free(struct device *dev, size_t size, #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ defined(CONFIG_SWIOTLB) -void dma_direct_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ - phys_addr_t paddr = dma_to_phys(dev, addr); - - if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); - - if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_device(paddr, size, dir); -} -EXPORT_SYMBOL(dma_direct_sync_single_for_device); - void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { @@ -280,27 +320,11 @@ void dma_direct_sync_sg_for_device(struct device *dev, dir); } } -EXPORT_SYMBOL(dma_direct_sync_sg_for_device); #endif #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ defined(CONFIG_SWIOTLB) -void dma_direct_sync_single_for_cpu(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ - phys_addr_t paddr = dma_to_phys(dev, addr); - - if (!dev_is_dma_coherent(dev)) { - arch_sync_dma_for_cpu(paddr, size, dir); - arch_sync_dma_for_cpu_all(); - } - - if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); -} -EXPORT_SYMBOL(dma_direct_sync_single_for_cpu); - void dma_direct_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { @@ -321,20 +345,6 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_cpu_all(); } -EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu); - -void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - phys_addr_t phys = dma_to_phys(dev, addr); - - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_direct_sync_single_for_cpu(dev, addr, size, dir); - - if (unlikely(is_swiotlb_buffer(phys))) - swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs); -} -EXPORT_SYMBOL(dma_direct_unmap_page); void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) @@ -346,35 +356,8 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir, attrs); } -EXPORT_SYMBOL(dma_direct_unmap_sg); #endif -dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - phys_addr_t phys = page_to_phys(page) + offset; - dma_addr_t dma_addr = phys_to_dma(dev, phys); - - if (unlikely(swiotlb_force == SWIOTLB_FORCE)) - return swiotlb_map(dev, phys, size, dir, attrs); - - if (unlikely(!dma_capable(dev, dma_addr, size, true))) { - if (swiotlb_force != SWIOTLB_NO_FORCE) - return swiotlb_map(dev, phys, size, dir, attrs); - - dev_WARN_ONCE(dev, 1, - "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - return DMA_MAPPING_ERROR; - } - - if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_device(phys, size, dir); - return dma_addr; -} -EXPORT_SYMBOL(dma_direct_map_page); - int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { @@ -395,7 +378,6 @@ out_unmap: dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); return 0; } -EXPORT_SYMBOL(dma_direct_map_sg); dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -412,7 +394,6 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, return dma_addr; } -EXPORT_SYMBOL(dma_direct_map_resource); int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, @@ -427,7 +408,6 @@ int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, return ret; } -#ifdef CONFIG_MMU bool dma_direct_can_mmap(struct device *dev) { return dev_is_dma_coherent(dev) || @@ -453,19 +433,6 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff, user_count << PAGE_SHIFT, vma->vm_page_prot); } -#else /* CONFIG_MMU */ -bool dma_direct_can_mmap(struct device *dev) -{ - return false; -} - -int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size, - unsigned long attrs) -{ - return -ENXIO; -} -#endif /* CONFIG_MMU */ int dma_direct_supported(struct device *dev, u64 mask) { @@ -498,3 +465,9 @@ size_t dma_direct_max_mapping_size(struct device *dev) return swiotlb_max_mapping_size(dev); return SIZE_MAX; } + +bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + return !dev_is_dma_coherent(dev) || + is_swiotlb_buffer(dma_to_phys(dev, dma_addr)); +} diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 98e3d873792e..0d129421e75f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -105,6 +105,196 @@ void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, } EXPORT_SYMBOL(dmam_alloc_attrs); +static bool dma_go_direct(struct device *dev, dma_addr_t mask, + const struct dma_map_ops *ops) +{ + if (likely(!ops)) + return true; +#ifdef CONFIG_DMA_OPS_BYPASS + if (dev->dma_ops_bypass) + return min_not_zero(mask, dev->bus_dma_limit) >= + dma_direct_get_required_mask(dev); +#endif + return false; +} + + +/* + * Check if the devices uses a direct mapping for streaming DMA operations. + * This allows IOMMU drivers to set a bypass mode if the DMA mask is large + * enough. + */ +static inline bool dma_alloc_direct(struct device *dev, + const struct dma_map_ops *ops) +{ + return dma_go_direct(dev, dev->coherent_dma_mask, ops); +} + +static inline bool dma_map_direct(struct device *dev, + const struct dma_map_ops *ops) +{ + return dma_go_direct(dev, *dev->dma_mask, ops); +} + +dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, + size_t offset, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + dma_addr_t addr; + + BUG_ON(!valid_dma_direction(dir)); + if (dma_map_direct(dev, ops)) + addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); + else + addr = ops->map_page(dev, page, offset, size, dir, attrs); + debug_dma_map_page(dev, page, offset, size, dir, addr); + + return addr; +} +EXPORT_SYMBOL(dma_map_page_attrs); + +void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_map_direct(dev, ops)) + dma_direct_unmap_page(dev, addr, size, dir, attrs); + else if (ops->unmap_page) + ops->unmap_page(dev, addr, size, dir, attrs); + debug_dma_unmap_page(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_unmap_page_attrs); + +/* + * dma_maps_sg_attrs returns 0 on error and > 0 on success. + * It should never return a value < 0. + */ +int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + int ents; + + BUG_ON(!valid_dma_direction(dir)); + if (dma_map_direct(dev, ops)) + ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); + else + ents = ops->map_sg(dev, sg, nents, dir, attrs); + BUG_ON(ents < 0); + debug_dma_map_sg(dev, sg, nents, ents, dir); + + return ents; +} +EXPORT_SYMBOL(dma_map_sg_attrs); + +void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + debug_dma_unmap_sg(dev, sg, nents, dir); + if (dma_map_direct(dev, ops)) + dma_direct_unmap_sg(dev, sg, nents, dir, attrs); + else if (ops->unmap_sg) + ops->unmap_sg(dev, sg, nents, dir, attrs); +} +EXPORT_SYMBOL(dma_unmap_sg_attrs); + +dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + dma_addr_t addr = DMA_MAPPING_ERROR; + + BUG_ON(!valid_dma_direction(dir)); + + /* Don't allow RAM to be mapped */ + if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) + return DMA_MAPPING_ERROR; + + if (dma_map_direct(dev, ops)) + addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs); + else if (ops->map_resource) + addr = ops->map_resource(dev, phys_addr, size, dir, attrs); + + debug_dma_map_resource(dev, phys_addr, size, dir, addr); + return addr; +} +EXPORT_SYMBOL(dma_map_resource); + +void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (!dma_map_direct(dev, ops) && ops->unmap_resource) + ops->unmap_resource(dev, addr, size, dir, attrs); + debug_dma_unmap_resource(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_unmap_resource); + +void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_map_direct(dev, ops)) + dma_direct_sync_single_for_cpu(dev, addr, size, dir); + else if (ops->sync_single_for_cpu) + ops->sync_single_for_cpu(dev, addr, size, dir); + debug_dma_sync_single_for_cpu(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_sync_single_for_cpu); + +void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_map_direct(dev, ops)) + dma_direct_sync_single_for_device(dev, addr, size, dir); + else if (ops->sync_single_for_device) + ops->sync_single_for_device(dev, addr, size, dir); + debug_dma_sync_single_for_device(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_sync_single_for_device); + +void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_map_direct(dev, ops)) + dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir); + else if (ops->sync_sg_for_cpu) + ops->sync_sg_for_cpu(dev, sg, nelems, dir); + debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir); +} +EXPORT_SYMBOL(dma_sync_sg_for_cpu); + +void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_map_direct(dev, ops)) + dma_direct_sync_sg_for_device(dev, sg, nelems, dir); + else if (ops->sync_sg_for_device) + ops->sync_sg_for_device(dev, sg, nelems, dir); + debug_dma_sync_sg_for_device(dev, sg, nelems, dir); +} +EXPORT_SYMBOL(dma_sync_sg_for_device); + /* * Create scatter-list for the already allocated DMA buffer. */ @@ -138,7 +328,7 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); if (!ops->get_sgtable) @@ -208,7 +398,7 @@ bool dma_can_mmap(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) return dma_direct_can_mmap(dev); return ops->mmap != NULL; } @@ -233,7 +423,7 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); if (!ops->mmap) @@ -246,7 +436,7 @@ u64 dma_get_required_mask(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) return dma_direct_get_required_mask(dev); if (ops->get_required_mask) return ops->get_required_mask(dev); @@ -277,7 +467,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, /* let the implementation decide on the zone to allocate from: */ flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs); else if (ops->alloc) cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); @@ -309,7 +499,7 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, return; debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) dma_direct_free(dev, size, cpu_addr, dma_handle, attrs); else if (ops->free) ops->free(dev, size, cpu_addr, dma_handle, attrs); @@ -320,7 +510,11 @@ int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + /* + * ->dma_supported sets the bypass flag, so we must always call + * into the method here unless the device is truly direct mapped. + */ + if (!ops) return dma_direct_supported(dev, mask); if (!ops->dma_supported) return 1; @@ -376,7 +570,7 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size, BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) arch_dma_cache_sync(dev, vaddr, size, dir); else if (ops->cache_sync) ops->cache_sync(dev, vaddr, size, dir); @@ -388,7 +582,7 @@ size_t dma_max_mapping_size(struct device *dev) const struct dma_map_ops *ops = get_dma_ops(dev); size_t size = SIZE_MAX; - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) size = dma_direct_max_mapping_size(dev); else if (ops && ops->max_mapping_size) size = ops->max_mapping_size(dev); @@ -397,6 +591,16 @@ size_t dma_max_mapping_size(struct device *dev) } EXPORT_SYMBOL_GPL(dma_max_mapping_size); +bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (dma_map_direct(dev, ops)) + return dma_direct_need_sync(dev, dma_addr); + return ops->sync_single_for_cpu || ops->sync_single_for_device; +} +EXPORT_SYMBOL_GPL(dma_need_sync); + unsigned long dma_get_merge_boundary(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c new file mode 100644 index 000000000000..6bc74a2d5127 --- /dev/null +++ b/kernel/dma/pool.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2012 ARM Ltd. + * Copyright (C) 2020 Google LLC + */ +#include <linux/debugfs.h> +#include <linux/dma-direct.h> +#include <linux/dma-noncoherent.h> +#include <linux/init.h> +#include <linux/genalloc.h> +#include <linux/set_memory.h> +#include <linux/slab.h> +#include <linux/workqueue.h> + +static struct gen_pool *atomic_pool_dma __ro_after_init; +static unsigned long pool_size_dma; +static struct gen_pool *atomic_pool_dma32 __ro_after_init; +static unsigned long pool_size_dma32; +static struct gen_pool *atomic_pool_kernel __ro_after_init; +static unsigned long pool_size_kernel; + +/* Size can be defined by the coherent_pool command line */ +static size_t atomic_pool_size; + +/* Dynamic background expansion when the atomic pool is near capacity */ +static struct work_struct atomic_pool_work; + +static int __init early_coherent_pool(char *p) +{ + atomic_pool_size = memparse(p, &p); + return 0; +} +early_param("coherent_pool", early_coherent_pool); + +static void __init dma_atomic_pool_debugfs_init(void) +{ + struct dentry *root; + + root = debugfs_create_dir("dma_pools", NULL); + if (IS_ERR_OR_NULL(root)) + return; + + debugfs_create_ulong("pool_size_dma", 0400, root, &pool_size_dma); + debugfs_create_ulong("pool_size_dma32", 0400, root, &pool_size_dma32); + debugfs_create_ulong("pool_size_kernel", 0400, root, &pool_size_kernel); +} + +static void dma_atomic_pool_size_add(gfp_t gfp, size_t size) +{ + if (gfp & __GFP_DMA) + pool_size_dma += size; + else if (gfp & __GFP_DMA32) + pool_size_dma32 += size; + else + pool_size_kernel += size; +} + +static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, + gfp_t gfp) +{ + unsigned int order; + struct page *page; + void *addr; + int ret = -ENOMEM; + + /* Cannot allocate larger than MAX_ORDER-1 */ + order = min(get_order(pool_size), MAX_ORDER-1); + + do { + pool_size = 1 << (PAGE_SHIFT + order); + page = alloc_pages(gfp, order); + } while (!page && order-- > 0); + if (!page) + goto out; + + arch_dma_prep_coherent(page, pool_size); + +#ifdef CONFIG_DMA_DIRECT_REMAP + addr = dma_common_contiguous_remap(page, pool_size, + pgprot_dmacoherent(PAGE_KERNEL), + __builtin_return_address(0)); + if (!addr) + goto free_page; +#else + addr = page_to_virt(page); +#endif + /* + * Memory in the atomic DMA pools must be unencrypted, the pools do not + * shrink so no re-encryption occurs in dma_direct_free_pages(). + */ + ret = set_memory_decrypted((unsigned long)page_to_virt(page), + 1 << order); + if (ret) + goto remove_mapping; + ret = gen_pool_add_virt(pool, (unsigned long)addr, page_to_phys(page), + pool_size, NUMA_NO_NODE); + if (ret) + goto encrypt_mapping; + + dma_atomic_pool_size_add(gfp, pool_size); + return 0; + +encrypt_mapping: + ret = set_memory_encrypted((unsigned long)page_to_virt(page), + 1 << order); + if (WARN_ON_ONCE(ret)) { + /* Decrypt succeeded but encrypt failed, purposely leak */ + goto out; + } +remove_mapping: +#ifdef CONFIG_DMA_DIRECT_REMAP + dma_common_free_remap(addr, pool_size); +#endif +free_page: __maybe_unused + __free_pages(page, order); +out: + return ret; +} + +static void atomic_pool_resize(struct gen_pool *pool, gfp_t gfp) +{ + if (pool && gen_pool_avail(pool) < atomic_pool_size) + atomic_pool_expand(pool, gen_pool_size(pool), gfp); +} + +static void atomic_pool_work_fn(struct work_struct *work) +{ + if (IS_ENABLED(CONFIG_ZONE_DMA)) + atomic_pool_resize(atomic_pool_dma, + GFP_KERNEL | GFP_DMA); + if (IS_ENABLED(CONFIG_ZONE_DMA32)) + atomic_pool_resize(atomic_pool_dma32, + GFP_KERNEL | GFP_DMA32); + atomic_pool_resize(atomic_pool_kernel, GFP_KERNEL); +} + +static __init struct gen_pool *__dma_atomic_pool_init(size_t pool_size, + gfp_t gfp) +{ + struct gen_pool *pool; + int ret; + + pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE); + if (!pool) + return NULL; + + gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL); + + ret = atomic_pool_expand(pool, pool_size, gfp); + if (ret) { + gen_pool_destroy(pool); + pr_err("DMA: failed to allocate %zu KiB %pGg pool for atomic allocation\n", + pool_size >> 10, &gfp); + return NULL; + } + + pr_info("DMA: preallocated %zu KiB %pGg pool for atomic allocations\n", + gen_pool_size(pool) >> 10, &gfp); + return pool; +} + +static int __init dma_atomic_pool_init(void) +{ + int ret = 0; + + /* + * If coherent_pool was not used on the command line, default the pool + * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER-1. + */ + if (!atomic_pool_size) { + unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K); + pages = min_t(unsigned long, pages, MAX_ORDER_NR_PAGES); + atomic_pool_size = max_t(size_t, pages << PAGE_SHIFT, SZ_128K); + } + INIT_WORK(&atomic_pool_work, atomic_pool_work_fn); + + atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size, + GFP_KERNEL); + if (!atomic_pool_kernel) + ret = -ENOMEM; + if (IS_ENABLED(CONFIG_ZONE_DMA)) { + atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size, + GFP_KERNEL | GFP_DMA); + if (!atomic_pool_dma) + ret = -ENOMEM; + } + if (IS_ENABLED(CONFIG_ZONE_DMA32)) { + atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size, + GFP_KERNEL | GFP_DMA32); + if (!atomic_pool_dma32) + ret = -ENOMEM; + } + + dma_atomic_pool_debugfs_init(); + return ret; +} +postcore_initcall(dma_atomic_pool_init); + +static inline struct gen_pool *dma_guess_pool_from_device(struct device *dev) +{ + u64 phys_mask; + gfp_t gfp; + + gfp = dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_mask); + if (IS_ENABLED(CONFIG_ZONE_DMA) && gfp == GFP_DMA) + return atomic_pool_dma; + if (IS_ENABLED(CONFIG_ZONE_DMA32) && gfp == GFP_DMA32) + return atomic_pool_dma32; + return atomic_pool_kernel; +} + +static inline struct gen_pool *dma_get_safer_pool(struct gen_pool *bad_pool) +{ + if (bad_pool == atomic_pool_kernel) + return atomic_pool_dma32 ? : atomic_pool_dma; + + if (bad_pool == atomic_pool_dma32) + return atomic_pool_dma; + + return NULL; +} + +static inline struct gen_pool *dma_guess_pool(struct device *dev, + struct gen_pool *bad_pool) +{ + if (bad_pool) + return dma_get_safer_pool(bad_pool); + + return dma_guess_pool_from_device(dev); +} + +void *dma_alloc_from_pool(struct device *dev, size_t size, + struct page **ret_page, gfp_t flags) +{ + struct gen_pool *pool = NULL; + unsigned long val = 0; + void *ptr = NULL; + phys_addr_t phys; + + while (1) { + pool = dma_guess_pool(dev, pool); + if (!pool) { + WARN(1, "Failed to get suitable pool for %s\n", + dev_name(dev)); + break; + } + + val = gen_pool_alloc(pool, size); + if (!val) + continue; + + phys = gen_pool_virt_to_phys(pool, val); + if (dma_coherent_ok(dev, phys, size)) + break; + + gen_pool_free(pool, val, size); + val = 0; + } + + + if (val) { + *ret_page = pfn_to_page(__phys_to_pfn(phys)); + ptr = (void *)val; + memset(ptr, 0, size); + + if (gen_pool_avail(pool) < atomic_pool_size) + schedule_work(&atomic_pool_work); + } + + return ptr; +} + +bool dma_free_from_pool(struct device *dev, void *start, size_t size) +{ + struct gen_pool *pool = NULL; + + while (1) { + pool = dma_guess_pool(dev, pool); + if (!pool) + return false; + + if (gen_pool_has_addr(pool, (unsigned long)start, size)) { + gen_pool_free(pool, (unsigned long)start, size); + return true; + } + } +} diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index d14cbc83986a..78b23f089cf1 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -1,13 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (C) 2012 ARM Ltd. * Copyright (c) 2014 The Linux Foundation */ -#include <linux/dma-direct.h> -#include <linux/dma-noncoherent.h> -#include <linux/dma-contiguous.h> -#include <linux/init.h> -#include <linux/genalloc.h> +#include <linux/dma-mapping.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -20,23 +15,6 @@ struct page **dma_common_find_pages(void *cpu_addr) return area->pages; } -static struct vm_struct *__dma_common_pages_remap(struct page **pages, - size_t size, pgprot_t prot, const void *caller) -{ - struct vm_struct *area; - - area = get_vm_area_caller(size, VM_DMA_COHERENT, caller); - if (!area) - return NULL; - - if (map_vm_area(area, prot, pages)) { - vunmap(area->addr); - return NULL; - } - - return area; -} - /* * Remaps an array of PAGE_SIZE pages into another vm_area. * Cannot be used in non-sleeping contexts @@ -44,15 +22,13 @@ static struct vm_struct *__dma_common_pages_remap(struct page **pages, void *dma_common_pages_remap(struct page **pages, size_t size, pgprot_t prot, const void *caller) { - struct vm_struct *area; - - area = __dma_common_pages_remap(pages, size, prot, caller); - if (!area) - return NULL; + void *vaddr; - area->pages = pages; - - return area->addr; + vaddr = vmap(pages, PAGE_ALIGN(size) >> PAGE_SHIFT, + VM_DMA_COHERENT, prot); + if (vaddr) + find_vm_area(vaddr)->pages = pages; + return vaddr; } /* @@ -62,24 +38,20 @@ void *dma_common_pages_remap(struct page **pages, size_t size, void *dma_common_contiguous_remap(struct page *page, size_t size, pgprot_t prot, const void *caller) { - int i; + int count = PAGE_ALIGN(size) >> PAGE_SHIFT; struct page **pages; - struct vm_struct *area; + void *vaddr; + int i; - pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL); + pages = kmalloc_array(count, sizeof(struct page *), GFP_KERNEL); if (!pages) return NULL; - - for (i = 0; i < (size >> PAGE_SHIFT); i++) + for (i = 0; i < count; i++) pages[i] = nth_page(page, i); - - area = __dma_common_pages_remap(pages, size, prot, caller); - + vaddr = vmap(pages, count, VM_DMA_COHERENT, prot); kfree(pages); - if (!area) - return NULL; - return area->addr; + return vaddr; } /* @@ -97,117 +69,3 @@ void dma_common_free_remap(void *cpu_addr, size_t size) unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size)); vunmap(cpu_addr); } - -#ifdef CONFIG_DMA_DIRECT_REMAP -static struct gen_pool *atomic_pool __ro_after_init; - -#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K -static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; - -static int __init early_coherent_pool(char *p) -{ - atomic_pool_size = memparse(p, &p); - return 0; -} -early_param("coherent_pool", early_coherent_pool); - -static gfp_t dma_atomic_pool_gfp(void) -{ - if (IS_ENABLED(CONFIG_ZONE_DMA)) - return GFP_DMA; - if (IS_ENABLED(CONFIG_ZONE_DMA32)) - return GFP_DMA32; - return GFP_KERNEL; -} - -static int __init dma_atomic_pool_init(void) -{ - unsigned int pool_size_order = get_order(atomic_pool_size); - unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; - struct page *page; - void *addr; - int ret; - - if (dev_get_cma_area(NULL)) - page = dma_alloc_from_contiguous(NULL, nr_pages, - pool_size_order, false); - else - page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order); - if (!page) - goto out; - - arch_dma_prep_coherent(page, atomic_pool_size); - - atomic_pool = gen_pool_create(PAGE_SHIFT, -1); - if (!atomic_pool) - goto free_page; - - addr = dma_common_contiguous_remap(page, atomic_pool_size, - pgprot_dmacoherent(PAGE_KERNEL), - __builtin_return_address(0)); - if (!addr) - goto destroy_genpool; - - ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr, - page_to_phys(page), atomic_pool_size, -1); - if (ret) - goto remove_mapping; - gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, NULL); - - pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n", - atomic_pool_size / 1024); - return 0; - -remove_mapping: - dma_common_free_remap(addr, atomic_pool_size); -destroy_genpool: - gen_pool_destroy(atomic_pool); - atomic_pool = NULL; -free_page: - if (!dma_release_from_contiguous(NULL, page, nr_pages)) - __free_pages(page, pool_size_order); -out: - pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n", - atomic_pool_size / 1024); - return -ENOMEM; -} -postcore_initcall(dma_atomic_pool_init); - -bool dma_in_atomic_pool(void *start, size_t size) -{ - if (unlikely(!atomic_pool)) - return false; - - return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); -} - -void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) -{ - unsigned long val; - void *ptr = NULL; - - if (!atomic_pool) { - WARN(1, "coherent pool not initialised!\n"); - return NULL; - } - - val = gen_pool_alloc(atomic_pool, size); - if (val) { - phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val); - - *ret_page = pfn_to_page(__phys_to_pfn(phys)); - ptr = (void *)val; - memset(ptr, 0, size); - } - - return ptr; -} - -bool dma_free_from_pool(void *start, size_t size) -{ - if (!dma_in_atomic_pool(start, size)) - return false; - gen_pool_free(atomic_pool, (unsigned long)start, size); - return true; -} -#endif /* CONFIG_DMA_DIRECT_REMAP */ diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile new file mode 100644 index 000000000000..34c8a3f1c735 --- /dev/null +++ b/kernel/entry/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0 + +# Prevent the noinstr section from being pestered by sanitizer and other goodies +# as long as these things cannot be disabled per function. +KASAN_SANITIZE := n +UBSAN_SANITIZE := n +KCOV_INSTRUMENT := n + +CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong +CFLAGS_common.o += -fno-stack-protector + +obj-$(CONFIG_GENERIC_ENTRY) += common.o +obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o diff --git a/kernel/entry/common.c b/kernel/entry/common.c new file mode 100644 index 000000000000..9852e0d62d95 --- /dev/null +++ b/kernel/entry/common.c @@ -0,0 +1,374 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/context_tracking.h> +#include <linux/entry-common.h> +#include <linux/livepatch.h> +#include <linux/audit.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> + +/** + * enter_from_user_mode - Establish state when coming from user mode + * + * Syscall/interrupt entry disables interrupts, but user mode is traced as + * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. + * + * 1) Tell lockdep that interrupts are disabled + * 2) Invoke context tracking if enabled to reactivate RCU + * 3) Trace interrupts off state + */ +static __always_inline void enter_from_user_mode(struct pt_regs *regs) +{ + arch_check_user_regs(regs); + lockdep_hardirqs_off(CALLER_ADDR0); + + CT_WARN_ON(ct_state() != CONTEXT_USER); + user_exit_irqoff(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + instrumentation_end(); +} + +static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) +{ + if (unlikely(audit_context())) { + unsigned long args[6]; + + syscall_get_arguments(current, regs, args); + audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]); + } +} + +static long syscall_trace_enter(struct pt_regs *regs, long syscall, + unsigned long ti_work) +{ + long ret = 0; + + /* Handle ptrace */ + if (ti_work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) { + ret = arch_syscall_enter_tracehook(regs); + if (ret || (ti_work & _TIF_SYSCALL_EMU)) + return -1L; + } + + /* Do seccomp after ptrace, to catch any tracer changes. */ + if (ti_work & _TIF_SECCOMP) { + ret = __secure_computing(NULL); + if (ret == -1L) + return ret; + } + + if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT)) + trace_sys_enter(regs, syscall); + + syscall_enter_audit(regs, syscall); + + return ret ? : syscall; +} + +noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) +{ + unsigned long ti_work; + + enter_from_user_mode(regs); + instrumentation_begin(); + + local_irq_enable(); + ti_work = READ_ONCE(current_thread_info()->flags); + if (ti_work & SYSCALL_ENTER_WORK) + syscall = syscall_trace_enter(regs, syscall, ti_work); + instrumentation_end(); + + return syscall; +} + +/** + * exit_to_user_mode - Fixup state when exiting to user mode + * + * Syscall/interupt exit enables interrupts, but the kernel state is + * interrupts disabled when this is invoked. Also tell RCU about it. + * + * 1) Trace interrupts on state + * 2) Invoke context tracking if enabled to adjust RCU state + * 3) Invoke architecture specific last minute exit code, e.g. speculation + * mitigations, etc. + * 4) Tell lockdep that interrupts are enabled + */ +static __always_inline void exit_to_user_mode(void) +{ + instrumentation_begin(); + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + instrumentation_end(); + + user_enter_irqoff(); + arch_exit_to_user_mode(); + lockdep_hardirqs_on(CALLER_ADDR0); +} + +/* Workaround to allow gradual conversion of architecture code */ +void __weak arch_do_signal(struct pt_regs *regs) { } + +static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) +{ + /* + * Before returning to user space ensure that all pending work + * items have been completed. + */ + while (ti_work & EXIT_TO_USER_MODE_WORK) { + + local_irq_enable_exit_to_user(ti_work); + + if (ti_work & _TIF_NEED_RESCHED) + schedule(); + + if (ti_work & _TIF_UPROBE) + uprobe_notify_resume(regs); + + if (ti_work & _TIF_PATCH_PENDING) + klp_update_patch_state(current); + + if (ti_work & _TIF_SIGPENDING) + arch_do_signal(regs); + + if (ti_work & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + rseq_handle_notify_resume(NULL, regs); + } + + /* Architecture specific TIF work */ + arch_exit_to_user_mode_work(regs, ti_work); + + /* + * Disable interrupts and reevaluate the work flags as they + * might have changed while interrupts and preemption was + * enabled above. + */ + local_irq_disable_exit_to_user(); + ti_work = READ_ONCE(current_thread_info()->flags); + } + + /* Return the latest work state for arch_exit_to_user_mode() */ + return ti_work; +} + +static void exit_to_user_mode_prepare(struct pt_regs *regs) +{ + unsigned long ti_work = READ_ONCE(current_thread_info()->flags); + + lockdep_assert_irqs_disabled(); + + if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) + ti_work = exit_to_user_mode_loop(regs, ti_work); + + arch_exit_to_user_mode_prepare(regs, ti_work); + + /* Ensure that the address limit is intact and no locks are held */ + addr_limit_user_check(); + lockdep_assert_irqs_disabled(); + lockdep_sys_exit(); +} + +#ifndef _TIF_SINGLESTEP +static inline bool report_single_step(unsigned long ti_work) +{ + return false; +} +#else +/* + * If TIF_SYSCALL_EMU is set, then the only reason to report is when + * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall + * instruction has been already reported in syscall_enter_from_usermode(). + */ +#define SYSEMU_STEP (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU) + +static inline bool report_single_step(unsigned long ti_work) +{ + return (ti_work & SYSEMU_STEP) == _TIF_SINGLESTEP; +} +#endif + +static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work) +{ + bool step; + + audit_syscall_exit(regs); + + if (ti_work & _TIF_SYSCALL_TRACEPOINT) + trace_sys_exit(regs, syscall_get_return_value(current, regs)); + + step = report_single_step(ti_work); + if (step || ti_work & _TIF_SYSCALL_TRACE) + arch_syscall_exit_tracehook(regs, step); +} + +/* + * Syscall specific exit to user mode preparation. Runs with interrupts + * enabled. + */ +static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) +{ + u32 cached_flags = READ_ONCE(current_thread_info()->flags); + unsigned long nr = syscall_get_nr(current, regs); + + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { + if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr)) + local_irq_enable(); + } + + rseq_syscall(regs); + + /* + * Do one-time syscall specific work. If these work items are + * enabled, we want to run them exactly once per syscall exit with + * interrupts enabled. + */ + if (unlikely(cached_flags & SYSCALL_EXIT_WORK)) + syscall_exit_work(regs, cached_flags); +} + +__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) +{ + instrumentation_begin(); + syscall_exit_to_user_mode_prepare(regs); + local_irq_disable_exit_to_user(); + exit_to_user_mode_prepare(regs); + instrumentation_end(); + exit_to_user_mode(); +} + +noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) +{ + enter_from_user_mode(regs); +} + +noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) +{ + instrumentation_begin(); + exit_to_user_mode_prepare(regs); + instrumentation_end(); + exit_to_user_mode(); +} + +noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) +{ + irqentry_state_t ret = { + .exit_rcu = false, + }; + + if (user_mode(regs)) { + irqentry_enter_from_user_mode(regs); + return ret; + } + + /* + * If this entry hit the idle task invoke rcu_irq_enter() whether + * RCU is watching or not. + * + * Interupts can nest when the first interrupt invokes softirq + * processing on return which enables interrupts. + * + * Scheduler ticks in the idle task can mark quiescent state and + * terminate a grace period, if and only if the timer interrupt is + * not nested into another interrupt. + * + * Checking for __rcu_is_watching() here would prevent the nesting + * interrupt to invoke rcu_irq_enter(). If that nested interrupt is + * the tick then rcu_flavor_sched_clock_irq() would wrongfully + * assume that it is the first interupt and eventually claim + * quiescient state and end grace periods prematurely. + * + * Unconditionally invoke rcu_irq_enter() so RCU state stays + * consistent. + * + * TINY_RCU does not support EQS, so let the compiler eliminate + * this part when enabled. + */ + if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { + /* + * If RCU is not watching then the same careful + * sequence vs. lockdep and tracing is required + * as in irq_enter_from_user_mode(). + */ + lockdep_hardirqs_off(CALLER_ADDR0); + rcu_irq_enter(); + instrumentation_begin(); + trace_hardirqs_off_finish(); + instrumentation_end(); + + ret.exit_rcu = true; + return ret; + } + + /* + * If RCU is watching then RCU only wants to check whether it needs + * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick() + * already contains a warning when RCU is not watching, so no point + * in having another one here. + */ + instrumentation_begin(); + rcu_irq_enter_check_tick(); + /* Use the combo lockdep/tracing function */ + trace_hardirqs_off(); + instrumentation_end(); + + return ret; +} + +void irqentry_exit_cond_resched(void) +{ + if (!preempt_count()) { + /* Sanity check RCU and thread stack */ + rcu_irq_exit_check_preempt(); + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + WARN_ON_ONCE(!on_thread_stack()); + if (need_resched()) + preempt_schedule_irq(); + } +} + +noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) +{ + lockdep_assert_irqs_disabled(); + + /* Check whether this returns to user mode */ + if (user_mode(regs)) { + irqentry_exit_to_user_mode(regs); + } else if (!regs_irqs_disabled(regs)) { + /* + * If RCU was not watching on entry this needs to be done + * carefully and needs the same ordering of lockdep/tracing + * and RCU as the return to user mode path. + */ + if (state.exit_rcu) { + instrumentation_begin(); + /* Tell the tracer that IRET will enable interrupts */ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + instrumentation_end(); + rcu_irq_exit(); + lockdep_hardirqs_on(CALLER_ADDR0); + return; + } + + instrumentation_begin(); + if (IS_ENABLED(CONFIG_PREEMPTION)) + irqentry_exit_cond_resched(); + /* Covers both tracing and lockdep */ + trace_hardirqs_on(); + instrumentation_end(); + } else { + /* + * IRQ flags state is correct already. Just tell RCU if it + * was not watching on entry. + */ + if (state.exit_rcu) + rcu_irq_exit(); + } +} diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c new file mode 100644 index 000000000000..eb1a8a4c867c --- /dev/null +++ b/kernel/entry/kvm.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/entry-kvm.h> +#include <linux/kvm_host.h> + +static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) +{ + do { + int ret; + + if (ti_work & _TIF_SIGPENDING) { + kvm_handle_signal_exit(vcpu); + return -EINTR; + } + + if (ti_work & _TIF_NEED_RESCHED) + schedule(); + + if (ti_work & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(NULL); + } + + ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work); + if (ret) + return ret; + + ti_work = READ_ONCE(current_thread_info()->flags); + } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched()); + return 0; +} + +int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) +{ + unsigned long ti_work; + + /* + * This is invoked from the outer guest loop with interrupts and + * preemption enabled. + * + * KVM invokes xfer_to_guest_mode_work_pending() with interrupts + * disabled in the inner loop before going into guest mode. No need + * to disable interrupts here. + */ + ti_work = READ_ONCE(current_thread_info()->flags); + if (!(ti_work & XFER_TO_GUEST_MODE_WORK)) + return 0; + + return xfer_to_guest_mode_work(vcpu, ti_work); +} +EXPORT_SYMBOL_GPL(xfer_to_guest_mode_handle_work); diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c2b41a263166..c6ce894e4ce9 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -16,7 +16,7 @@ struct callchain_cpus_entries { struct rcu_head rcu_head; - struct perf_callchain_entry *cpu_entries[0]; + struct perf_callchain_entry *cpu_entries[]; }; int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; @@ -149,7 +149,7 @@ void put_callchain_buffers(void) } } -static struct perf_callchain_entry *get_callchain_entry(int *rctx) +struct perf_callchain_entry *get_callchain_entry(int *rctx) { int cpu; struct callchain_cpus_entries *entries; @@ -159,8 +159,10 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) return NULL; entries = rcu_dereference(callchain_cpus_entries); - if (!entries) + if (!entries) { + put_recursion_context(this_cpu_ptr(callchain_recursion), *rctx); return NULL; + } cpu = smp_processor_id(); @@ -168,7 +170,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) (*rctx * perf_callchain_entry__sizeof())); } -static void +void put_callchain_entry(int rctx) { put_recursion_context(this_cpu_ptr(callchain_recursion), rctx); @@ -183,11 +185,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, int rctx; entry = get_callchain_entry(&rctx); - if (rctx == -1) - return NULL; - if (!entry) - goto exit_put; + return NULL; ctx.entry = entry; ctx.max_stack = max_stack; @@ -236,7 +235,7 @@ exit_put: * sysctl_perf_event_max_contexts_per_stack. */ int perf_event_max_stack_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *value = table->data; int new_value = *value, ret; diff --git a/kernel/events/core.c b/kernel/events/core.c index 633b4ae72ed5..d1f0a7e5b182 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -95,11 +95,11 @@ static void remote_function(void *data) * @info: the function call argument * * Calls the function @func when the task is currently running. This might - * be on the current CPU, which just calls the function directly + * be on the current CPU, which just calls the function directly. This will + * retry due to any failures in smp_call_function_single(), such as if the + * task_cpu() goes offline concurrently. * - * returns: @func return value, or - * -ESRCH - when the process isn't running - * -EAGAIN - when the process moved away + * returns @func return value or -ESRCH when the process isn't running */ static int task_function_call(struct task_struct *p, remote_function_f func, void *info) @@ -112,11 +112,16 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) }; int ret; - do { - ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); - if (!ret) - ret = data.ret; - } while (ret == -EAGAIN); + for (;;) { + ret = smp_call_function_single(task_cpu(p), remote_function, + &data, 1); + ret = !ret ? data.ret : -EAGAIN; + + if (ret != -EAGAIN) + break; + + cond_resched(); + } return ret; } @@ -389,6 +394,7 @@ static atomic_t nr_switch_events __read_mostly; static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; static atomic_t nr_cgroup_events __read_mostly; +static atomic_t nr_text_poke_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -437,8 +443,7 @@ static void update_perf_cpu_limits(void) static bool perf_rotate_context(struct perf_cpu_context *cpuctx); int perf_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int perf_cpu = sysctl_perf_cpu_time_max_percent; @@ -462,8 +467,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write, int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -1234,12 +1238,26 @@ static void get_ctx(struct perf_event_context *ctx) refcount_inc(&ctx->refcount); } +static void *alloc_task_ctx_data(struct pmu *pmu) +{ + if (pmu->task_ctx_cache) + return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); + + return NULL; +} + +static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) +{ + if (pmu->task_ctx_cache && task_ctx_data) + kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); +} + static void free_ctx(struct rcu_head *head) { struct perf_event_context *ctx; ctx = container_of(head, struct perf_event_context, rcu_head); - kfree(ctx->task_ctx_data); + free_task_ctx_data(ctx->pmu, ctx->task_ctx_data); kfree(ctx); } @@ -1313,7 +1331,7 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event::child_mutex; * perf_event_context::lock * perf_event::mmap_mutex - * mmap_sem + * mmap_lock * perf_addr_filters_head::lock * * cpu_hotplug_lock @@ -3077,7 +3095,7 @@ static int perf_event_stop(struct perf_event *event, int restart) * pre-existing mappings, called once when new filters arrive via SET_FILTER * ioctl; * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly - * registered mapping, called for every new mmap(), with mm::mmap_sem down + * registered mapping, called for every new mmap(), with mm::mmap_lock down * for reading; * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process * of exec. @@ -4467,7 +4485,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, goto errout; if (event->attach_state & PERF_ATTACH_TASK_DATA) { - task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL); + task_ctx_data = alloc_task_ctx_data(pmu); if (!task_ctx_data) { err = -ENOMEM; goto errout; @@ -4525,11 +4543,11 @@ retry: } } - kfree(task_ctx_data); + free_task_ctx_data(pmu, task_ctx_data); return ctx; errout: - kfree(task_ctx_data); + free_task_ctx_data(pmu, task_ctx_data); return ERR_PTR(err); } @@ -4572,7 +4590,7 @@ static bool is_sb_event(struct perf_event *event) if (attr->mmap || attr->mmap_data || attr->mmap2 || attr->comm || attr->comm_exec || attr->task || attr->ksymbol || - attr->context_switch || + attr->context_switch || attr->text_poke || attr->bpf_event) return true; return false; @@ -4648,6 +4666,8 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_dec(&nr_bpf_events); + if (event->attr.text_poke) + atomic_dec(&nr_text_poke_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) @@ -6931,12 +6951,12 @@ static u64 perf_virt_to_phys(u64 virt) * Walking the pages tables for user address. * Interrupts are disabled, so it prevents any tear down * of the page tables. - * Try IRQ-safe __get_user_pages_fast first. + * Try IRQ-safe get_user_page_fast_only first. * If failed, leave phys_addr as 0. */ if (current->mm != NULL) { pagefault_disable(); - if (__get_user_pages_fast(virt, 1, 0, &p) == 1) + if (get_user_page_fast_only(virt, 0, &p)) phys_addr = page_to_phys(p) + virt % PAGE_SIZE; pagefault_enable(); } @@ -8625,6 +8645,89 @@ void perf_event_bpf_event(struct bpf_prog *prog, perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); } +struct perf_text_poke_event { + const void *old_bytes; + const void *new_bytes; + size_t pad; + u16 old_len; + u16 new_len; + + struct { + struct perf_event_header header; + + u64 addr; + } event_id; +}; + +static int perf_event_text_poke_match(struct perf_event *event) +{ + return event->attr.text_poke; +} + +static void perf_event_text_poke_output(struct perf_event *event, void *data) +{ + struct perf_text_poke_event *text_poke_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + u64 padding = 0; + int ret; + + if (!perf_event_text_poke_match(event)) + return; + + perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); + + ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, text_poke_event->event_id); + perf_output_put(&handle, text_poke_event->old_len); + perf_output_put(&handle, text_poke_event->new_len); + + __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len); + __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len); + + if (text_poke_event->pad) + __output_copy(&handle, &padding, text_poke_event->pad); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +void perf_event_text_poke(const void *addr, const void *old_bytes, + size_t old_len, const void *new_bytes, size_t new_len) +{ + struct perf_text_poke_event text_poke_event; + size_t tot, pad; + + if (!atomic_read(&nr_text_poke_events)) + return; + + tot = sizeof(text_poke_event.old_len) + old_len; + tot += sizeof(text_poke_event.new_len) + new_len; + pad = ALIGN(tot, sizeof(u64)) - tot; + + text_poke_event = (struct perf_text_poke_event){ + .old_bytes = old_bytes, + .new_bytes = new_bytes, + .pad = pad, + .old_len = old_len, + .new_len = new_len, + .event_id = { + .header = { + .type = PERF_RECORD_TEXT_POKE, + .misc = PERF_RECORD_MISC_KERNEL, + .size = sizeof(text_poke_event.event_id) + tot + pad, + }, + .addr = (unsigned long)addr, + }, + }; + + perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL); +} + void perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; @@ -9404,7 +9507,7 @@ static int perf_kprobe_event_init(struct perf_event *event) if (event->attr.type != perf_kprobe.type) return -ENOENT; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; /* @@ -9464,7 +9567,7 @@ static int perf_uprobe_event_init(struct perf_event *event) if (event->attr.type != perf_uprobe.type) return -ENOENT; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; /* @@ -9541,6 +9644,24 @@ static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) if (IS_ERR(prog)) return PTR_ERR(prog); + if (event->attr.precise_ip && + prog->call_get_stack && + (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) || + event->attr.exclude_callchain_kernel || + event->attr.exclude_callchain_user)) { + /* + * On perf_event with precise_ip, calling bpf_get_stack() + * may trigger unwinder warnings and occasional crashes. + * bpf_get_[stack|stackid] works around this issue by using + * callchain attached to perf_sample_data. If the + * perf_event does not full (kernel and user) callchain + * attached to perf_sample_data, do not allow attaching BPF + * program that calls bpf_get_[stack|stackid]. + */ + bpf_prog_put(prog); + return -EPROTO; + } + event->prog = prog; event->orig_overflow_handler = READ_ONCE(event->overflow_handler); WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); @@ -9739,7 +9860,7 @@ static void perf_addr_filters_splice(struct perf_event *event, /* * Scan through mm's vmas and see if one of them matches the * @filter; if so, adjust filter's address range. - * Called with mm::mmap_sem down for reading. + * Called with mm::mmap_lock down for reading. */ static void perf_addr_filter_apply(struct perf_addr_filter *filter, struct mm_struct *mm, @@ -9781,7 +9902,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) if (!mm) goto restart; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); } raw_spin_lock_irqsave(&ifh->lock, flags); @@ -9807,7 +9928,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) raw_spin_unlock_irqrestore(&ifh->lock, flags); if (ifh->nr_file_filters) { - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); mmput(mm); } @@ -10942,6 +11063,8 @@ static void account_event(struct perf_event *event) atomic_inc(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_inc(&nr_bpf_events); + if (event->attr.text_poke) + atomic_inc(&nr_text_poke_events); if (inc) { /* @@ -11480,7 +11603,7 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event *group_leader = NULL, *output_event = NULL; struct perf_event *event, *sibling; struct perf_event_attr attr; - struct perf_event_context *ctx, *uninitialized_var(gctx); + struct perf_event_context *ctx, *gctx; struct file *event_file = NULL; struct fd group = {NULL, 0}; struct task_struct *task = NULL; @@ -11511,7 +11634,7 @@ SYSCALL_DEFINE5(perf_event_open, } if (attr.namespaces) { - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; } @@ -12217,7 +12340,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * When a child task exits, feed back event values to parent events. * * Can be called with exec_update_mutex held when called from - * install_exec_creds(). + * setup_new_exec(). */ void perf_event_exit_task(struct task_struct *child) { @@ -12406,8 +12529,7 @@ inherit_event(struct perf_event *parent_event, !child_ctx->task_ctx_data) { struct pmu *pmu = child_event->pmu; - child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size, - GFP_KERNEL); + child_ctx->task_ctx_data = alloc_task_ctx_data(pmu); if (!child_ctx->task_ctx_data) { free_event(child_event); return ERR_PTR(-ENOMEM); diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 3cc8416ec844..b48d7039a015 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -213,6 +213,15 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, list_del(&bp->hw.bp_list); } +__weak int arch_reserve_bp_slot(struct perf_event *bp) +{ + return 0; +} + +__weak void arch_release_bp_slot(struct perf_event *bp) +{ +} + /* * Function to perform processor-specific cleanup during unregistration */ @@ -270,6 +279,7 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) struct bp_busy_slots slots = {0}; enum bp_type_idx type; int weight; + int ret; /* We couldn't initialize breakpoint constraints on boot */ if (!constraints_initialized) @@ -294,6 +304,10 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) if (slots.pinned + (!!slots.flexible) > nr_slots[type]) return -ENOSPC; + ret = arch_reserve_bp_slot(bp); + if (ret) + return ret; + toggle_bp_slot(bp, true, type, weight); return 0; @@ -317,6 +331,8 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type) enum bp_type_idx type; int weight; + arch_release_bp_slot(bp); + type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); toggle_bp_slot(bp, false, type, weight); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index f16f66b6b655..fcbf5616a441 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -55,7 +55,7 @@ struct perf_buffer { void *aux_priv; struct perf_event_mmap_page *user_page; - void *data_pages[0]; + void *data_pages[]; }; extern void rb_free(struct perf_buffer *rb); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ece7e13f6e4a..25de10c904e6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -162,14 +162,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, }; int err; struct mmu_notifier_range range; - struct mem_cgroup *memcg; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); if (new_page) { - err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, - &memcg, false); + err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL); if (err) return err; } @@ -179,17 +177,13 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; - if (!page_vma_mapped_walk(&pvmw)) { - if (new_page) - mem_cgroup_cancel_charge(new_page, memcg, false); + if (!page_vma_mapped_walk(&pvmw)) goto unlock; - } VM_BUG_ON_PAGE(addr != pvmw.address, old_page); if (new_page) { get_page(new_page); page_add_new_anon_rmap(new_page, vma, addr, false); - mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); } else /* no new page, just dec_mm_counter for old_page */ @@ -463,7 +457,7 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. * - * Called with mm->mmap_sem held for write. + * Called with mm->mmap_lock held for write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, @@ -867,10 +861,6 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, if (ret) goto out; - /* uprobe_write_opcode() assumes we don't cross page boundary */ - BUG_ON((uprobe->offset & ~PAGE_MASK) + - UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); - smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */ set_bit(UPROBE_COPY_INSN, &uprobe->flags); @@ -1064,7 +1054,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) if (err && is_register) goto free; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); vma = find_vma(mm, info->vaddr); if (!vma || !valid_vma(vma, is_register) || file_inode(vma->vm_file) != uprobe->inode) @@ -1086,7 +1076,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) } unlock: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); free: mmput(mm); info = free_map_info(info); @@ -1166,6 +1156,15 @@ static int __uprobe_register(struct inode *inode, loff_t offset, if (offset > i_size_read(inode)) return -EINVAL; + /* + * This ensures that copy_from_page(), copy_to_page() and + * __update_ref_ctr() can't cross page boundary. + */ + if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) + return -EINVAL; + if (!IS_ALIGNED(ref_ctr_offset, sizeof(short))) + return -EINVAL; + retry: uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); if (!uprobe) @@ -1241,7 +1240,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) struct vm_area_struct *vma; int err = 0; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) { unsigned long vaddr; loff_t offset; @@ -1258,7 +1257,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) vaddr = offset_to_vaddr(vma, uprobe->offset); err |= remove_breakpoint(uprobe, mm, vaddr); } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return err; } @@ -1355,7 +1354,7 @@ static int delayed_ref_ctr_inc(struct vm_area_struct *vma) } /* - * Called from mmap_region/vma_adjust with mm->mmap_sem acquired. + * Called from mmap_region/vma_adjust with mm->mmap_lock acquired. * * Currently we ignore all errors and always return 0, the callers * can't handle the failure anyway. @@ -1445,7 +1444,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) struct vm_area_struct *vma; int ret; - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; if (mm->uprobes_state.xol_area) { @@ -1475,7 +1474,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) /* pairs with get_xol_area() */ smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */ fail: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return ret; } @@ -1674,7 +1673,7 @@ void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, copy_to_page(page, vaddr, src, len); /* - * We probably need flush_icache_user_range() but it needs vma. + * We probably need flush_icache_user_page() but it needs vma. * This should work on most of architectures by default. If * architecture needs to do something different it can define * its own version of the function. @@ -2014,6 +2013,9 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) uprobe_opcode_t opcode; int result; + if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE))) + return -EINVAL; + pagefault_disable(); result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr); pagefault_enable(); @@ -2045,7 +2047,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) struct uprobe *uprobe = NULL; struct vm_area_struct *vma; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_vma(mm, bp_vaddr); if (vma && vma->vm_start <= bp_vaddr) { if (valid_vma(vma, false)) { @@ -2063,7 +2065,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags)) mmf_recalc_uprobes(mm); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return uprobe; } @@ -2187,7 +2189,7 @@ static void handle_swbp(struct pt_regs *regs) { struct uprobe *uprobe; unsigned long bp_vaddr; - int uninitialized_var(is_swbp); + int is_swbp; bp_vaddr = uprobe_get_swbp_addr(regs); if (bp_vaddr == get_trampoline_vaddr()) @@ -2197,7 +2199,7 @@ static void handle_swbp(struct pt_regs *regs) if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ - send_sig(SIGTRAP, current, 0); + force_sig(SIGTRAP); } else { /* * Either we raced with uprobe_unregister() or we can't diff --git a/kernel/exit.c b/kernel/exit.c index ce2a75bc0ade..e731c414e024 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -66,7 +66,6 @@ #include <linux/uaccess.h> #include <asm/unistd.h> -#include <asm/pgtable.h> #include <asm/mmu_context.h> static void __unhash_process(struct task_struct *p, bool group_dead) @@ -94,7 +93,7 @@ static void __exit_signal(struct task_struct *tsk) struct signal_struct *sig = tsk->signal; bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; - struct tty_struct *uninitialized_var(tty); + struct tty_struct *tty; u64 utime, stime; sighand = rcu_dereference_check(tsk->sighand, @@ -218,6 +217,7 @@ repeat: } write_unlock_irq(&tasklist_lock); + seccomp_filter_release(p); proc_flush_pid(thread_pid); put_pid(thread_pid); release_thread(p); @@ -228,8 +228,9 @@ repeat: goto repeat; } -void rcuwait_wake_up(struct rcuwait *w) +int rcuwait_wake_up(struct rcuwait *w) { + int ret = 0; struct task_struct *task; rcu_read_lock(); @@ -237,7 +238,7 @@ void rcuwait_wake_up(struct rcuwait *w) /* * Order condition vs @task, such that everything prior to the load * of @task is visible. This is the condition as to why the user called - * rcuwait_trywake() in the first place. Pairs with set_current_state() + * rcuwait_wake() in the first place. Pairs with set_current_state() * barrier (A) in rcuwait_wait_event(). * * WAIT WAKE @@ -249,8 +250,10 @@ void rcuwait_wake_up(struct rcuwait *w) task = rcu_dereference(w->task); if (task) - wake_up_process(task); + ret = wake_up_process(task); rcu_read_unlock(); + + return ret; } EXPORT_SYMBOL_GPL(rcuwait_wake_up); @@ -438,17 +441,17 @@ static void exit_mm(void) sync_mm_rss(mm); /* * Serialize with any possible pending coredump. - * We must hold mmap_sem around checking core_state + * We must hold mmap_lock around checking core_state * and clearing tsk->mm. The core-inducing thread * will increment ->nr_threads for each thread in the * group with ->mm != NULL. */ - down_read(&mm->mmap_sem); + mmap_read_lock(mm); core_state = mm->core_state; if (core_state) { struct core_thread self; - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); self.task = current; self.next = xchg(&core_state->dumper.next, &self); @@ -466,14 +469,14 @@ static void exit_mm(void) freezable_schedule(); } __set_current_state(TASK_RUNNING); - down_read(&mm->mmap_sem); + mmap_read_lock(mm); } mmgrab(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); current->mm = NULL; - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); enter_lazy_tlb(mm, current); task_unlock(current); mm_update_next_owner(mm); @@ -708,8 +711,12 @@ void __noreturn do_exit(long code) struct task_struct *tsk = current; int group_dead; - profile_task_exit(tsk); - kcov_task_exit(tsk); + /* + * We can get here from a kernel oops, sometimes with preemption off. + * Start by checking for critical errors. + * Then fix up important state like USER_DS and preemption. + * Then do everything else. + */ WARN_ON(blk_needs_flush_plug(tsk)); @@ -727,6 +734,16 @@ void __noreturn do_exit(long code) */ set_fs(USER_DS); + if (unlikely(in_atomic())) { + pr_info("note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); + preempt_count_set(PREEMPT_ENABLED); + } + + profile_task_exit(tsk); + kcov_task_exit(tsk); + ptrace_event(PTRACE_EVENT_EXIT, code); validate_creds_for_do_exit(tsk); @@ -744,13 +761,6 @@ void __noreturn do_exit(long code) exit_signals(tsk); /* sets PF_EXITING */ - if (unlikely(in_atomic())) { - pr_info("note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); - preempt_count_set(PREEMPT_ENABLED); - } - /* sync mm's RSS info before statistics gathering */ if (tsk->mm) sync_mm_rss(tsk->mm); @@ -795,7 +805,6 @@ void __noreturn do_exit(long code) exit_task_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); - exit_umh(tsk); /* * Flush inherited counters to the parent - before the parent @@ -1558,7 +1567,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, if (!infop) return err; - if (!user_access_begin(infop, sizeof(*infop))) + if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); @@ -1567,10 +1576,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); - user_access_end(); + user_write_access_end(); return err; Efault: - user_access_end(); + user_write_access_end(); return -EFAULT; } @@ -1685,7 +1694,7 @@ COMPAT_SYSCALL_DEFINE5(waitid, if (!infop) return err; - if (!user_access_begin(infop, sizeof(*infop))) + if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); @@ -1694,14 +1703,38 @@ COMPAT_SYSCALL_DEFINE5(waitid, unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); - user_access_end(); + user_write_access_end(); return err; Efault: - user_access_end(); + user_write_access_end(); return -EFAULT; } #endif +/** + * thread_group_exited - check that a thread group has exited + * @pid: tgid of thread group to be checked. + * + * Test if the thread group represented by tgid has exited (all + * threads are zombies, dead or completely gone). + * + * Return: true if the thread group has exited. false otherwise. + */ +bool thread_group_exited(struct pid *pid) +{ + struct task_struct *task; + bool exited; + + rcu_read_lock(); + task = pid_task(pid, PIDTYPE_PID); + exited = !task || + (READ_ONCE(task->exit_state) && thread_group_empty(task)); + rcu_read_unlock(); + + return exited; +} +EXPORT_SYMBOL(thread_group_exited); + __weak void abort(void) { BUG(); diff --git a/kernel/fork.c b/kernel/fork.c index 48ed22774efa..76d3f3387554 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -94,8 +94,8 @@ #include <linux/thread_info.h> #include <linux/stackleak.h> #include <linux/kasan.h> +#include <linux/scs.h> -#include <asm/pgtable.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -359,7 +359,13 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new) { - *new = *orig; + ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); + ASSERT_EXCLUSIVE_WRITER(orig->vm_file); + /* + * orig->shared.rb may be modified concurrently, but the clone + * will be reinitialized. + */ + *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); new->vm_next = new->vm_prev = NULL; } @@ -456,6 +462,8 @@ void put_task_stack(struct task_struct *tsk) void free_task(struct task_struct *tsk) { + scs_release(tsk); + #ifndef CONFIG_THREAD_INFO_IN_TASK /* * The task is finally done with both the stack and thread_info, @@ -471,7 +479,6 @@ void free_task(struct task_struct *tsk) #endif rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); - put_seccomp_filter(tsk); arch_release_task_struct(tsk); if (tsk->flags & PF_KTHREAD) free_kthread_struct(tsk); @@ -490,7 +497,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, LIST_HEAD(uf); uprobe_start_dup_mmap(); - if (down_write_killable(&oldmm->mmap_sem)) { + if (mmap_write_lock_killable(oldmm)) { retval = -EINTR; goto fail_uprobe_end; } @@ -499,7 +506,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, /* * Not linked in yet - no deadlock potential: */ - down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); + mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); /* No ordering required: file already has been exposed. */ RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); @@ -615,9 +622,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); out: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); flush_tlb_mm(oldmm); - up_write(&oldmm->mmap_sem); + mmap_write_unlock(oldmm); dup_userfaultfd_complete(&uf); fail_uprobe_end: uprobe_end_dup_mmap(); @@ -647,9 +654,9 @@ static inline void mm_free_pgd(struct mm_struct *mm) #else static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { - down_write(&oldmm->mmap_sem); + mmap_write_lock(oldmm); RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); - up_write(&oldmm->mmap_sem); + mmap_write_unlock(oldmm); return 0; } #define mm_alloc_pgd(mm) (0) @@ -840,6 +847,8 @@ void __init fork_init(void) NULL, free_vm_stack_cache); #endif + scs_init(); + lockdep_init_task(&init_task); uprobes_init(); } @@ -899,6 +908,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (err) goto free_stack; + err = scs_prepare(tsk, node); + if (err) + goto free_stack; + #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under @@ -1014,7 +1027,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); - init_rwsem(&mm->mmap_sem); + mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; mm_pgtables_bytes_init(mm); @@ -1466,7 +1479,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk) goto out; } - newf = dup_fd(oldf, &error); + newf = dup_fd(oldf, NR_OPEN_MAX, &error); if (!newf) goto out; @@ -1683,6 +1696,11 @@ static inline void rcu_copy_process(struct task_struct *p) INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); p->rcu_tasks_idle_cpu = -1; #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + p->trc_reader_nesting = 0; + p->trc_reader_special.s = 0; + INIT_LIST_HEAD(&p->trc_holdout_list); +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } struct pid *pidfd_pid(const struct file *file) @@ -1745,7 +1763,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) pid_t nr = -1; if (likely(pid_has_task(pid, PIDTYPE_PID))) { - ns = proc_pid_ns(file_inode(m->file)); + ns = proc_pid_ns(file_inode(m->file)->i_sb); nr = pid_nr_ns(pid, ns); } @@ -1774,22 +1792,18 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) */ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { - struct task_struct *task; struct pid *pid = file->private_data; __poll_t poll_flags = 0; poll_wait(file, &pid->wait_pidfd, pts); - rcu_read_lock(); - task = pid_task(pid, PIDTYPE_PID); /* * Inform pollers only when the whole thread group exits. * If the thread group leader exits before all other threads in the * group, then poll(2) should block, similar to the wait(2) family. */ - if (!task || (task->exit_state && thread_group_empty(task))) + if (thread_group_exited(pid)) poll_flags = EPOLLIN | EPOLLRDNORM; - rcu_read_unlock(); return poll_flags; } @@ -1941,8 +1955,8 @@ static __latent_entropy struct task_struct *copy_process( rt_mutex_init_task(p); + lockdep_assert_irqs_enabled(); #ifdef CONFIG_PROVE_LOCKING - DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = -EAGAIN; @@ -1964,7 +1978,7 @@ static __latent_entropy struct task_struct *copy_process( * to stop root fork bombs. */ retval = -EAGAIN; - if (nr_threads >= max_threads) + if (data_race(nr_threads >= max_threads)) goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ @@ -2022,19 +2036,11 @@ static __latent_entropy struct task_struct *copy_process( seqcount_init(&p->mems_allowed_seq); #endif #ifdef CONFIG_TRACE_IRQFLAGS - p->irq_events = 0; - p->hardirqs_enabled = 0; - p->hardirq_enable_ip = 0; - p->hardirq_enable_event = 0; - p->hardirq_disable_ip = _THIS_IP_; - p->hardirq_disable_event = 0; - p->softirqs_enabled = 1; - p->softirq_enable_ip = _THIS_IP_; - p->softirq_enable_event = 0; - p->softirq_disable_ip = 0; - p->softirq_disable_event = 0; - p->hardirq_context = 0; - p->softirq_context = 0; + memset(&p->irqtrace, 0, sizeof(p->irqtrace)); + p->irqtrace.hardirq_disable_ip = _THIS_IP_; + p->irqtrace.softirq_enable_ip = _THIS_IP_; + p->softirqs_enabled = 1; + p->softirq_context = 0; #endif p->pagefault_disabled = 0; @@ -2091,8 +2097,7 @@ static __latent_entropy struct task_struct *copy_process( retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; - retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p, - args->tls); + retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls); if (retval) goto bad_fork_cleanup_io; @@ -2291,6 +2296,7 @@ static __latent_entropy struct task_struct *copy_process( write_unlock_irq(&tasklist_lock); proc_fork_connector(p); + sched_post_fork(p); cgroup_post_fork(p, args); perf_event_fork(p); @@ -2410,6 +2416,20 @@ long _do_fork(struct kernel_clone_args *args) long nr; /* + * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument + * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are + * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate + * field in struct clone_args and it still doesn't make sense to have + * them both point at the same memory location. Performing this check + * here has the advantage that we don't need to have a separate helper + * to check for legacy clone(). + */ + if ((args->flags & CLONE_PIDFD) && + (args->flags & CLONE_PARENT_SETTID) && + (args->pidfd == args->parent_tid)) + return -EINVAL; + + /* * Determine whether and which event to report to ptracer. When * called from kernel_thread or CLONE_UNTRACED is explicitly * requested, no event is reported; otherwise, report if the event @@ -2466,42 +2486,6 @@ long _do_fork(struct kernel_clone_args *args) return nr; } -bool legacy_clone_args_valid(const struct kernel_clone_args *kargs) -{ - /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */ - if ((kargs->flags & CLONE_PIDFD) && - (kargs->flags & CLONE_PARENT_SETTID)) - return false; - - return true; -} - -#ifndef CONFIG_HAVE_COPY_THREAD_TLS -/* For compatibility with architectures that call do_fork directly rather than - * using the syscall entry points below. */ -long do_fork(unsigned long clone_flags, - unsigned long stack_start, - unsigned long stack_size, - int __user *parent_tidptr, - int __user *child_tidptr) -{ - struct kernel_clone_args args = { - .flags = (lower_32_bits(clone_flags) & ~CSIGNAL), - .pidfd = parent_tidptr, - .child_tid = child_tidptr, - .parent_tid = parent_tidptr, - .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL), - .stack = stack_start, - .stack_size = stack_size, - }; - - if (!legacy_clone_args_valid(&args)) - return -EINVAL; - - return _do_fork(&args); -} -#endif - /* * Create a kernel thread. */ @@ -2580,24 +2564,12 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, .tls = tls, }; - if (!legacy_clone_args_valid(&args)) - return -EINVAL; - return _do_fork(&args); } #endif #ifdef __ARCH_WANT_SYS_CLONE3 -/* - * copy_thread implementations handle CLONE_SETTLS by reading the TLS value from - * the registers containing the syscall arguments for clone. This doesn't work - * with clone3 since the TLS value is passed in clone_args instead. - */ -#ifndef CONFIG_HAVE_COPY_THREAD_TLS -#error clone3 requires copy_thread_tls support in arch -#endif - noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t usize) @@ -2894,14 +2866,15 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) /* * Unshare file descriptor table if it is being shared */ -static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) +int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, + struct files_struct **new_fdp) { struct files_struct *fd = current->files; int error = 0; if ((unshare_flags & CLONE_FILES) && (fd && atomic_read(&fd->count) > 1)) { - *new_fdp = dup_fd(fd, &error); + *new_fdp = dup_fd(fd, max_fds, &error); if (!*new_fdp) return error; } @@ -2912,7 +2885,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* - * functions used by do_fork() cannot be used here directly + * functions used by _do_fork() cannot be used here directly * because they modify an inactive task_struct that is being * constructed. Here we are modifying the current, active, * task_struct. @@ -2961,7 +2934,7 @@ int ksys_unshare(unsigned long unshare_flags) err = unshare_fs(unshare_flags, &new_fs); if (err) goto bad_unshare_out; - err = unshare_fd(unshare_flags, &new_fd); + err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd); if (err) goto bad_unshare_cleanup_fs; err = unshare_userns(unshare_flags, &new_cred); @@ -3050,7 +3023,7 @@ int unshare_files(struct files_struct **displaced) struct files_struct *copy = NULL; int error; - error = unshare_fd(CLONE_FILES, ©); + error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, ©); if (error || !copy) { *displaced = NULL; return error; diff --git a/kernel/futex.c b/kernel/futex.c index b59532862bc0..83404124b77b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -32,30 +32,13 @@ * "But they come in a choice of three flavours!" */ #include <linux/compat.h> -#include <linux/slab.h> -#include <linux/poll.h> -#include <linux/fs.h> -#include <linux/file.h> #include <linux/jhash.h> -#include <linux/init.h> -#include <linux/futex.h> -#include <linux/mount.h> #include <linux/pagemap.h> #include <linux/syscalls.h> -#include <linux/signal.h> -#include <linux/export.h> -#include <linux/magic.h> -#include <linux/pid.h> -#include <linux/nsproxy.h> -#include <linux/ptrace.h> -#include <linux/sched/rt.h> -#include <linux/sched/wake_q.h> -#include <linux/sched/mm.h> #include <linux/hugetlb.h> #include <linux/freezer.h> #include <linux/memblock.h> #include <linux/fault-inject.h> -#include <linux/refcount.h> #include <asm/futex.h> @@ -476,7 +459,7 @@ static u64 get_inode_sequence_number(struct inode *inode) /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex - * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED + * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED * @key: address where result is stored. * @rw: mapping needs to be read/write (values: FUTEX_READ, * FUTEX_WRITE) @@ -486,10 +469,13 @@ static u64 get_inode_sequence_number(struct inode *inode) * The key words are stored in @key on success. * * For shared mappings (when @fshared), the key is: + * * ( inode->i_sequence, page->index, offset_within_page ) + * * [ also see get_inode_sequence_number() ] * * For private mappings (or when !@fshared), the key is: + * * ( current->mm, address, 0 ) * * This allows (cross process, where applicable) identification of the futex @@ -497,8 +483,8 @@ static u64 get_inode_sequence_number(struct inode *inode) * * lock_page() might sleep, the caller should not hold a spinlock. */ -static int -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw) +static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, + enum futex_access rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -535,7 +521,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a again: /* Ignore any VERIFY_READ mapping (futex common case) */ - if (unlikely(should_fail_futex(fshared))) + if (unlikely(should_fail_futex(true))) return -EFAULT; err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); @@ -623,7 +609,7 @@ again: * A RO anonymous page will never change and thus doesn't make * sense for futex operations. */ - if (unlikely(should_fail_futex(fshared)) || ro) { + if (unlikely(should_fail_futex(true)) || ro) { err = -EFAULT; goto out; } @@ -674,10 +660,6 @@ out: return err; } -static inline void put_futex_key(union futex_key *key) -{ -} - /** * fault_in_user_writeable() - Fault in user address and verify RW access * @uaddr: pointer to faulting user space address @@ -695,10 +677,10 @@ static int fault_in_user_writeable(u32 __user *uaddr) struct mm_struct *mm = current->mm; int ret; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); ret = fixup_user_fault(current, mm, (unsigned long)uaddr, FAULT_FLAG_WRITE, NULL); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return ret < 0 ? ret : 0; } @@ -1323,7 +1305,7 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval, static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) { int err; - u32 uninitialized_var(curval); + u32 curval; if (unlikely(should_fail_futex(true))) return -EFAULT; @@ -1493,7 +1475,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) */ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) { - u32 uninitialized_var(curval), newval; + u32 curval, newval; struct task_struct *new_owner; bool postunlock = false; DEFINE_WAKE_Q(wake_q); @@ -1608,13 +1590,13 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; hb = hash_futex(&key); /* Make sure we really have tasks to wakeup */ if (!hb_waiters_pending(hb)) - goto out_put_key; + return ret; spin_lock(&hb->lock); @@ -1637,9 +1619,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) spin_unlock(&hb->lock); wake_up_q(&wake_q); -out_put_key: - put_futex_key(&key); -out: return ret; } @@ -1706,10 +1685,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) - goto out_put_key1; + return ret; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -1727,13 +1706,13 @@ retry_private: * an MMU, but we might get them from range checking */ ret = op_ret; - goto out_put_keys; + return ret; } if (op_ret == -EFAULT) { ret = fault_in_user_writeable(uaddr2); if (ret) - goto out_put_keys; + return ret; } if (!(flags & FLAGS_SHARED)) { @@ -1741,8 +1720,6 @@ retry_private: goto retry_private; } - put_futex_key(&key2); - put_futex_key(&key1); cond_resched(); goto retry; } @@ -1778,11 +1755,6 @@ retry_private: out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); -out_put_keys: - put_futex_key(&key2); -out_put_key1: - put_futex_key(&key1); -out: return ret; } @@ -1989,20 +1961,18 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, requeue_pi ? FUTEX_WRITE : FUTEX_READ); if (unlikely(ret != 0)) - goto out_put_key1; + return ret; /* * The check above which compares uaddrs is not sufficient for * shared futexes. We need to compare the keys: */ - if (requeue_pi && match_futex(&key1, &key2)) { - ret = -EINVAL; - goto out_put_keys; - } + if (requeue_pi && match_futex(&key1, &key2)) + return -EINVAL; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -2022,13 +1992,11 @@ retry_private: ret = get_user(curval, uaddr1); if (ret) - goto out_put_keys; + return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&key2); - put_futex_key(&key1); goto retry; } if (curval != *cmpval) { @@ -2087,12 +2055,10 @@ retry_private: case -EFAULT: double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); - put_futex_key(&key2); - put_futex_key(&key1); ret = fault_in_user_writeable(uaddr2); if (!ret) goto retry; - goto out; + return ret; case -EBUSY: case -EAGAIN: /* @@ -2103,8 +2069,6 @@ retry_private: */ double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); - put_futex_key(&key2); - put_futex_key(&key1); /* * Handle the case where the owner is in the middle of * exiting. Wait for the exit to complete otherwise @@ -2213,12 +2177,6 @@ out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); hb_waiters_dec(hb2); - -out_put_keys: - put_futex_key(&key2); -out_put_key1: - put_futex_key(&key1); -out: return ret ? ret : task_count; } @@ -2367,7 +2325,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, struct task_struct *argowner) { struct futex_pi_state *pi_state = q->pi_state; - u32 uval, uninitialized_var(curval), newval; + u32 uval, curval, newval; struct task_struct *oldowner, *newowner; u32 newtid; int ret, err = 0; @@ -2564,7 +2522,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) */ if (q->pi_state->owner != current) ret = fixup_pi_state_owner(uaddr, q, current); - goto out; + return ret ? ret : locked; } /* @@ -2577,7 +2535,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) */ if (q->pi_state->owner == current) { ret = fixup_pi_state_owner(uaddr, q, NULL); - goto out; + return ret; } /* @@ -2591,8 +2549,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) q->pi_state->owner); } -out: - return ret ? ret : locked; + return ret; } /** @@ -2689,12 +2646,11 @@ retry_private: ret = get_user(uval, uaddr); if (ret) - goto out; + return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&q->key); goto retry; } @@ -2703,9 +2659,6 @@ retry_private: ret = -EWOULDBLOCK; } -out: - if (ret) - put_futex_key(&q->key); return ret; } @@ -2850,7 +2803,6 @@ retry_private: * - EAGAIN: The user space value changed. */ queue_unlock(hb); - put_futex_key(&q.key); /* * Handle the case where the owner is in the middle of * exiting. Wait for the exit to complete otherwise @@ -2958,13 +2910,11 @@ no_block: put_pi_state(pi_state); } - goto out_put_key; + goto out; out_unlock_put_key: queue_unlock(hb); -out_put_key: - put_futex_key(&q.key); out: if (to) { hrtimer_cancel(&to->timer); @@ -2977,12 +2927,11 @@ uaddr_faulted: ret = fault_in_user_writeable(uaddr); if (ret) - goto out_put_key; + goto out; if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&q.key); goto retry; } @@ -2993,7 +2942,7 @@ uaddr_faulted: */ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { - u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current); + u32 curval, uval, vpid = task_pid_vnr(current); union futex_key key = FUTEX_KEY_INIT; struct futex_hash_bucket *hb; struct futex_q *top_waiter; @@ -3111,16 +3060,13 @@ retry: out_unlock: spin_unlock(&hb->lock); out_putkey: - put_futex_key(&key); return ret; pi_retry: - put_futex_key(&key); cond_resched(); goto retry; pi_faulted: - put_futex_key(&key); ret = fault_in_user_writeable(uaddr); if (!ret) @@ -3262,7 +3208,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) - goto out_key2; + goto out; /* * The check above which compares uaddrs is not sufficient for @@ -3271,7 +3217,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (match_futex(&q.key, &key2)) { queue_unlock(hb); ret = -EINVAL; - goto out_put_keys; + goto out; } /* Queue the futex_q, drop the hb lock, wait for wakeup. */ @@ -3281,7 +3227,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); spin_unlock(&hb->lock); if (ret) - goto out_put_keys; + goto out; /* * In order for us to be here, we know our q.key == key2, and since @@ -3371,11 +3317,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = -EWOULDBLOCK; } -out_put_keys: - put_futex_key(&q.key); -out_key2: - put_futex_key(&key2); - out: if (to) { hrtimer_cancel(&to->timer); @@ -3476,7 +3417,7 @@ err_unlock: static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, bool pi, bool pending_op) { - u32 uval, uninitialized_var(nval), mval; + u32 uval, nval, mval; int err; /* Futex address must be 32bit aligned */ @@ -3606,7 +3547,7 @@ static void exit_robust_list(struct task_struct *curr) struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; - unsigned int uninitialized_var(next_pi); + unsigned int next_pi; unsigned long futex_offset; int rc; @@ -3906,7 +3847,7 @@ static void compat_exit_robust_list(struct task_struct *curr) struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; - unsigned int uninitialized_var(next_pi); + unsigned int next_pi; compat_uptr_t uentry, next_uentry, upending; compat_long_t futex_offset; int rc; diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 3941a9c48f83..3110c77230c7 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -6,7 +6,7 @@ config GCOV_KERNEL depends on DEBUG_FS select CONSTRUCTORS if !UML default n - ---help--- + help This option enables gcov-based code profiling (e.g. for code coverage measurements). @@ -42,7 +42,7 @@ config GCOV_PROFILE_ALL depends on GCOV_KERNEL depends on ARCH_HAS_GCOV_PROFILE_ALL default n - ---help--- + help This options activates profiling for the entire kernel. If unsure, say N. @@ -51,28 +51,4 @@ config GCOV_PROFILE_ALL larger and run slower. Also be sure to exclude files from profiling which are not linked to the kernel image to prevent linker errors. -choice - prompt "Specify GCOV format" - depends on GCOV_KERNEL - depends on CC_IS_GCC - ---help--- - The gcov format is usually determined by the GCC version, and the - default is chosen according to your GCC version. However, there are - exceptions where format changes are integrated in lower-version GCCs. - In such a case, change this option to adjust the format used in the - kernel accordingly. - -config GCOV_FORMAT_3_4 - bool "GCC 3.4 format" - depends on GCC_VERSION < 40700 - ---help--- - Select this option to use the format defined by GCC 3.4. - -config GCOV_FORMAT_4_7 - bool "GCC 4.7 format" - ---help--- - Select this option to use the format defined by GCC 4.7. - -endchoice - endmenu diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index d66a74b0f100..16f8ecc7d882 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -2,6 +2,5 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-y := base.o fs.o -obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o -obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o +obj-$(CONFIG_CC_IS_GCC) += gcc_base.o gcc_4_7.o obj-$(CONFIG_CC_IS_CLANG) += clang.o diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c deleted file mode 100644 index acb83558e5df..000000000000 --- a/kernel/gcov/gcc_3_4.c +++ /dev/null @@ -1,573 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * This code provides functions to handle gcc's profiling data format - * introduced with gcc 3.4. Future versions of gcc may change the gcov - * format (as happened before), so all format-specific information needs - * to be kept modular and easily exchangeable. - * - * This file is based on gcc-internal definitions. Functions and data - * structures are defined to be compatible with gcc counterparts. - * For a better understanding, refer to gcc source: gcc/gcov-io.h. - * - * Copyright IBM Corp. 2009 - * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com> - * - * Uses gcc-internal data definitions. - */ - -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/seq_file.h> -#include <linux/vmalloc.h> -#include "gcov.h" - -#define GCOV_COUNTERS 5 - -static struct gcov_info *gcov_info_head; - -/** - * struct gcov_fn_info - profiling meta data per function - * @ident: object file-unique function identifier - * @checksum: function checksum - * @n_ctrs: number of values per counter type belonging to this function - * - * This data is generated by gcc during compilation and doesn't change - * at run-time. - */ -struct gcov_fn_info { - unsigned int ident; - unsigned int checksum; - unsigned int n_ctrs[]; -}; - -/** - * struct gcov_ctr_info - profiling data per counter type - * @num: number of counter values for this type - * @values: array of counter values for this type - * @merge: merge function for counter values of this type (unused) - * - * This data is generated by gcc during compilation and doesn't change - * at run-time with the exception of the values array. - */ -struct gcov_ctr_info { - unsigned int num; - gcov_type *values; - void (*merge)(gcov_type *, unsigned int); -}; - -/** - * struct gcov_info - profiling data per object file - * @version: gcov version magic indicating the gcc version used for compilation - * @next: list head for a singly-linked list - * @stamp: time stamp - * @filename: name of the associated gcov data file - * @n_functions: number of instrumented functions - * @functions: function data - * @ctr_mask: mask specifying which counter types are active - * @counts: counter data per counter type - * - * This data is generated by gcc during compilation and doesn't change - * at run-time with the exception of the next pointer. - */ -struct gcov_info { - unsigned int version; - struct gcov_info *next; - unsigned int stamp; - const char *filename; - unsigned int n_functions; - const struct gcov_fn_info *functions; - unsigned int ctr_mask; - struct gcov_ctr_info counts[]; -}; - -/** - * gcov_info_filename - return info filename - * @info: profiling data set - */ -const char *gcov_info_filename(struct gcov_info *info) -{ - return info->filename; -} - -/** - * gcov_info_version - return info version - * @info: profiling data set - */ -unsigned int gcov_info_version(struct gcov_info *info) -{ - return info->version; -} - -/** - * gcov_info_next - return next profiling data set - * @info: profiling data set - * - * Returns next gcov_info following @info or first gcov_info in the chain if - * @info is %NULL. - */ -struct gcov_info *gcov_info_next(struct gcov_info *info) -{ - if (!info) - return gcov_info_head; - - return info->next; -} - -/** - * gcov_info_link - link/add profiling data set to the list - * @info: profiling data set - */ -void gcov_info_link(struct gcov_info *info) -{ - info->next = gcov_info_head; - gcov_info_head = info; -} - -/** - * gcov_info_unlink - unlink/remove profiling data set from the list - * @prev: previous profiling data set - * @info: profiling data set - */ -void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) -{ - if (prev) - prev->next = info->next; - else - gcov_info_head = info->next; -} - -/** - * gcov_info_within_module - check if a profiling data set belongs to a module - * @info: profiling data set - * @mod: module - * - * Returns true if profiling data belongs module, false otherwise. - */ -bool gcov_info_within_module(struct gcov_info *info, struct module *mod) -{ - return within_module((unsigned long)info, mod); -} - -/* Symbolic links to be created for each profiling data file. */ -const struct gcov_link gcov_link[] = { - { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ - { 0, NULL}, -}; - -/* - * Determine whether a counter is active. Based on gcc magic. Doesn't change - * at run-time. - */ -static int counter_active(struct gcov_info *info, unsigned int type) -{ - return (1 << type) & info->ctr_mask; -} - -/* Determine number of active counters. Based on gcc magic. */ -static unsigned int num_counter_active(struct gcov_info *info) -{ - unsigned int i; - unsigned int result = 0; - - for (i = 0; i < GCOV_COUNTERS; i++) { - if (counter_active(info, i)) - result++; - } - return result; -} - -/** - * gcov_info_reset - reset profiling data to zero - * @info: profiling data set - */ -void gcov_info_reset(struct gcov_info *info) -{ - unsigned int active = num_counter_active(info); - unsigned int i; - - for (i = 0; i < active; i++) { - memset(info->counts[i].values, 0, - info->counts[i].num * sizeof(gcov_type)); - } -} - -/** - * gcov_info_is_compatible - check if profiling data can be added - * @info1: first profiling data set - * @info2: second profiling data set - * - * Returns non-zero if profiling data can be added, zero otherwise. - */ -int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) -{ - return (info1->stamp == info2->stamp); -} - -/** - * gcov_info_add - add up profiling data - * @dest: profiling data set to which data is added - * @source: profiling data set which is added - * - * Adds profiling counts of @source to @dest. - */ -void gcov_info_add(struct gcov_info *dest, struct gcov_info *source) -{ - unsigned int i; - unsigned int j; - - for (i = 0; i < num_counter_active(dest); i++) { - for (j = 0; j < dest->counts[i].num; j++) { - dest->counts[i].values[j] += - source->counts[i].values[j]; - } - } -} - -/* Get size of function info entry. Based on gcc magic. */ -static size_t get_fn_size(struct gcov_info *info) -{ - size_t size; - - size = sizeof(struct gcov_fn_info) + num_counter_active(info) * - sizeof(unsigned int); - if (__alignof__(struct gcov_fn_info) > sizeof(unsigned int)) - size = ALIGN(size, __alignof__(struct gcov_fn_info)); - return size; -} - -/* Get address of function info entry. Based on gcc magic. */ -static struct gcov_fn_info *get_fn_info(struct gcov_info *info, unsigned int fn) -{ - return (struct gcov_fn_info *) - ((char *) info->functions + fn * get_fn_size(info)); -} - -/** - * gcov_info_dup - duplicate profiling data set - * @info: profiling data set to duplicate - * - * Return newly allocated duplicate on success, %NULL on error. - */ -struct gcov_info *gcov_info_dup(struct gcov_info *info) -{ - struct gcov_info *dup; - unsigned int i; - unsigned int active; - - /* Duplicate gcov_info. */ - active = num_counter_active(info); - dup = kzalloc(struct_size(dup, counts, active), GFP_KERNEL); - if (!dup) - return NULL; - dup->version = info->version; - dup->stamp = info->stamp; - dup->n_functions = info->n_functions; - dup->ctr_mask = info->ctr_mask; - /* Duplicate filename. */ - dup->filename = kstrdup(info->filename, GFP_KERNEL); - if (!dup->filename) - goto err_free; - /* Duplicate table of functions. */ - dup->functions = kmemdup(info->functions, info->n_functions * - get_fn_size(info), GFP_KERNEL); - if (!dup->functions) - goto err_free; - /* Duplicate counter arrays. */ - for (i = 0; i < active ; i++) { - struct gcov_ctr_info *ctr = &info->counts[i]; - size_t size = ctr->num * sizeof(gcov_type); - - dup->counts[i].num = ctr->num; - dup->counts[i].merge = ctr->merge; - dup->counts[i].values = vmalloc(size); - if (!dup->counts[i].values) - goto err_free; - memcpy(dup->counts[i].values, ctr->values, size); - } - return dup; - -err_free: - gcov_info_free(dup); - return NULL; -} - -/** - * gcov_info_free - release memory for profiling data set duplicate - * @info: profiling data set duplicate to free - */ -void gcov_info_free(struct gcov_info *info) -{ - unsigned int active = num_counter_active(info); - unsigned int i; - - for (i = 0; i < active ; i++) - vfree(info->counts[i].values); - kfree(info->functions); - kfree(info->filename); - kfree(info); -} - -/** - * struct type_info - iterator helper array - * @ctr_type: counter type - * @offset: index of the first value of the current function for this type - * - * This array is needed to convert the in-memory data format into the in-file - * data format: - * - * In-memory: - * for each counter type - * for each function - * values - * - * In-file: - * for each function - * for each counter type - * values - * - * See gcc source gcc/gcov-io.h for more information on data organization. - */ -struct type_info { - int ctr_type; - unsigned int offset; -}; - -/** - * struct gcov_iterator - specifies current file position in logical records - * @info: associated profiling data - * @record: record type - * @function: function number - * @type: counter type - * @count: index into values array - * @num_types: number of counter types - * @type_info: helper array to get values-array offset for current function - */ -struct gcov_iterator { - struct gcov_info *info; - - int record; - unsigned int function; - unsigned int type; - unsigned int count; - - int num_types; - struct type_info type_info[]; -}; - -static struct gcov_fn_info *get_func(struct gcov_iterator *iter) -{ - return get_fn_info(iter->info, iter->function); -} - -static struct type_info *get_type(struct gcov_iterator *iter) -{ - return &iter->type_info[iter->type]; -} - -/** - * gcov_iter_new - allocate and initialize profiling data iterator - * @info: profiling data set to be iterated - * - * Return file iterator on success, %NULL otherwise. - */ -struct gcov_iterator *gcov_iter_new(struct gcov_info *info) -{ - struct gcov_iterator *iter; - - iter = kzalloc(struct_size(iter, type_info, num_counter_active(info)), - GFP_KERNEL); - if (iter) - iter->info = info; - - return iter; -} - -/** - * gcov_iter_free - release memory for iterator - * @iter: file iterator to free - */ -void gcov_iter_free(struct gcov_iterator *iter) -{ - kfree(iter); -} - -/** - * gcov_iter_get_info - return profiling data set for given file iterator - * @iter: file iterator - */ -struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) -{ - return iter->info; -} - -/** - * gcov_iter_start - reset file iterator to starting position - * @iter: file iterator - */ -void gcov_iter_start(struct gcov_iterator *iter) -{ - int i; - - iter->record = 0; - iter->function = 0; - iter->type = 0; - iter->count = 0; - iter->num_types = 0; - for (i = 0; i < GCOV_COUNTERS; i++) { - if (counter_active(iter->info, i)) { - iter->type_info[iter->num_types].ctr_type = i; - iter->type_info[iter->num_types++].offset = 0; - } - } -} - -/* Mapping of logical record number to actual file content. */ -#define RECORD_FILE_MAGIC 0 -#define RECORD_GCOV_VERSION 1 -#define RECORD_TIME_STAMP 2 -#define RECORD_FUNCTION_TAG 3 -#define RECORD_FUNCTON_TAG_LEN 4 -#define RECORD_FUNCTION_IDENT 5 -#define RECORD_FUNCTION_CHECK 6 -#define RECORD_COUNT_TAG 7 -#define RECORD_COUNT_LEN 8 -#define RECORD_COUNT 9 - -/** - * gcov_iter_next - advance file iterator to next logical record - * @iter: file iterator - * - * Return zero if new position is valid, non-zero if iterator has reached end. - */ -int gcov_iter_next(struct gcov_iterator *iter) -{ - switch (iter->record) { - case RECORD_FILE_MAGIC: - case RECORD_GCOV_VERSION: - case RECORD_FUNCTION_TAG: - case RECORD_FUNCTON_TAG_LEN: - case RECORD_FUNCTION_IDENT: - case RECORD_COUNT_TAG: - /* Advance to next record */ - iter->record++; - break; - case RECORD_COUNT: - /* Advance to next count */ - iter->count++; - /* fall through */ - case RECORD_COUNT_LEN: - if (iter->count < get_func(iter)->n_ctrs[iter->type]) { - iter->record = 9; - break; - } - /* Advance to next counter type */ - get_type(iter)->offset += iter->count; - iter->count = 0; - iter->type++; - /* fall through */ - case RECORD_FUNCTION_CHECK: - if (iter->type < iter->num_types) { - iter->record = 7; - break; - } - /* Advance to next function */ - iter->type = 0; - iter->function++; - /* fall through */ - case RECORD_TIME_STAMP: - if (iter->function < iter->info->n_functions) - iter->record = 3; - else - iter->record = -1; - break; - } - /* Check for EOF. */ - if (iter->record == -1) - return -EINVAL; - else - return 0; -} - -/** - * seq_write_gcov_u32 - write 32 bit number in gcov format to seq_file - * @seq: seq_file handle - * @v: value to be stored - * - * Number format defined by gcc: numbers are recorded in the 32 bit - * unsigned binary form of the endianness of the machine generating the - * file. - */ -static int seq_write_gcov_u32(struct seq_file *seq, u32 v) -{ - return seq_write(seq, &v, sizeof(v)); -} - -/** - * seq_write_gcov_u64 - write 64 bit number in gcov format to seq_file - * @seq: seq_file handle - * @v: value to be stored - * - * Number format defined by gcc: numbers are recorded in the 32 bit - * unsigned binary form of the endianness of the machine generating the - * file. 64 bit numbers are stored as two 32 bit numbers, the low part - * first. - */ -static int seq_write_gcov_u64(struct seq_file *seq, u64 v) -{ - u32 data[2]; - - data[0] = (v & 0xffffffffUL); - data[1] = (v >> 32); - return seq_write(seq, data, sizeof(data)); -} - -/** - * gcov_iter_write - write data for current pos to seq_file - * @iter: file iterator - * @seq: seq_file handle - * - * Return zero on success, non-zero otherwise. - */ -int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) -{ - int rc = -EINVAL; - - switch (iter->record) { - case RECORD_FILE_MAGIC: - rc = seq_write_gcov_u32(seq, GCOV_DATA_MAGIC); - break; - case RECORD_GCOV_VERSION: - rc = seq_write_gcov_u32(seq, iter->info->version); - break; - case RECORD_TIME_STAMP: - rc = seq_write_gcov_u32(seq, iter->info->stamp); - break; - case RECORD_FUNCTION_TAG: - rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION); - break; - case RECORD_FUNCTON_TAG_LEN: - rc = seq_write_gcov_u32(seq, 2); - break; - case RECORD_FUNCTION_IDENT: - rc = seq_write_gcov_u32(seq, get_func(iter)->ident); - break; - case RECORD_FUNCTION_CHECK: - rc = seq_write_gcov_u32(seq, get_func(iter)->checksum); - break; - case RECORD_COUNT_TAG: - rc = seq_write_gcov_u32(seq, - GCOV_TAG_FOR_COUNTER(get_type(iter)->ctr_type)); - break; - case RECORD_COUNT_LEN: - rc = seq_write_gcov_u32(seq, - get_func(iter)->n_ctrs[iter->type] * 2); - break; - case RECORD_COUNT: - rc = seq_write_gcov_u64(seq, - iter->info->counts[iter->type]. - values[iter->count + get_type(iter)->offset]); - break; - } - return rc; -} diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index e13ca842eb7e..c1510f0ab3ea 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -88,7 +88,7 @@ find $cpio_dir -type f -print0 | find $cpio_dir -printf "./%P\n" | LC_ALL=C sort | \ tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ --owner=0 --group=0 --numeric-owner --no-recursion \ - -Jcf $tarfile -C $cpio_dir/ -T - > /dev/null + -I $XZ -cf $tarfile -C $cpio_dir/ -T - > /dev/null echo $headers_md5 > kernel/kheaders.md5 echo "$this_file_md5" >> kernel/kheaders.md5 diff --git a/kernel/groups.c b/kernel/groups.c index daae2f2dc6d4..6ee6691f6839 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -20,7 +20,7 @@ struct group_info *groups_alloc(int gidsetsize) len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); if (!gi) - gi = __vmalloc(len, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); + gi = __vmalloc(len, GFP_KERNEL_ACCOUNT); if (!gi) return NULL; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 14a625c16cb3..ce76f490126c 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -53,9 +53,18 @@ int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; static bool hung_task_show_lock; static bool hung_task_call_panic; +static bool hung_task_show_all_bt; static struct task_struct *watchdog_task; +#ifdef CONFIG_SMP +/* + * Should we dump all CPUs backtraces in a hung task event? + * Defaults to 0, can be changed via sysctl. + */ +unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; +#endif /* CONFIG_SMP */ + /* * Should we panic (and reboot, if panic_timeout= is set) when a * hung task is detected: @@ -63,16 +72,6 @@ static struct task_struct *watchdog_task; unsigned int __read_mostly sysctl_hung_task_panic = CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; -static int __init hung_task_panic_setup(char *str) -{ - int rc = kstrtouint(str, 0, &sysctl_hung_task_panic); - - if (rc) - return rc; - return 1; -} -__setup("hung_task_panic=", hung_task_panic_setup); - static int hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) { @@ -137,6 +136,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) " disables this message.\n"); sched_show_task(t); hung_task_show_lock = true; + + if (sysctl_hung_task_all_cpu_backtrace) + hung_task_show_all_bt = true; } touch_nmi_watchdog(); @@ -201,10 +203,14 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_unlock(); if (hung_task_show_lock) debug_show_all_locks(); - if (hung_task_call_panic) { + + if (hung_task_show_all_bt) { + hung_task_show_all_bt = false; trigger_all_cpu_backtrace(); - panic("hung_task: blocked tasks"); } + + if (hung_task_call_panic) + panic("hung_task: blocked tasks"); } static long hung_timeout_jiffies(unsigned long last_checked, diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 20d501af4f2e..10a5aff4eecc 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -51,10 +51,6 @@ config GENERIC_IRQ_INJECTION config HARDIRQS_SW_RESEND bool -# Preflow handler support for fasteoi (sparc64) -config IRQ_PREFLOW_FASTEOI - bool - # Edge style eoi based handler (cell) config IRQ_EDGE_EOI_HANDLER bool @@ -72,6 +68,7 @@ config IRQ_DOMAIN config IRQ_SIM bool select IRQ_WORK + select IRQ_DOMAIN # Support for hierarchical irq domains config IRQ_DOMAIN_HIERARCHY @@ -117,7 +114,7 @@ config IRQ_FORCED_THREADING config SPARSE_IRQ bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ - ---help--- + help Sparse irq numbering is useful for distro kernels that want to define a high CONFIG_NR_CPUS value but still want to have @@ -133,7 +130,7 @@ config GENERIC_IRQ_DEBUGFS depends on DEBUG_FS select GENERIC_IRQ_INJECTION default n - ---help--- + help Exposes internal state information through debugfs. Mostly for developers and debugging of hard to diagnose interrupt problems. diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 41e7e37a0928..857f5f4c8098 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -656,16 +656,6 @@ out_unlock: } EXPORT_SYMBOL_GPL(handle_level_irq); -#ifdef CONFIG_IRQ_PREFLOW_FASTEOI -static inline void preflow_handler(struct irq_desc *desc) -{ - if (desc->preflow_handler) - desc->preflow_handler(&desc->irq_data); -} -#else -static inline void preflow_handler(struct irq_desc *desc) { } -#endif - static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) { if (!(desc->istate & IRQS_ONESHOT)) { @@ -721,7 +711,6 @@ void handle_fasteoi_irq(struct irq_desc *desc) if (desc->istate & IRQS_ONESHOT) mask_irq(desc); - preflow_handler(desc); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); @@ -1231,7 +1220,6 @@ void handle_fasteoi_ack_irq(struct irq_desc *desc) /* Start handling the irq */ desc->irq_data.chip->irq_ack(&desc->irq_data); - preflow_handler(desc); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); @@ -1281,7 +1269,6 @@ void handle_fasteoi_mask_irq(struct irq_desc *desc) if (desc->istate & IRQS_ONESHOT) mask_irq(desc); - preflow_handler(desc); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); @@ -1478,6 +1465,7 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data) return 0; } +EXPORT_SYMBOL_GPL(irq_chip_retrigger_hierarchy); /** * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt @@ -1492,7 +1480,7 @@ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info) return -ENOSYS; } - +EXPORT_SYMBOL_GPL(irq_chip_set_vcpu_affinity_parent); /** * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt * @data: Pointer to interrupt specific data diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 4f9f844074db..b95ff5d5f4bd 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -112,6 +112,7 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_AFFINITY_SET), BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING), BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), + BIT_MASK_DESCR(IRQD_AFFINITY_ON_ACTIVATE), BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), BIT_MASK_DESCR(IRQD_CAN_RESERVE), BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK), @@ -120,6 +121,10 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_WAKEUP_STATE), BIT_MASK_DESCR(IRQD_WAKEUP_ARMED), + + BIT_MASK_DESCR(IRQD_DEFAULT_TRIGGER_SET), + + BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX), }; static const struct irq_bit_descr irqdesc_states[] = { diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index b992f88c5613..48006608baf0 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -1,14 +1,31 @@ // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017-2018 Bartosz Golaszewski <brgl@bgdev.pl> + * Copyright (C) 2020 Bartosz Golaszewski <bgolaszewski@baylibre.com> */ -#include <linux/slab.h> -#include <linux/irq_sim.h> #include <linux/irq.h> +#include <linux/irq_sim.h> +#include <linux/irq_work.h> +#include <linux/interrupt.h> +#include <linux/slab.h> + +struct irq_sim_work_ctx { + struct irq_work work; + int irq_base; + unsigned int irq_count; + unsigned long *pending; + struct irq_domain *domain; +}; + +struct irq_sim_irq_ctx { + int irqnum; + bool enabled; + struct irq_sim_work_ctx *work_ctx; +}; struct irq_sim_devres { - struct irq_sim *sim; + struct irq_domain *domain; }; static void irq_sim_irqmask(struct irq_data *data) @@ -36,159 +53,205 @@ static int irq_sim_set_type(struct irq_data *data, unsigned int type) return 0; } +static int irq_sim_get_irqchip_state(struct irq_data *data, + enum irqchip_irq_state which, bool *state) +{ + struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + switch (which) { + case IRQCHIP_STATE_PENDING: + if (irq_ctx->enabled) + *state = test_bit(hwirq, irq_ctx->work_ctx->pending); + break; + default: + return -EINVAL; + } + + return 0; +} + +static int irq_sim_set_irqchip_state(struct irq_data *data, + enum irqchip_irq_state which, bool state) +{ + struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + switch (which) { + case IRQCHIP_STATE_PENDING: + if (irq_ctx->enabled) { + assign_bit(hwirq, irq_ctx->work_ctx->pending, state); + if (state) + irq_work_queue(&irq_ctx->work_ctx->work); + } + break; + default: + return -EINVAL; + } + + return 0; +} + static struct irq_chip irq_sim_irqchip = { - .name = "irq_sim", - .irq_mask = irq_sim_irqmask, - .irq_unmask = irq_sim_irqunmask, - .irq_set_type = irq_sim_set_type, + .name = "irq_sim", + .irq_mask = irq_sim_irqmask, + .irq_unmask = irq_sim_irqunmask, + .irq_set_type = irq_sim_set_type, + .irq_get_irqchip_state = irq_sim_get_irqchip_state, + .irq_set_irqchip_state = irq_sim_set_irqchip_state, }; static void irq_sim_handle_irq(struct irq_work *work) { struct irq_sim_work_ctx *work_ctx; unsigned int offset = 0; - struct irq_sim *sim; int irqnum; work_ctx = container_of(work, struct irq_sim_work_ctx, work); - sim = container_of(work_ctx, struct irq_sim, work_ctx); - while (!bitmap_empty(work_ctx->pending, sim->irq_count)) { + while (!bitmap_empty(work_ctx->pending, work_ctx->irq_count)) { offset = find_next_bit(work_ctx->pending, - sim->irq_count, offset); + work_ctx->irq_count, offset); clear_bit(offset, work_ctx->pending); - irqnum = irq_sim_irqnum(sim, offset); + irqnum = irq_find_mapping(work_ctx->domain, offset); handle_simple_irq(irq_to_desc(irqnum)); } } +static int irq_sim_domain_map(struct irq_domain *domain, + unsigned int virq, irq_hw_number_t hw) +{ + struct irq_sim_work_ctx *work_ctx = domain->host_data; + struct irq_sim_irq_ctx *irq_ctx; + + irq_ctx = kzalloc(sizeof(*irq_ctx), GFP_KERNEL); + if (!irq_ctx) + return -ENOMEM; + + irq_set_chip(virq, &irq_sim_irqchip); + irq_set_chip_data(virq, irq_ctx); + irq_set_handler(virq, handle_simple_irq); + irq_modify_status(virq, IRQ_NOREQUEST | IRQ_NOAUTOEN, IRQ_NOPROBE); + irq_ctx->work_ctx = work_ctx; + + return 0; +} + +static void irq_sim_domain_unmap(struct irq_domain *domain, unsigned int virq) +{ + struct irq_sim_irq_ctx *irq_ctx; + struct irq_data *irqd; + + irqd = irq_domain_get_irq_data(domain, virq); + irq_ctx = irq_data_get_irq_chip_data(irqd); + + irq_set_handler(virq, NULL); + irq_domain_reset_irq_data(irqd); + kfree(irq_ctx); +} + +static const struct irq_domain_ops irq_sim_domain_ops = { + .map = irq_sim_domain_map, + .unmap = irq_sim_domain_unmap, +}; + /** - * irq_sim_init - Initialize the interrupt simulator: allocate a range of - * dummy interrupts. + * irq_domain_create_sim - Create a new interrupt simulator irq_domain and + * allocate a range of dummy interrupts. * - * @sim: The interrupt simulator object to initialize. - * @num_irqs: Number of interrupts to allocate + * @fnode: struct fwnode_handle to be associated with this domain. + * @num_irqs: Number of interrupts to allocate. * - * On success: return the base of the allocated interrupt range. - * On failure: a negative errno. + * On success: return a new irq_domain object. + * On failure: a negative errno wrapped with ERR_PTR(). */ -int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs) +struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode, + unsigned int num_irqs) { - int i; + struct irq_sim_work_ctx *work_ctx; - sim->irqs = kmalloc_array(num_irqs, sizeof(*sim->irqs), GFP_KERNEL); - if (!sim->irqs) - return -ENOMEM; + work_ctx = kmalloc(sizeof(*work_ctx), GFP_KERNEL); + if (!work_ctx) + goto err_out; - sim->irq_base = irq_alloc_descs(-1, 0, num_irqs, 0); - if (sim->irq_base < 0) { - kfree(sim->irqs); - return sim->irq_base; - } + work_ctx->pending = bitmap_zalloc(num_irqs, GFP_KERNEL); + if (!work_ctx->pending) + goto err_free_work_ctx; - sim->work_ctx.pending = bitmap_zalloc(num_irqs, GFP_KERNEL); - if (!sim->work_ctx.pending) { - kfree(sim->irqs); - irq_free_descs(sim->irq_base, num_irqs); - return -ENOMEM; - } + work_ctx->domain = irq_domain_create_linear(fwnode, num_irqs, + &irq_sim_domain_ops, + work_ctx); + if (!work_ctx->domain) + goto err_free_bitmap; - for (i = 0; i < num_irqs; i++) { - sim->irqs[i].irqnum = sim->irq_base + i; - sim->irqs[i].enabled = false; - irq_set_chip(sim->irq_base + i, &irq_sim_irqchip); - irq_set_chip_data(sim->irq_base + i, &sim->irqs[i]); - irq_set_handler(sim->irq_base + i, &handle_simple_irq); - irq_modify_status(sim->irq_base + i, - IRQ_NOREQUEST | IRQ_NOAUTOEN, IRQ_NOPROBE); - } + work_ctx->irq_count = num_irqs; + init_irq_work(&work_ctx->work, irq_sim_handle_irq); - init_irq_work(&sim->work_ctx.work, irq_sim_handle_irq); - sim->irq_count = num_irqs; + return work_ctx->domain; - return sim->irq_base; +err_free_bitmap: + bitmap_free(work_ctx->pending); +err_free_work_ctx: + kfree(work_ctx); +err_out: + return ERR_PTR(-ENOMEM); } -EXPORT_SYMBOL_GPL(irq_sim_init); +EXPORT_SYMBOL_GPL(irq_domain_create_sim); /** - * irq_sim_fini - Deinitialize the interrupt simulator: free the interrupt - * descriptors and allocated memory. + * irq_domain_remove_sim - Deinitialize the interrupt simulator domain: free + * the interrupt descriptors and allocated memory. * - * @sim: The interrupt simulator to tear down. + * @domain: The interrupt simulator domain to tear down. */ -void irq_sim_fini(struct irq_sim *sim) +void irq_domain_remove_sim(struct irq_domain *domain) { - irq_work_sync(&sim->work_ctx.work); - bitmap_free(sim->work_ctx.pending); - irq_free_descs(sim->irq_base, sim->irq_count); - kfree(sim->irqs); + struct irq_sim_work_ctx *work_ctx = domain->host_data; + + irq_work_sync(&work_ctx->work); + bitmap_free(work_ctx->pending); + kfree(work_ctx); + + irq_domain_remove(domain); } -EXPORT_SYMBOL_GPL(irq_sim_fini); +EXPORT_SYMBOL_GPL(irq_domain_remove_sim); -static void devm_irq_sim_release(struct device *dev, void *res) +static void devm_irq_domain_release_sim(struct device *dev, void *res) { struct irq_sim_devres *this = res; - irq_sim_fini(this->sim); + irq_domain_remove_sim(this->domain); } /** - * irq_sim_init - Initialize the interrupt simulator for a managed device. + * devm_irq_domain_create_sim - Create a new interrupt simulator for + * a managed device. * * @dev: Device to initialize the simulator object for. - * @sim: The interrupt simulator object to initialize. + * @fnode: struct fwnode_handle to be associated with this domain. * @num_irqs: Number of interrupts to allocate * - * On success: return the base of the allocated interrupt range. - * On failure: a negative errno. + * On success: return a new irq_domain object. + * On failure: a negative errno wrapped with ERR_PTR(). */ -int devm_irq_sim_init(struct device *dev, struct irq_sim *sim, - unsigned int num_irqs) +struct irq_domain *devm_irq_domain_create_sim(struct device *dev, + struct fwnode_handle *fwnode, + unsigned int num_irqs) { struct irq_sim_devres *dr; - int rv; - dr = devres_alloc(devm_irq_sim_release, sizeof(*dr), GFP_KERNEL); + dr = devres_alloc(devm_irq_domain_release_sim, + sizeof(*dr), GFP_KERNEL); if (!dr) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - rv = irq_sim_init(sim, num_irqs); - if (rv < 0) { + dr->domain = irq_domain_create_sim(fwnode, num_irqs); + if (IS_ERR(dr->domain)) { devres_free(dr); - return rv; + return dr->domain; } - dr->sim = sim; devres_add(dev, dr); - - return rv; -} -EXPORT_SYMBOL_GPL(devm_irq_sim_init); - -/** - * irq_sim_fire - Enqueue an interrupt. - * - * @sim: The interrupt simulator object. - * @offset: Offset of the simulated interrupt which should be fired. - */ -void irq_sim_fire(struct irq_sim *sim, unsigned int offset) -{ - if (sim->irqs[offset].enabled) { - set_bit(offset, sim->work_ctx.pending); - irq_work_queue(&sim->work_ctx.work); - } -} -EXPORT_SYMBOL_GPL(irq_sim_fire); - -/** - * irq_sim_irqnum - Get the allocated number of a dummy interrupt. - * - * @sim: The interrupt simulator object. - * @offset: Offset of the simulated interrupt for which to retrieve - * the number. - */ -int irq_sim_irqnum(struct irq_sim *sim, unsigned int offset) -{ - return sim->irqs[offset].irqnum; + return dr->domain; } -EXPORT_SYMBOL_GPL(irq_sim_irqnum); +EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 35b8d97c3a1d..76cd7ebd1178 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -132,18 +132,17 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, const struct irq_domain_ops *ops, void *host_data) { - struct device_node *of_node = to_of_node(fwnode); struct irqchip_fwid *fwid; struct irq_domain *domain; static atomic_t unknown_domains; domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), - GFP_KERNEL, of_node_to_nid(of_node)); + GFP_KERNEL, of_node_to_nid(to_of_node(fwnode))); if (!domain) return NULL; - if (fwnode && is_fwnode_irqchip(fwnode)) { + if (is_fwnode_irqchip(fwnode)) { fwid = container_of(fwnode, struct irqchip_fwid, fwnode); switch (fwid->type) { @@ -162,30 +161,16 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, domain->name = fwid->name; break; } -#ifdef CONFIG_ACPI - } else if (is_acpi_device_node(fwnode)) { - struct acpi_buffer buf = { - .length = ACPI_ALLOCATE_BUFFER, - }; - acpi_handle handle; - - handle = acpi_device_handle(to_acpi_device_node(fwnode)); - if (acpi_get_name(handle, ACPI_FULL_PATHNAME, &buf) == AE_OK) { - domain->name = buf.pointer; - domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; - } - - domain->fwnode = fwnode; -#endif - } else if (of_node) { + } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode) || + is_software_node(fwnode)) { char *name; /* - * DT paths contain '/', which debugfs is legitimately + * fwnode paths contain '/', which debugfs is legitimately * unhappy about. Replace them with ':', which does * the trick and is not as offensive as '\'... */ - name = kasprintf(GFP_KERNEL, "%pOF", of_node); + name = kasprintf(GFP_KERNEL, "%pfw", fwnode); if (!name) { kfree(domain); return NULL; @@ -210,7 +195,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; } - of_node_get(of_node); + fwnode_handle_get(fwnode); /* Fill structure */ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); @@ -259,7 +244,7 @@ void irq_domain_remove(struct irq_domain *domain) pr_debug("Removed domain %s\n", domain->name); - of_node_put(irq_domain_get_of_node(domain)); + fwnode_handle_put(domain->fwnode); if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED) kfree(domain->name); kfree(domain); @@ -296,6 +281,7 @@ void irq_domain_update_bus_token(struct irq_domain *domain, mutex_unlock(&irq_domain_mutex); } +EXPORT_SYMBOL_GPL(irq_domain_update_bus_token); /** * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs @@ -1047,6 +1033,18 @@ int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, return virq; } +/** + * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data + * @irq_data: The pointer to irq_data + */ +void irq_domain_reset_irq_data(struct irq_data *irq_data) +{ + irq_data->hwirq = 0; + irq_data->chip = &no_irq_chip; + irq_data->chip_data = NULL; +} +EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); + #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /** * irq_domain_create_hierarchy - Add a irqdomain into the hierarchy @@ -1248,18 +1246,6 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, EXPORT_SYMBOL(irq_domain_set_info); /** - * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data - * @irq_data: The pointer to irq_data - */ -void irq_domain_reset_irq_data(struct irq_data *irq_data) -{ - irq_data->hwirq = 0; - irq_data->chip = &no_irq_chip; - irq_data->chip_data = NULL; -} -EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); - -/** * irq_domain_free_irqs_common - Clear irq_data and free the parent * @domain: Interrupt domain to match * @virq: IRQ number to start with diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 453a8a0f4804..d55ba625d426 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -195,9 +195,9 @@ void irq_set_thread_affinity(struct irq_desc *desc) set_bit(IRQTF_AFFINITY, &action->thread_flags); } +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK static void irq_validate_effective_affinity(struct irq_data *data) { -#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK const struct cpumask *m = irq_data_get_effective_affinity_mask(data); struct irq_chip *chip = irq_data_get_irq_chip(data); @@ -205,9 +205,19 @@ static void irq_validate_effective_affinity(struct irq_data *data) return; pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n", chip->name, data->irq); -#endif } +static inline void irq_init_effective_affinity(struct irq_data *data, + const struct cpumask *mask) +{ + cpumask_copy(irq_data_get_effective_affinity_mask(data), mask); +} +#else +static inline void irq_validate_effective_affinity(struct irq_data *data) { } +static inline void irq_init_effective_affinity(struct irq_data *data, + const struct cpumask *mask) { } +#endif + int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -304,6 +314,30 @@ static int irq_try_set_affinity(struct irq_data *data, return ret; } +static bool irq_set_affinity_deactivated(struct irq_data *data, + const struct cpumask *mask, bool force) +{ + struct irq_desc *desc = irq_data_to_desc(data); + + /* + * Handle irq chips which can handle affinity only in activated + * state correctly + * + * If the interrupt is not yet activated, just store the affinity + * mask and do not call the chip driver at all. On activation the + * driver has to make sure anyway that the interrupt is in a + * useable state so startup works. + */ + if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || + irqd_is_activated(data) || !irqd_affinity_on_activate(data)) + return false; + + cpumask_copy(desc->irq_common_data.affinity, mask); + irq_init_effective_affinity(data, mask); + irqd_set(data, IRQD_AFFINITY_SET); + return true; +} + int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -314,6 +348,9 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, if (!chip || !chip->irq_set_affinity) return -EINVAL; + if (irq_set_affinity_deactivated(data, mask, force)) + return 0; + if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) { ret = irq_try_set_affinity(data, mask, force); } else { @@ -1271,9 +1308,6 @@ static int setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) { struct task_struct *t; - struct sched_param param = { - .sched_priority = MAX_USER_RT_PRIO/2, - }; if (!secondary) { t = kthread_create(irq_thread, new, "irq/%d-%s", irq, @@ -1281,13 +1315,12 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) } else { t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq, new->name); - param.sched_priority -= 1; } if (IS_ERR(t)) return PTR_ERR(t); - sched_setscheduler_nocheck(t, SCHED_FIFO, ¶m); + sched_set_fifo(t); /* * We keep the reference to the task struct even if @@ -2619,6 +2652,8 @@ int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, do { chip = irq_data_get_irq_chip(data); + if (WARN_ON_ONCE(!chip)) + return -ENODEV; if (chip->irq_get_irqchip_state) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY @@ -2696,6 +2731,8 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, do { chip = irq_data_get_irq_chip(data); + if (WARN_ON_ONCE(!chip)) + return -ENODEV; if (chip->irq_set_irqchip_state) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 27634f4022d0..c48ce19a257f 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -45,7 +45,7 @@ static void resend_irqs(unsigned long arg) } /* Tasklet to handle resend: */ -static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); +static DECLARE_TASKLET_OLD(resend_tasklet, resend_irqs); static int irq_sw_resend(struct irq_desc *desc) { diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 48b5d1b6af4d..eca83965b631 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -31,7 +31,7 @@ static bool irq_work_claim(struct irq_work *work) { int oflags; - oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags); + oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags); /* * If the work is already pending, no need to raise the IPI. * The pairing atomic_fetch_andnot() in irq_work_run() makes sure @@ -102,8 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (cpu != smp_processor_id()) { /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); - if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) - arch_send_call_function_single_ipi(cpu); + __smp_call_single_queue(cpu, &work->llnode); } else { __irq_work_queue_local(work); } @@ -131,6 +130,31 @@ bool irq_work_needs_cpu(void) return true; } +void irq_work_single(void *arg) +{ + struct irq_work *work = arg; + int flags; + + /* + * Clear the PENDING bit, after this point the @work + * can be re-used. + * Make it immediately visible so that other CPUs trying + * to claim that work don't rely on us to handle their data + * while we are in the middle of the func. + */ + flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); + + lockdep_irq_work_enter(work); + work->func(work); + lockdep_irq_work_exit(work); + /* + * Clear the BUSY bit and return to the free state if + * no-one else claimed it meanwhile. + */ + flags &= ~IRQ_WORK_PENDING; + (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); +} + static void irq_work_run_list(struct llist_head *list) { struct irq_work *work, *tmp; @@ -142,27 +166,8 @@ static void irq_work_run_list(struct llist_head *list) return; llnode = llist_del_all(list); - llist_for_each_entry_safe(work, tmp, llnode, llnode) { - int flags; - /* - * Clear the PENDING bit, after this point the @work - * can be re-used. - * Make it immediately visible so that other CPUs trying - * to claim that work don't rely on us to handle their data - * while we are in the middle of the func. - */ - flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); - - lockdep_irq_work_enter(work); - work->func(work); - lockdep_irq_work_exit(work); - /* - * Clear the BUSY bit and return to the free state if - * no-one else claimed it meanwhile. - */ - flags &= ~IRQ_WORK_PENDING; - (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); - } + llist_for_each_entry_safe(work, tmp, llnode, llnode) + irq_work_single(work); } /* diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 16c8c605f4b0..95cb74f73292 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -24,6 +24,7 @@ #include <linux/slab.h> #include <linux/filter.h> #include <linux/ftrace.h> +#include <linux/kprobes.h> #include <linux/compiler.h> /* @@ -437,6 +438,7 @@ struct kallsym_iter { loff_t pos_arch_end; loff_t pos_mod_end; loff_t pos_ftrace_mod_end; + loff_t pos_bpf_end; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols. */ char type; @@ -480,6 +482,11 @@ static int get_ksymbol_mod(struct kallsym_iter *iter) return 1; } +/* + * ftrace_mod_get_kallsym() may also get symbols for pages allocated for ftrace + * purposes. In that case "__builtin__ftrace" is used as a module name, even + * though "__builtin__ftrace" is not a module. + */ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) { int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end, @@ -496,11 +503,33 @@ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) static int get_ksymbol_bpf(struct kallsym_iter *iter) { + int ret; + strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN); iter->exported = 0; - return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, - &iter->value, &iter->type, - iter->name) < 0 ? 0 : 1; + ret = bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, + &iter->value, &iter->type, + iter->name); + if (ret < 0) { + iter->pos_bpf_end = iter->pos; + return 0; + } + + return 1; +} + +/* + * This uses "__builtin__kprobes" as a module name for symbols for pages + * allocated for kprobes' purposes, even though "__builtin__kprobes" is not a + * module. + */ +static int get_ksymbol_kprobe(struct kallsym_iter *iter) +{ + strlcpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN); + iter->exported = 0; + return kprobe_get_kallsym(iter->pos - iter->pos_bpf_end, + &iter->value, &iter->type, + iter->name) < 0 ? 0 : 1; } /* Returns space to next name. */ @@ -527,6 +556,7 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) iter->pos_arch_end = 0; iter->pos_mod_end = 0; iter->pos_ftrace_mod_end = 0; + iter->pos_bpf_end = 0; } } @@ -551,7 +581,11 @@ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) get_ksymbol_ftrace_mod(iter)) return 1; - return get_ksymbol_bpf(iter); + if ((!iter->pos_bpf_end || iter->pos_bpf_end > pos) && + get_ksymbol_bpf(iter)) + return 1; + + return get_ksymbol_kprobe(iter); } /* Returns false if pos at or past end of file. */ @@ -644,19 +678,20 @@ static inline int kallsyms_for_perf(void) * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to * block even that). */ -int kallsyms_show_value(void) +bool kallsyms_show_value(const struct cred *cred) { switch (kptr_restrict) { case 0: if (kallsyms_for_perf()) - return 1; + return true; /* fallthrough */ case 1: - if (has_capability_noaudit(current, CAP_SYSLOG)) - return 1; + if (security_capable(cred, &init_user_ns, CAP_SYSLOG, + CAP_OPT_NOAUDIT) == 0) + return true; /* fallthrough */ default: - return 0; + return false; } } @@ -673,7 +708,11 @@ static int kallsyms_open(struct inode *inode, struct file *file) return -ENOMEM; reset_iter(iter, 0); - iter->show_value = kallsyms_show_value(); + /* + * Instead of checking this on every s_show() call, cache + * the result here at open time. + */ + iter->show_value = kallsyms_show_value(file->f_cred); return 0; } diff --git a/kernel/kcov.c b/kernel/kcov.c index 8accc9722a81..6afae0bcbac4 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -86,6 +86,18 @@ static DEFINE_SPINLOCK(kcov_remote_lock); static DEFINE_HASHTABLE(kcov_remote_map, 4); static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas); +struct kcov_percpu_data { + void *irq_area; + + unsigned int saved_mode; + unsigned int saved_size; + void *saved_area; + struct kcov *saved_kcov; + int saved_sequence; +}; + +DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data); + /* Must be called with kcov_remote_lock locked. */ static struct kcov_remote *kcov_remote_find(u64 handle) { @@ -98,6 +110,7 @@ static struct kcov_remote *kcov_remote_find(u64 handle) return NULL; } +/* Must be called with kcov_remote_lock locked. */ static struct kcov_remote *kcov_remote_add(struct kcov *kcov, u64 handle) { struct kcov_remote *remote; @@ -119,16 +132,13 @@ static struct kcov_remote_area *kcov_remote_area_get(unsigned int size) struct kcov_remote_area *area; struct list_head *pos; - kcov_debug("size = %u\n", size); list_for_each(pos, &kcov_remote_areas) { area = list_entry(pos, struct kcov_remote_area, list); if (area->size == size) { list_del(&area->list); - kcov_debug("rv = %px\n", area); return area; } } - kcov_debug("rv = NULL\n"); return NULL; } @@ -136,7 +146,6 @@ static struct kcov_remote_area *kcov_remote_area_get(unsigned int size) static void kcov_remote_area_put(struct kcov_remote_area *area, unsigned int size) { - kcov_debug("area = %px, size = %u\n", area, size); INIT_LIST_HEAD(&area->list); area->size = size; list_add(&area->list, &kcov_remote_areas); @@ -148,9 +157,10 @@ static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_stru /* * We are interested in code coverage as a function of a syscall inputs, - * so we ignore code executed in interrupts. + * so we ignore code executed in interrupts, unless we are in a remote + * coverage collection section in a softirq. */ - if (!in_task()) + if (!in_task() && !(in_serving_softirq() && t->kcov_softirq)) return false; mode = READ_ONCE(t->kcov_mode); /* @@ -312,23 +322,26 @@ void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases) EXPORT_SYMBOL(__sanitizer_cov_trace_switch); #endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */ -static void kcov_start(struct task_struct *t, unsigned int size, - void *area, enum kcov_mode mode, int sequence) +static void kcov_start(struct task_struct *t, struct kcov *kcov, + unsigned int size, void *area, enum kcov_mode mode, + int sequence) { kcov_debug("t = %px, size = %u, area = %px\n", t, size, area); + t->kcov = kcov; /* Cache in task struct for performance. */ t->kcov_size = size; t->kcov_area = area; + t->kcov_sequence = sequence; /* See comment in check_kcov_mode(). */ barrier(); WRITE_ONCE(t->kcov_mode, mode); - t->kcov_sequence = sequence; } static void kcov_stop(struct task_struct *t) { WRITE_ONCE(t->kcov_mode, KCOV_MODE_DISABLED); barrier(); + t->kcov = NULL; t->kcov_size = 0; t->kcov_area = NULL; } @@ -336,7 +349,6 @@ static void kcov_stop(struct task_struct *t) static void kcov_task_reset(struct task_struct *t) { kcov_stop(t); - t->kcov = NULL; t->kcov_sequence = 0; t->kcov_handle = 0; } @@ -361,18 +373,18 @@ static void kcov_remote_reset(struct kcov *kcov) int bkt; struct kcov_remote *remote; struct hlist_node *tmp; + unsigned long flags; - spin_lock(&kcov_remote_lock); + spin_lock_irqsave(&kcov_remote_lock, flags); hash_for_each_safe(kcov_remote_map, bkt, tmp, remote, hnode) { if (remote->kcov != kcov) continue; - kcov_debug("removing handle %llx\n", remote->handle); hash_del(&remote->hnode); kfree(remote); } /* Do reset before unlock to prevent races with kcov_remote_start(). */ kcov_reset(kcov); - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); } static void kcov_disable(struct task_struct *t, struct kcov *kcov) @@ -401,12 +413,13 @@ static void kcov_put(struct kcov *kcov) void kcov_task_exit(struct task_struct *t) { struct kcov *kcov; + unsigned long flags; kcov = t->kcov; if (kcov == NULL) return; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); kcov_debug("t = %px, kcov->t = %px\n", t, kcov->t); /* * For KCOV_ENABLE devices we want to make sure that t->kcov->t == t, @@ -414,7 +427,8 @@ void kcov_task_exit(struct task_struct *t) * WARN_ON(!kcov->remote && kcov->t != t); * * For KCOV_REMOTE_ENABLE devices, the exiting task is either: - * 2. A remote task between kcov_remote_start() and kcov_remote_stop(). + * + * 1. A remote task between kcov_remote_start() and kcov_remote_stop(). * In this case we should print a warning right away, since a task * shouldn't be exiting when it's in a kcov coverage collection * section. Here t points to the task that is collecting remote @@ -424,18 +438,18 @@ void kcov_task_exit(struct task_struct *t) * WARN_ON(kcov->remote && kcov->t != t); * * 2. The task that created kcov exiting without calling KCOV_DISABLE, - * and then again we can make sure that t->kcov->t == t: + * and then again we make sure that t->kcov->t == t: * WARN_ON(kcov->remote && kcov->t != t); * * By combining all three checks into one we get: */ if (WARN_ON(kcov->t != t)) { - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); return; } /* Just to not leave dangling references behind. */ kcov_disable(t, kcov); - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); kcov_put(kcov); } @@ -446,12 +460,13 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) struct kcov *kcov = vma->vm_file->private_data; unsigned long size, off; struct page *page; + unsigned long flags; area = vmalloc_user(vma->vm_end - vma->vm_start); if (!area) return -ENOMEM; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); size = kcov->size * sizeof(unsigned long); if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != size) { @@ -461,7 +476,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) if (!kcov->area) { kcov->area = area; vma->vm_flags |= VM_DONTEXPAND; - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); for (off = 0; off < size; off += PAGE_SIZE) { page = vmalloc_to_page(kcov->area + off); if (vm_insert_page(vma, vma->vm_start + off, page)) @@ -470,7 +485,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) return 0; } exit: - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); vfree(area); return res; } @@ -550,10 +565,10 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, int mode, i; struct kcov_remote_arg *remote_arg; struct kcov_remote *remote; + unsigned long flags; switch (cmd) { case KCOV_INIT_TRACE: - kcov_debug("KCOV_INIT_TRACE\n"); /* * Enable kcov in trace mode and setup buffer size. * Must happen before anything else. @@ -572,7 +587,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov->mode = KCOV_MODE_INIT; return 0; case KCOV_ENABLE: - kcov_debug("KCOV_ENABLE\n"); /* * Enable coverage for the current task. * At this point user must have been enabled trace mode, @@ -590,15 +604,13 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, return mode; kcov_fault_in_area(kcov); kcov->mode = mode; - kcov_start(t, kcov->size, kcov->area, kcov->mode, + kcov_start(t, kcov, kcov->size, kcov->area, kcov->mode, kcov->sequence); - t->kcov = kcov; kcov->t = t; /* Put either in kcov_task_exit() or in KCOV_DISABLE. */ kcov_get(kcov); return 0; case KCOV_DISABLE: - kcov_debug("KCOV_DISABLE\n"); /* Disable coverage for the current task. */ unused = arg; if (unused != 0 || current->kcov != kcov) @@ -610,7 +622,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov_put(kcov); return 0; case KCOV_REMOTE_ENABLE: - kcov_debug("KCOV_REMOTE_ENABLE\n"); if (kcov->mode != KCOV_MODE_INIT || !kcov->area) return -EINVAL; t = current; @@ -627,41 +638,42 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov->t = t; kcov->remote = true; kcov->remote_size = remote_arg->area_size; - spin_lock(&kcov_remote_lock); + spin_lock_irqsave(&kcov_remote_lock, flags); for (i = 0; i < remote_arg->num_handles; i++) { - kcov_debug("handle %llx\n", remote_arg->handles[i]); if (!kcov_check_handle(remote_arg->handles[i], false, true, false)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return -EINVAL; } remote = kcov_remote_add(kcov, remote_arg->handles[i]); if (IS_ERR(remote)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return PTR_ERR(remote); } } if (remote_arg->common_handle) { - kcov_debug("common handle %llx\n", - remote_arg->common_handle); if (!kcov_check_handle(remote_arg->common_handle, true, false, false)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return -EINVAL; } remote = kcov_remote_add(kcov, remote_arg->common_handle); if (IS_ERR(remote)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return PTR_ERR(remote); } t->kcov_handle = remote_arg->common_handle; } - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); /* Put either in kcov_task_exit() or in KCOV_DISABLE. */ kcov_get(kcov); return 0; @@ -677,6 +689,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) struct kcov_remote_arg *remote_arg = NULL; unsigned int remote_num_handles; unsigned long remote_arg_size; + unsigned long flags; if (cmd == KCOV_REMOTE_ENABLE) { if (get_user(remote_num_handles, (unsigned __user *)(arg + @@ -697,9 +710,9 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) } kcov = filep->private_data; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); res = kcov_ioctl_locked(kcov, cmd, arg); - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); kfree(remote_arg); @@ -716,8 +729,8 @@ static const struct file_operations kcov_fops = { /* * kcov_remote_start() and kcov_remote_stop() can be used to annotate a section - * of code in a kernel background thread to allow kcov to be used to collect - * coverage from that part of code. + * of code in a kernel background thread or in a softirq to allow kcov to be + * used to collect coverage from that part of code. * * The handle argument of kcov_remote_start() identifies a code section that is * used for coverage collection. A userspace process passes this handle to @@ -728,9 +741,9 @@ static const struct file_operations kcov_fops = { * the type of the kernel thread whose code is being annotated. * * For global kernel threads that are spawned in a limited number of instances - * (e.g. one USB hub_event() worker thread is spawned per USB HCD), each - * instance must be assigned a unique 4-byte instance id. The instance id is - * then combined with a 1-byte subsystem id to get a handle via + * (e.g. one USB hub_event() worker thread is spawned per USB HCD) and for + * softirqs, each instance must be assigned a unique 4-byte instance id. The + * instance id is then combined with a 1-byte subsystem id to get a handle via * kcov_remote_handle(subsystem_id, instance_id). * * For local kernel threads that are spawned from system calls handler when a @@ -749,70 +762,136 @@ static const struct file_operations kcov_fops = { * * See Documentation/dev-tools/kcov.rst for more details. * - * Internally, this function looks up the kcov device associated with the + * Internally, kcov_remote_start() looks up the kcov device associated with the * provided handle, allocates an area for coverage collection, and saves the * pointers to kcov and area into the current task_struct to allow coverage to - * be collected via __sanitizer_cov_trace_pc() + * be collected via __sanitizer_cov_trace_pc(). * In turns kcov_remote_stop() clears those pointers from task_struct to stop * collecting coverage and copies all collected coverage into the kcov area. */ + +static inline bool kcov_mode_enabled(unsigned int mode) +{ + return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED; +} + +void kcov_remote_softirq_start(struct task_struct *t) +{ + struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); + unsigned int mode; + + mode = READ_ONCE(t->kcov_mode); + barrier(); + if (kcov_mode_enabled(mode)) { + data->saved_mode = mode; + data->saved_size = t->kcov_size; + data->saved_area = t->kcov_area; + data->saved_sequence = t->kcov_sequence; + data->saved_kcov = t->kcov; + kcov_stop(t); + } +} + +void kcov_remote_softirq_stop(struct task_struct *t) +{ + struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); + + if (data->saved_kcov) { + kcov_start(t, data->saved_kcov, data->saved_size, + data->saved_area, data->saved_mode, + data->saved_sequence); + data->saved_mode = 0; + data->saved_size = 0; + data->saved_area = NULL; + data->saved_sequence = 0; + data->saved_kcov = NULL; + } +} + void kcov_remote_start(u64 handle) { + struct task_struct *t = current; struct kcov_remote *remote; + struct kcov *kcov; + unsigned int mode; void *area; - struct task_struct *t; unsigned int size; - enum kcov_mode mode; int sequence; + unsigned long flags; if (WARN_ON(!kcov_check_handle(handle, true, true, true))) return; - if (WARN_ON(!in_task())) + if (!in_task() && !in_serving_softirq()) return; - t = current; + + local_irq_save(flags); + /* - * Check that kcov_remote_start is not called twice - * nor called by user tasks (with enabled kcov). + * Check that kcov_remote_start() is not called twice in background + * threads nor called by user tasks (with enabled kcov). */ - if (WARN_ON(t->kcov)) + mode = READ_ONCE(t->kcov_mode); + if (WARN_ON(in_task() && kcov_mode_enabled(mode))) { + local_irq_restore(flags); return; - - kcov_debug("handle = %llx\n", handle); + } + /* + * Check that kcov_remote_start() is not called twice in softirqs. + * Note, that kcov_remote_start() can be called from a softirq that + * happened while collecting coverage from a background thread. + */ + if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) { + local_irq_restore(flags); + return; + } spin_lock(&kcov_remote_lock); remote = kcov_remote_find(handle); if (!remote) { - kcov_debug("no remote found"); - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); return; } + kcov_debug("handle = %llx, context: %s\n", handle, + in_task() ? "task" : "softirq"); + kcov = remote->kcov; /* Put in kcov_remote_stop(). */ - kcov_get(remote->kcov); - t->kcov = remote->kcov; + kcov_get(kcov); /* * Read kcov fields before unlock to prevent races with * KCOV_DISABLE / kcov_remote_reset(). */ - size = remote->kcov->remote_size; - mode = remote->kcov->mode; - sequence = remote->kcov->sequence; - area = kcov_remote_area_get(size); - spin_unlock(&kcov_remote_lock); + mode = kcov->mode; + sequence = kcov->sequence; + if (in_task()) { + size = kcov->remote_size; + area = kcov_remote_area_get(size); + } else { + size = CONFIG_KCOV_IRQ_AREA_SIZE; + area = this_cpu_ptr(&kcov_percpu_data)->irq_area; + } + spin_unlock_irqrestore(&kcov_remote_lock, flags); + /* Can only happen when in_task(). */ if (!area) { area = vmalloc(size * sizeof(unsigned long)); if (!area) { - t->kcov = NULL; - kcov_put(remote->kcov); + kcov_put(kcov); return; } } + + local_irq_save(flags); + /* Reset coverage size. */ *(u64 *)area = 0; - kcov_debug("area = %px, size = %u", area, size); + if (in_serving_softirq()) { + kcov_remote_softirq_start(t); + t->kcov_softirq = 1; + } + kcov_start(t, kcov, size, area, mode, sequence); - kcov_start(t, size, area, mode, sequence); + local_irq_restore(flags); } EXPORT_SYMBOL(kcov_remote_start); @@ -876,34 +955,67 @@ static void kcov_move_area(enum kcov_mode mode, void *dst_area, void kcov_remote_stop(void) { struct task_struct *t = current; - struct kcov *kcov = t->kcov; - void *area = t->kcov_area; - unsigned int size = t->kcov_size; - int sequence = t->kcov_sequence; + struct kcov *kcov; + unsigned int mode; + void *area; + unsigned int size; + int sequence; + unsigned long flags; - if (!kcov) { - kcov_debug("no kcov found\n"); + if (!in_task() && !in_serving_softirq()) + return; + + local_irq_save(flags); + + mode = READ_ONCE(t->kcov_mode); + barrier(); + if (!kcov_mode_enabled(mode)) { + local_irq_restore(flags); + return; + } + /* + * When in softirq, check if the corresponding kcov_remote_start() + * actually found the remote handle and started collecting coverage. + */ + if (in_serving_softirq() && !t->kcov_softirq) { + local_irq_restore(flags); + return; + } + /* Make sure that kcov_softirq is only set when in softirq. */ + if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) { + local_irq_restore(flags); return; } + kcov = t->kcov; + area = t->kcov_area; + size = t->kcov_size; + sequence = t->kcov_sequence; + kcov_stop(t); - t->kcov = NULL; + if (in_serving_softirq()) { + t->kcov_softirq = 0; + kcov_remote_softirq_stop(t); + } spin_lock(&kcov->lock); /* * KCOV_DISABLE could have been called between kcov_remote_start() - * and kcov_remote_stop(), hence the check. + * and kcov_remote_stop(), hence the sequence check. */ - kcov_debug("move if: %d == %d && %d\n", - sequence, kcov->sequence, (int)kcov->remote); if (sequence == kcov->sequence && kcov->remote) kcov_move_area(kcov->mode, kcov->area, kcov->size, area); spin_unlock(&kcov->lock); - spin_lock(&kcov_remote_lock); - kcov_remote_area_put(area, size); - spin_unlock(&kcov_remote_lock); + if (in_task()) { + spin_lock(&kcov_remote_lock); + kcov_remote_area_put(area, size); + spin_unlock(&kcov_remote_lock); + } + + local_irq_restore(flags); + /* Get in kcov_remote_start(). */ kcov_put(kcov); } EXPORT_SYMBOL(kcov_remote_stop); @@ -917,6 +1029,16 @@ EXPORT_SYMBOL(kcov_common_handle); static int __init kcov_init(void) { + int cpu; + + for_each_possible_cpu(cpu) { + void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE * + sizeof(unsigned long)); + if (!area) + return -ENOMEM; + per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area; + } + /* * The kcov debugfs file won't ever get removed and thus, * there is no need to protect it against removal races. The diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile new file mode 100644 index 000000000000..65ca5539c470 --- /dev/null +++ b/kernel/kcsan/Makefile @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0 +KCSAN_SANITIZE := n +KCOV_INSTRUMENT := n +UBSAN_SANITIZE := n + +CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE) + +CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \ + -fno-stack-protector -DDISABLE_BRANCH_PROFILING + +obj-y := core.o debugfs.o report.o +obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o + +CFLAGS_kcsan-test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer +obj-$(CONFIG_KCSAN_TEST) += kcsan-test.o diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h new file mode 100644 index 000000000000..75fe701f4127 --- /dev/null +++ b/kernel/kcsan/atomic.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _KERNEL_KCSAN_ATOMIC_H +#define _KERNEL_KCSAN_ATOMIC_H + +#include <linux/types.h> + +/* + * Special rules for certain memory where concurrent conflicting accesses are + * common, however, the current convention is to not mark them; returns true if + * access to @ptr should be considered atomic. Called from slow-path. + */ +static bool kcsan_is_atomic_special(const volatile void *ptr) +{ + return false; +} + +#endif /* _KERNEL_KCSAN_ATOMIC_H */ diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c new file mode 100644 index 000000000000..9147ff6a12e5 --- /dev/null +++ b/kernel/kcsan/core.c @@ -0,0 +1,881 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/atomic.h> +#include <linux/bug.h> +#include <linux/delay.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/moduleparam.h> +#include <linux/percpu.h> +#include <linux/preempt.h> +#include <linux/random.h> +#include <linux/sched.h> +#include <linux/uaccess.h> + +#include "atomic.h" +#include "encoding.h" +#include "kcsan.h" + +static bool kcsan_early_enable = IS_ENABLED(CONFIG_KCSAN_EARLY_ENABLE); +unsigned int kcsan_udelay_task = CONFIG_KCSAN_UDELAY_TASK; +unsigned int kcsan_udelay_interrupt = CONFIG_KCSAN_UDELAY_INTERRUPT; +static long kcsan_skip_watch = CONFIG_KCSAN_SKIP_WATCH; +static bool kcsan_interrupt_watcher = IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER); + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "kcsan." +module_param_named(early_enable, kcsan_early_enable, bool, 0); +module_param_named(udelay_task, kcsan_udelay_task, uint, 0644); +module_param_named(udelay_interrupt, kcsan_udelay_interrupt, uint, 0644); +module_param_named(skip_watch, kcsan_skip_watch, long, 0644); +module_param_named(interrupt_watcher, kcsan_interrupt_watcher, bool, 0444); + +bool kcsan_enabled; + +/* Per-CPU kcsan_ctx for interrupts */ +static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { + .disable_count = 0, + .atomic_next = 0, + .atomic_nest_count = 0, + .in_flat_atomic = false, + .access_mask = 0, + .scoped_accesses = {LIST_POISON1, NULL}, +}; + +/* + * Helper macros to index into adjacent slots, starting from address slot + * itself, followed by the right and left slots. + * + * The purpose is 2-fold: + * + * 1. if during insertion the address slot is already occupied, check if + * any adjacent slots are free; + * 2. accesses that straddle a slot boundary due to size that exceeds a + * slot's range may check adjacent slots if any watchpoint matches. + * + * Note that accesses with very large size may still miss a watchpoint; however, + * given this should be rare, this is a reasonable trade-off to make, since this + * will avoid: + * + * 1. excessive contention between watchpoint checks and setup; + * 2. larger number of simultaneous watchpoints without sacrificing + * performance. + * + * Example: SLOT_IDX values for KCSAN_CHECK_ADJACENT=1, where i is [0, 1, 2]: + * + * slot=0: [ 1, 2, 0] + * slot=9: [10, 11, 9] + * slot=63: [64, 65, 63] + */ +#define SLOT_IDX(slot, i) (slot + ((i + KCSAN_CHECK_ADJACENT) % NUM_SLOTS)) + +/* + * SLOT_IDX_FAST is used in the fast-path. Not first checking the address's primary + * slot (middle) is fine if we assume that races occur rarely. The set of + * indices {SLOT_IDX(slot, i) | i in [0, NUM_SLOTS)} is equivalent to + * {SLOT_IDX_FAST(slot, i) | i in [0, NUM_SLOTS)}. + */ +#define SLOT_IDX_FAST(slot, i) (slot + i) + +/* + * Watchpoints, with each entry encoded as defined in encoding.h: in order to be + * able to safely update and access a watchpoint without introducing locking + * overhead, we encode each watchpoint as a single atomic long. The initial + * zero-initialized state matches INVALID_WATCHPOINT. + * + * Add NUM_SLOTS-1 entries to account for overflow; this helps avoid having to + * use more complicated SLOT_IDX_FAST calculation with modulo in the fast-path. + */ +static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1]; + +/* + * Instructions to skip watching counter, used in should_watch(). We use a + * per-CPU counter to avoid excessive contention. + */ +static DEFINE_PER_CPU(long, kcsan_skip); + +static __always_inline atomic_long_t *find_watchpoint(unsigned long addr, + size_t size, + bool expect_write, + long *encoded_watchpoint) +{ + const int slot = watchpoint_slot(addr); + const unsigned long addr_masked = addr & WATCHPOINT_ADDR_MASK; + atomic_long_t *watchpoint; + unsigned long wp_addr_masked; + size_t wp_size; + bool is_write; + int i; + + BUILD_BUG_ON(CONFIG_KCSAN_NUM_WATCHPOINTS < NUM_SLOTS); + + for (i = 0; i < NUM_SLOTS; ++i) { + watchpoint = &watchpoints[SLOT_IDX_FAST(slot, i)]; + *encoded_watchpoint = atomic_long_read(watchpoint); + if (!decode_watchpoint(*encoded_watchpoint, &wp_addr_masked, + &wp_size, &is_write)) + continue; + + if (expect_write && !is_write) + continue; + + /* Check if the watchpoint matches the access. */ + if (matching_access(wp_addr_masked, wp_size, addr_masked, size)) + return watchpoint; + } + + return NULL; +} + +static inline atomic_long_t * +insert_watchpoint(unsigned long addr, size_t size, bool is_write) +{ + const int slot = watchpoint_slot(addr); + const long encoded_watchpoint = encode_watchpoint(addr, size, is_write); + atomic_long_t *watchpoint; + int i; + + /* Check slot index logic, ensuring we stay within array bounds. */ + BUILD_BUG_ON(SLOT_IDX(0, 0) != KCSAN_CHECK_ADJACENT); + BUILD_BUG_ON(SLOT_IDX(0, KCSAN_CHECK_ADJACENT+1) != 0); + BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS-1, KCSAN_CHECK_ADJACENT) != ARRAY_SIZE(watchpoints)-1); + BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS-1, KCSAN_CHECK_ADJACENT+1) != ARRAY_SIZE(watchpoints) - NUM_SLOTS); + + for (i = 0; i < NUM_SLOTS; ++i) { + long expect_val = INVALID_WATCHPOINT; + + /* Try to acquire this slot. */ + watchpoint = &watchpoints[SLOT_IDX(slot, i)]; + if (atomic_long_try_cmpxchg_relaxed(watchpoint, &expect_val, encoded_watchpoint)) + return watchpoint; + } + + return NULL; +} + +/* + * Return true if watchpoint was successfully consumed, false otherwise. + * + * This may return false if: + * + * 1. another thread already consumed the watchpoint; + * 2. the thread that set up the watchpoint already removed it; + * 3. the watchpoint was removed and then re-used. + */ +static __always_inline bool +try_consume_watchpoint(atomic_long_t *watchpoint, long encoded_watchpoint) +{ + return atomic_long_try_cmpxchg_relaxed(watchpoint, &encoded_watchpoint, CONSUMED_WATCHPOINT); +} + +/* Return true if watchpoint was not touched, false if already consumed. */ +static inline bool consume_watchpoint(atomic_long_t *watchpoint) +{ + return atomic_long_xchg_relaxed(watchpoint, CONSUMED_WATCHPOINT) != CONSUMED_WATCHPOINT; +} + +/* Remove the watchpoint -- its slot may be reused after. */ +static inline void remove_watchpoint(atomic_long_t *watchpoint) +{ + atomic_long_set(watchpoint, INVALID_WATCHPOINT); +} + +static __always_inline struct kcsan_ctx *get_ctx(void) +{ + /* + * In interrupts, use raw_cpu_ptr to avoid unnecessary checks, that would + * also result in calls that generate warnings in uaccess regions. + */ + return in_task() ? ¤t->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx); +} + +/* Check scoped accesses; never inline because this is a slow-path! */ +static noinline void kcsan_check_scoped_accesses(void) +{ + struct kcsan_ctx *ctx = get_ctx(); + struct list_head *prev_save = ctx->scoped_accesses.prev; + struct kcsan_scoped_access *scoped_access; + + ctx->scoped_accesses.prev = NULL; /* Avoid recursion. */ + list_for_each_entry(scoped_access, &ctx->scoped_accesses, list) + __kcsan_check_access(scoped_access->ptr, scoped_access->size, scoped_access->type); + ctx->scoped_accesses.prev = prev_save; +} + +/* Rules for generic atomic accesses. Called from fast-path. */ +static __always_inline bool +is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx) +{ + if (type & KCSAN_ACCESS_ATOMIC) + return true; + + /* + * Unless explicitly declared atomic, never consider an assertion access + * as atomic. This allows using them also in atomic regions, such as + * seqlocks, without implicitly changing their semantics. + */ + if (type & KCSAN_ACCESS_ASSERT) + return false; + + if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) && + (type & KCSAN_ACCESS_WRITE) && size <= sizeof(long) && + IS_ALIGNED((unsigned long)ptr, size)) + return true; /* Assume aligned writes up to word size are atomic. */ + + if (ctx->atomic_next > 0) { + /* + * Because we do not have separate contexts for nested + * interrupts, in case atomic_next is set, we simply assume that + * the outer interrupt set atomic_next. In the worst case, we + * will conservatively consider operations as atomic. This is a + * reasonable trade-off to make, since this case should be + * extremely rare; however, even if extremely rare, it could + * lead to false positives otherwise. + */ + if ((hardirq_count() >> HARDIRQ_SHIFT) < 2) + --ctx->atomic_next; /* in task, or outer interrupt */ + return true; + } + + return ctx->atomic_nest_count > 0 || ctx->in_flat_atomic; +} + +static __always_inline bool +should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx) +{ + /* + * Never set up watchpoints when memory operations are atomic. + * + * Need to check this first, before kcsan_skip check below: (1) atomics + * should not count towards skipped instructions, and (2) to actually + * decrement kcsan_atomic_next for consecutive instruction stream. + */ + if (is_atomic(ptr, size, type, ctx)) + return false; + + if (this_cpu_dec_return(kcsan_skip) >= 0) + return false; + + /* + * NOTE: If we get here, kcsan_skip must always be reset in slow path + * via reset_kcsan_skip() to avoid underflow. + */ + + /* this operation should be watched */ + return true; +} + +static inline void reset_kcsan_skip(void) +{ + long skip_count = kcsan_skip_watch - + (IS_ENABLED(CONFIG_KCSAN_SKIP_WATCH_RANDOMIZE) ? + prandom_u32_max(kcsan_skip_watch) : + 0); + this_cpu_write(kcsan_skip, skip_count); +} + +static __always_inline bool kcsan_is_enabled(void) +{ + return READ_ONCE(kcsan_enabled) && get_ctx()->disable_count == 0; +} + +static inline unsigned int get_delay(void) +{ + unsigned int delay = in_task() ? kcsan_udelay_task : kcsan_udelay_interrupt; + return delay - (IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ? + prandom_u32_max(delay) : + 0); +} + +void kcsan_save_irqtrace(struct task_struct *task) +{ +#ifdef CONFIG_TRACE_IRQFLAGS + task->kcsan_save_irqtrace = task->irqtrace; +#endif +} + +void kcsan_restore_irqtrace(struct task_struct *task) +{ +#ifdef CONFIG_TRACE_IRQFLAGS + task->irqtrace = task->kcsan_save_irqtrace; +#endif +} + +/* + * Pull everything together: check_access() below contains the performance + * critical operations; the fast-path (including check_access) functions should + * all be inlinable by the instrumentation functions. + * + * The slow-path (kcsan_found_watchpoint, kcsan_setup_watchpoint) are + * non-inlinable -- note that, we prefix these with "kcsan_" to ensure they can + * be filtered from the stacktrace, as well as give them unique names for the + * UACCESS whitelist of objtool. Each function uses user_access_save/restore(), + * since they do not access any user memory, but instrumentation is still + * emitted in UACCESS regions. + */ + +static noinline void kcsan_found_watchpoint(const volatile void *ptr, + size_t size, + int type, + atomic_long_t *watchpoint, + long encoded_watchpoint) +{ + unsigned long flags; + bool consumed; + + if (!kcsan_is_enabled()) + return; + + /* + * The access_mask check relies on value-change comparison. To avoid + * reporting a race where e.g. the writer set up the watchpoint, but the + * reader has access_mask!=0, we have to ignore the found watchpoint. + */ + if (get_ctx()->access_mask != 0) + return; + + /* + * Consume the watchpoint as soon as possible, to minimize the chances + * of !consumed. Consuming the watchpoint must always be guarded by + * kcsan_is_enabled() check, as otherwise we might erroneously + * triggering reports when disabled. + */ + consumed = try_consume_watchpoint(watchpoint, encoded_watchpoint); + + /* keep this after try_consume_watchpoint */ + flags = user_access_save(); + + if (consumed) { + kcsan_save_irqtrace(current); + kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_MAYBE, + KCSAN_REPORT_CONSUMED_WATCHPOINT, + watchpoint - watchpoints); + kcsan_restore_irqtrace(current); + } else { + /* + * The other thread may not print any diagnostics, as it has + * already removed the watchpoint, or another thread consumed + * the watchpoint before this thread. + */ + kcsan_counter_inc(KCSAN_COUNTER_REPORT_RACES); + } + + if ((type & KCSAN_ACCESS_ASSERT) != 0) + kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + else + kcsan_counter_inc(KCSAN_COUNTER_DATA_RACES); + + user_access_restore(flags); +} + +static noinline void +kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) +{ + const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0; + const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0; + atomic_long_t *watchpoint; + union { + u8 _1; + u16 _2; + u32 _4; + u64 _8; + } expect_value; + unsigned long access_mask; + enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE; + unsigned long ua_flags = user_access_save(); + unsigned long irq_flags = 0; + + /* + * Always reset kcsan_skip counter in slow-path to avoid underflow; see + * should_watch(). + */ + reset_kcsan_skip(); + + if (!kcsan_is_enabled()) + goto out; + + /* + * Special atomic rules: unlikely to be true, so we check them here in + * the slow-path, and not in the fast-path in is_atomic(). Call after + * kcsan_is_enabled(), as we may access memory that is not yet + * initialized during early boot. + */ + if (!is_assert && kcsan_is_atomic_special(ptr)) + goto out; + + if (!check_encodable((unsigned long)ptr, size)) { + kcsan_counter_inc(KCSAN_COUNTER_UNENCODABLE_ACCESSES); + goto out; + } + + /* + * Save and restore the IRQ state trace touched by KCSAN, since KCSAN's + * runtime is entered for every memory access, and potentially useful + * information is lost if dirtied by KCSAN. + */ + kcsan_save_irqtrace(current); + if (!kcsan_interrupt_watcher) + local_irq_save(irq_flags); + + watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write); + if (watchpoint == NULL) { + /* + * Out of capacity: the size of 'watchpoints', and the frequency + * with which should_watch() returns true should be tweaked so + * that this case happens very rarely. + */ + kcsan_counter_inc(KCSAN_COUNTER_NO_CAPACITY); + goto out_unlock; + } + + kcsan_counter_inc(KCSAN_COUNTER_SETUP_WATCHPOINTS); + kcsan_counter_inc(KCSAN_COUNTER_USED_WATCHPOINTS); + + /* + * Read the current value, to later check and infer a race if the data + * was modified via a non-instrumented access, e.g. from a device. + */ + expect_value._8 = 0; + switch (size) { + case 1: + expect_value._1 = READ_ONCE(*(const u8 *)ptr); + break; + case 2: + expect_value._2 = READ_ONCE(*(const u16 *)ptr); + break; + case 4: + expect_value._4 = READ_ONCE(*(const u32 *)ptr); + break; + case 8: + expect_value._8 = READ_ONCE(*(const u64 *)ptr); + break; + default: + break; /* ignore; we do not diff the values */ + } + + if (IS_ENABLED(CONFIG_KCSAN_DEBUG)) { + kcsan_disable_current(); + pr_err("KCSAN: watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n", + is_write ? "write" : "read", size, ptr, + watchpoint_slot((unsigned long)ptr), + encode_watchpoint((unsigned long)ptr, size, is_write)); + kcsan_enable_current(); + } + + /* + * Delay this thread, to increase probability of observing a racy + * conflicting access. + */ + udelay(get_delay()); + + /* + * Re-read value, and check if it is as expected; if not, we infer a + * racy access. + */ + access_mask = get_ctx()->access_mask; + switch (size) { + case 1: + expect_value._1 ^= READ_ONCE(*(const u8 *)ptr); + if (access_mask) + expect_value._1 &= (u8)access_mask; + break; + case 2: + expect_value._2 ^= READ_ONCE(*(const u16 *)ptr); + if (access_mask) + expect_value._2 &= (u16)access_mask; + break; + case 4: + expect_value._4 ^= READ_ONCE(*(const u32 *)ptr); + if (access_mask) + expect_value._4 &= (u32)access_mask; + break; + case 8: + expect_value._8 ^= READ_ONCE(*(const u64 *)ptr); + if (access_mask) + expect_value._8 &= (u64)access_mask; + break; + default: + break; /* ignore; we do not diff the values */ + } + + /* Were we able to observe a value-change? */ + if (expect_value._8 != 0) + value_change = KCSAN_VALUE_CHANGE_TRUE; + + /* Check if this access raced with another. */ + if (!consume_watchpoint(watchpoint)) { + /* + * Depending on the access type, map a value_change of MAYBE to + * TRUE (always report) or FALSE (never report). + */ + if (value_change == KCSAN_VALUE_CHANGE_MAYBE) { + if (access_mask != 0) { + /* + * For access with access_mask, we require a + * value-change, as it is likely that races on + * ~access_mask bits are expected. + */ + value_change = KCSAN_VALUE_CHANGE_FALSE; + } else if (size > 8 || is_assert) { + /* Always assume a value-change. */ + value_change = KCSAN_VALUE_CHANGE_TRUE; + } + } + + /* + * No need to increment 'data_races' counter, as the racing + * thread already did. + * + * Count 'assert_failures' for each failed ASSERT access, + * therefore both this thread and the racing thread may + * increment this counter. + */ + if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE) + kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + + kcsan_report(ptr, size, type, value_change, KCSAN_REPORT_RACE_SIGNAL, + watchpoint - watchpoints); + } else if (value_change == KCSAN_VALUE_CHANGE_TRUE) { + /* Inferring a race, since the value should not have changed. */ + + kcsan_counter_inc(KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN); + if (is_assert) + kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + + if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) + kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_TRUE, + KCSAN_REPORT_RACE_UNKNOWN_ORIGIN, + watchpoint - watchpoints); + } + + /* + * Remove watchpoint; must be after reporting, since the slot may be + * reused after this point. + */ + remove_watchpoint(watchpoint); + kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS); +out_unlock: + if (!kcsan_interrupt_watcher) + local_irq_restore(irq_flags); + kcsan_restore_irqtrace(current); +out: + user_access_restore(ua_flags); +} + +static __always_inline void check_access(const volatile void *ptr, size_t size, + int type) +{ + const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0; + atomic_long_t *watchpoint; + long encoded_watchpoint; + + /* + * Do nothing for 0 sized check; this comparison will be optimized out + * for constant sized instrumentation (__tsan_{read,write}N). + */ + if (unlikely(size == 0)) + return; + + /* + * Avoid user_access_save in fast-path: find_watchpoint is safe without + * user_access_save, as the address that ptr points to is only used to + * check if a watchpoint exists; ptr is never dereferenced. + */ + watchpoint = find_watchpoint((unsigned long)ptr, size, !is_write, + &encoded_watchpoint); + /* + * It is safe to check kcsan_is_enabled() after find_watchpoint in the + * slow-path, as long as no state changes that cause a race to be + * detected and reported have occurred until kcsan_is_enabled() is + * checked. + */ + + if (unlikely(watchpoint != NULL)) + kcsan_found_watchpoint(ptr, size, type, watchpoint, + encoded_watchpoint); + else { + struct kcsan_ctx *ctx = get_ctx(); /* Call only once in fast-path. */ + + if (unlikely(should_watch(ptr, size, type, ctx))) + kcsan_setup_watchpoint(ptr, size, type); + else if (unlikely(ctx->scoped_accesses.prev)) + kcsan_check_scoped_accesses(); + } +} + +/* === Public interface ===================================================== */ + +void __init kcsan_init(void) +{ + BUG_ON(!in_task()); + + kcsan_debugfs_init(); + + /* + * We are in the init task, and no other tasks should be running; + * WRITE_ONCE without memory barrier is sufficient. + */ + if (kcsan_early_enable) + WRITE_ONCE(kcsan_enabled, true); +} + +/* === Exported interface =================================================== */ + +void kcsan_disable_current(void) +{ + ++get_ctx()->disable_count; +} +EXPORT_SYMBOL(kcsan_disable_current); + +void kcsan_enable_current(void) +{ + if (get_ctx()->disable_count-- == 0) { + /* + * Warn if kcsan_enable_current() calls are unbalanced with + * kcsan_disable_current() calls, which causes disable_count to + * become negative and should not happen. + */ + kcsan_disable_current(); /* restore to 0, KCSAN still enabled */ + kcsan_disable_current(); /* disable to generate warning */ + WARN(1, "Unbalanced %s()", __func__); + kcsan_enable_current(); + } +} +EXPORT_SYMBOL(kcsan_enable_current); + +void kcsan_enable_current_nowarn(void) +{ + if (get_ctx()->disable_count-- == 0) + kcsan_disable_current(); +} +EXPORT_SYMBOL(kcsan_enable_current_nowarn); + +void kcsan_nestable_atomic_begin(void) +{ + /* + * Do *not* check and warn if we are in a flat atomic region: nestable + * and flat atomic regions are independent from each other. + * See include/linux/kcsan.h: struct kcsan_ctx comments for more + * comments. + */ + + ++get_ctx()->atomic_nest_count; +} +EXPORT_SYMBOL(kcsan_nestable_atomic_begin); + +void kcsan_nestable_atomic_end(void) +{ + if (get_ctx()->atomic_nest_count-- == 0) { + /* + * Warn if kcsan_nestable_atomic_end() calls are unbalanced with + * kcsan_nestable_atomic_begin() calls, which causes + * atomic_nest_count to become negative and should not happen. + */ + kcsan_nestable_atomic_begin(); /* restore to 0 */ + kcsan_disable_current(); /* disable to generate warning */ + WARN(1, "Unbalanced %s()", __func__); + kcsan_enable_current(); + } +} +EXPORT_SYMBOL(kcsan_nestable_atomic_end); + +void kcsan_flat_atomic_begin(void) +{ + get_ctx()->in_flat_atomic = true; +} +EXPORT_SYMBOL(kcsan_flat_atomic_begin); + +void kcsan_flat_atomic_end(void) +{ + get_ctx()->in_flat_atomic = false; +} +EXPORT_SYMBOL(kcsan_flat_atomic_end); + +void kcsan_atomic_next(int n) +{ + get_ctx()->atomic_next = n; +} +EXPORT_SYMBOL(kcsan_atomic_next); + +void kcsan_set_access_mask(unsigned long mask) +{ + get_ctx()->access_mask = mask; +} +EXPORT_SYMBOL(kcsan_set_access_mask); + +struct kcsan_scoped_access * +kcsan_begin_scoped_access(const volatile void *ptr, size_t size, int type, + struct kcsan_scoped_access *sa) +{ + struct kcsan_ctx *ctx = get_ctx(); + + __kcsan_check_access(ptr, size, type); + + ctx->disable_count++; /* Disable KCSAN, in case list debugging is on. */ + + INIT_LIST_HEAD(&sa->list); + sa->ptr = ptr; + sa->size = size; + sa->type = type; + + if (!ctx->scoped_accesses.prev) /* Lazy initialize list head. */ + INIT_LIST_HEAD(&ctx->scoped_accesses); + list_add(&sa->list, &ctx->scoped_accesses); + + ctx->disable_count--; + return sa; +} +EXPORT_SYMBOL(kcsan_begin_scoped_access); + +void kcsan_end_scoped_access(struct kcsan_scoped_access *sa) +{ + struct kcsan_ctx *ctx = get_ctx(); + + if (WARN(!ctx->scoped_accesses.prev, "Unbalanced %s()?", __func__)) + return; + + ctx->disable_count++; /* Disable KCSAN, in case list debugging is on. */ + + list_del(&sa->list); + if (list_empty(&ctx->scoped_accesses)) + /* + * Ensure we do not enter kcsan_check_scoped_accesses() + * slow-path if unnecessary, and avoids requiring list_empty() + * in the fast-path (to avoid a READ_ONCE() and potential + * uaccess warning). + */ + ctx->scoped_accesses.prev = NULL; + + ctx->disable_count--; + + __kcsan_check_access(sa->ptr, sa->size, sa->type); +} +EXPORT_SYMBOL(kcsan_end_scoped_access); + +void __kcsan_check_access(const volatile void *ptr, size_t size, int type) +{ + check_access(ptr, size, type); +} +EXPORT_SYMBOL(__kcsan_check_access); + +/* + * KCSAN uses the same instrumentation that is emitted by supported compilers + * for ThreadSanitizer (TSAN). + * + * When enabled, the compiler emits instrumentation calls (the functions + * prefixed with "__tsan" below) for all loads and stores that it generated; + * inline asm is not instrumented. + * + * Note that, not all supported compiler versions distinguish aligned/unaligned + * accesses, but e.g. recent versions of Clang do. We simply alias the unaligned + * version to the generic version, which can handle both. + */ + +#define DEFINE_TSAN_READ_WRITE(size) \ + void __tsan_read##size(void *ptr); \ + void __tsan_read##size(void *ptr) \ + { \ + check_access(ptr, size, 0); \ + } \ + EXPORT_SYMBOL(__tsan_read##size); \ + void __tsan_unaligned_read##size(void *ptr) \ + __alias(__tsan_read##size); \ + EXPORT_SYMBOL(__tsan_unaligned_read##size); \ + void __tsan_write##size(void *ptr); \ + void __tsan_write##size(void *ptr) \ + { \ + check_access(ptr, size, KCSAN_ACCESS_WRITE); \ + } \ + EXPORT_SYMBOL(__tsan_write##size); \ + void __tsan_unaligned_write##size(void *ptr) \ + __alias(__tsan_write##size); \ + EXPORT_SYMBOL(__tsan_unaligned_write##size) + +DEFINE_TSAN_READ_WRITE(1); +DEFINE_TSAN_READ_WRITE(2); +DEFINE_TSAN_READ_WRITE(4); +DEFINE_TSAN_READ_WRITE(8); +DEFINE_TSAN_READ_WRITE(16); + +void __tsan_read_range(void *ptr, size_t size); +void __tsan_read_range(void *ptr, size_t size) +{ + check_access(ptr, size, 0); +} +EXPORT_SYMBOL(__tsan_read_range); + +void __tsan_write_range(void *ptr, size_t size); +void __tsan_write_range(void *ptr, size_t size) +{ + check_access(ptr, size, KCSAN_ACCESS_WRITE); +} +EXPORT_SYMBOL(__tsan_write_range); + +/* + * Use of explicit volatile is generally disallowed [1], however, volatile is + * still used in various concurrent context, whether in low-level + * synchronization primitives or for legacy reasons. + * [1] https://lwn.net/Articles/233479/ + * + * We only consider volatile accesses atomic if they are aligned and would pass + * the size-check of compiletime_assert_rwonce_type(). + */ +#define DEFINE_TSAN_VOLATILE_READ_WRITE(size) \ + void __tsan_volatile_read##size(void *ptr); \ + void __tsan_volatile_read##size(void *ptr) \ + { \ + const bool is_atomic = size <= sizeof(long long) && \ + IS_ALIGNED((unsigned long)ptr, size); \ + if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) && is_atomic) \ + return; \ + check_access(ptr, size, is_atomic ? KCSAN_ACCESS_ATOMIC : 0); \ + } \ + EXPORT_SYMBOL(__tsan_volatile_read##size); \ + void __tsan_unaligned_volatile_read##size(void *ptr) \ + __alias(__tsan_volatile_read##size); \ + EXPORT_SYMBOL(__tsan_unaligned_volatile_read##size); \ + void __tsan_volatile_write##size(void *ptr); \ + void __tsan_volatile_write##size(void *ptr) \ + { \ + const bool is_atomic = size <= sizeof(long long) && \ + IS_ALIGNED((unsigned long)ptr, size); \ + if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) && is_atomic) \ + return; \ + check_access(ptr, size, \ + KCSAN_ACCESS_WRITE | \ + (is_atomic ? KCSAN_ACCESS_ATOMIC : 0)); \ + } \ + EXPORT_SYMBOL(__tsan_volatile_write##size); \ + void __tsan_unaligned_volatile_write##size(void *ptr) \ + __alias(__tsan_volatile_write##size); \ + EXPORT_SYMBOL(__tsan_unaligned_volatile_write##size) + +DEFINE_TSAN_VOLATILE_READ_WRITE(1); +DEFINE_TSAN_VOLATILE_READ_WRITE(2); +DEFINE_TSAN_VOLATILE_READ_WRITE(4); +DEFINE_TSAN_VOLATILE_READ_WRITE(8); +DEFINE_TSAN_VOLATILE_READ_WRITE(16); + +/* + * The below are not required by KCSAN, but can still be emitted by the + * compiler. + */ +void __tsan_func_entry(void *call_pc); +void __tsan_func_entry(void *call_pc) +{ +} +EXPORT_SYMBOL(__tsan_func_entry); +void __tsan_func_exit(void); +void __tsan_func_exit(void) +{ +} +EXPORT_SYMBOL(__tsan_func_exit); +void __tsan_init(void); +void __tsan_init(void) +{ +} +EXPORT_SYMBOL(__tsan_init); diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c new file mode 100644 index 000000000000..023e49c58d55 --- /dev/null +++ b/kernel/kcsan/debugfs.c @@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/atomic.h> +#include <linux/bsearch.h> +#include <linux/bug.h> +#include <linux/debugfs.h> +#include <linux/init.h> +#include <linux/kallsyms.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/sort.h> +#include <linux/string.h> +#include <linux/uaccess.h> + +#include "kcsan.h" + +/* + * Statistics counters. + */ +static atomic_long_t counters[KCSAN_COUNTER_COUNT]; + +/* + * Addresses for filtering functions from reporting. This list can be used as a + * whitelist or blacklist. + */ +static struct { + unsigned long *addrs; /* array of addresses */ + size_t size; /* current size */ + int used; /* number of elements used */ + bool sorted; /* if elements are sorted */ + bool whitelist; /* if list is a blacklist or whitelist */ +} report_filterlist = { + .addrs = NULL, + .size = 8, /* small initial size */ + .used = 0, + .sorted = false, + .whitelist = false, /* default is blacklist */ +}; +static DEFINE_SPINLOCK(report_filterlist_lock); + +static const char *counter_to_name(enum kcsan_counter_id id) +{ + switch (id) { + case KCSAN_COUNTER_USED_WATCHPOINTS: return "used_watchpoints"; + case KCSAN_COUNTER_SETUP_WATCHPOINTS: return "setup_watchpoints"; + case KCSAN_COUNTER_DATA_RACES: return "data_races"; + case KCSAN_COUNTER_ASSERT_FAILURES: return "assert_failures"; + case KCSAN_COUNTER_NO_CAPACITY: return "no_capacity"; + case KCSAN_COUNTER_REPORT_RACES: return "report_races"; + case KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN: return "races_unknown_origin"; + case KCSAN_COUNTER_UNENCODABLE_ACCESSES: return "unencodable_accesses"; + case KCSAN_COUNTER_ENCODING_FALSE_POSITIVES: return "encoding_false_positives"; + case KCSAN_COUNTER_COUNT: + BUG(); + } + return NULL; +} + +void kcsan_counter_inc(enum kcsan_counter_id id) +{ + atomic_long_inc(&counters[id]); +} + +void kcsan_counter_dec(enum kcsan_counter_id id) +{ + atomic_long_dec(&counters[id]); +} + +/* + * The microbenchmark allows benchmarking KCSAN core runtime only. To run + * multiple threads, pipe 'microbench=<iters>' from multiple tasks into the + * debugfs file. This will not generate any conflicts, and tests fast-path only. + */ +static noinline void microbenchmark(unsigned long iters) +{ + const struct kcsan_ctx ctx_save = current->kcsan_ctx; + const bool was_enabled = READ_ONCE(kcsan_enabled); + cycles_t cycles; + + /* We may have been called from an atomic region; reset context. */ + memset(¤t->kcsan_ctx, 0, sizeof(current->kcsan_ctx)); + /* + * Disable to benchmark fast-path for all accesses, and (expected + * negligible) call into slow-path, but never set up watchpoints. + */ + WRITE_ONCE(kcsan_enabled, false); + + pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); + + cycles = get_cycles(); + while (iters--) { + unsigned long addr = iters & ((PAGE_SIZE << 8) - 1); + int type = !(iters & 0x7f) ? KCSAN_ACCESS_ATOMIC : + (!(iters & 0xf) ? KCSAN_ACCESS_WRITE : 0); + __kcsan_check_access((void *)addr, sizeof(long), type); + } + cycles = get_cycles() - cycles; + + pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles); + + WRITE_ONCE(kcsan_enabled, was_enabled); + /* restore context */ + current->kcsan_ctx = ctx_save; +} + +/* + * Simple test to create conflicting accesses. Write 'test=<iters>' to KCSAN's + * debugfs file from multiple tasks to generate real conflicts and show reports. + */ +static long test_dummy; +static long test_flags; +static long test_scoped; +static noinline void test_thread(unsigned long iters) +{ + const long CHANGE_BITS = 0xff00ff00ff00ff00L; + const struct kcsan_ctx ctx_save = current->kcsan_ctx; + cycles_t cycles; + + /* We may have been called from an atomic region; reset context. */ + memset(¤t->kcsan_ctx, 0, sizeof(current->kcsan_ctx)); + + pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); + pr_info("test_dummy@%px, test_flags@%px, test_scoped@%px,\n", + &test_dummy, &test_flags, &test_scoped); + + cycles = get_cycles(); + while (iters--) { + /* These all should generate reports. */ + __kcsan_check_read(&test_dummy, sizeof(test_dummy)); + ASSERT_EXCLUSIVE_WRITER(test_dummy); + ASSERT_EXCLUSIVE_ACCESS(test_dummy); + + ASSERT_EXCLUSIVE_BITS(test_flags, ~CHANGE_BITS); /* no report */ + __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */ + + ASSERT_EXCLUSIVE_BITS(test_flags, CHANGE_BITS); /* report */ + __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */ + + /* not actually instrumented */ + WRITE_ONCE(test_dummy, iters); /* to observe value-change */ + __kcsan_check_write(&test_dummy, sizeof(test_dummy)); + + test_flags ^= CHANGE_BITS; /* generate value-change */ + __kcsan_check_write(&test_flags, sizeof(test_flags)); + + BUG_ON(current->kcsan_ctx.scoped_accesses.prev); + { + /* Should generate reports anywhere in this block. */ + ASSERT_EXCLUSIVE_WRITER_SCOPED(test_scoped); + ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_scoped); + BUG_ON(!current->kcsan_ctx.scoped_accesses.prev); + /* Unrelated accesses. */ + __kcsan_check_access(&cycles, sizeof(cycles), 0); + __kcsan_check_access(&cycles, sizeof(cycles), KCSAN_ACCESS_ATOMIC); + } + BUG_ON(current->kcsan_ctx.scoped_accesses.prev); + } + cycles = get_cycles() - cycles; + + pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles); + + /* restore context */ + current->kcsan_ctx = ctx_save; +} + +static int cmp_filterlist_addrs(const void *rhs, const void *lhs) +{ + const unsigned long a = *(const unsigned long *)rhs; + const unsigned long b = *(const unsigned long *)lhs; + + return a < b ? -1 : a == b ? 0 : 1; +} + +bool kcsan_skip_report_debugfs(unsigned long func_addr) +{ + unsigned long symbolsize, offset; + unsigned long flags; + bool ret = false; + + if (!kallsyms_lookup_size_offset(func_addr, &symbolsize, &offset)) + return false; + func_addr -= offset; /* Get function start */ + + spin_lock_irqsave(&report_filterlist_lock, flags); + if (report_filterlist.used == 0) + goto out; + + /* Sort array if it is unsorted, and then do a binary search. */ + if (!report_filterlist.sorted) { + sort(report_filterlist.addrs, report_filterlist.used, + sizeof(unsigned long), cmp_filterlist_addrs, NULL); + report_filterlist.sorted = true; + } + ret = !!bsearch(&func_addr, report_filterlist.addrs, + report_filterlist.used, sizeof(unsigned long), + cmp_filterlist_addrs); + if (report_filterlist.whitelist) + ret = !ret; + +out: + spin_unlock_irqrestore(&report_filterlist_lock, flags); + return ret; +} + +static void set_report_filterlist_whitelist(bool whitelist) +{ + unsigned long flags; + + spin_lock_irqsave(&report_filterlist_lock, flags); + report_filterlist.whitelist = whitelist; + spin_unlock_irqrestore(&report_filterlist_lock, flags); +} + +/* Returns 0 on success, error-code otherwise. */ +static ssize_t insert_report_filterlist(const char *func) +{ + unsigned long flags; + unsigned long addr = kallsyms_lookup_name(func); + ssize_t ret = 0; + + if (!addr) { + pr_err("KCSAN: could not find function: '%s'\n", func); + return -ENOENT; + } + + spin_lock_irqsave(&report_filterlist_lock, flags); + + if (report_filterlist.addrs == NULL) { + /* initial allocation */ + report_filterlist.addrs = + kmalloc_array(report_filterlist.size, + sizeof(unsigned long), GFP_ATOMIC); + if (report_filterlist.addrs == NULL) { + ret = -ENOMEM; + goto out; + } + } else if (report_filterlist.used == report_filterlist.size) { + /* resize filterlist */ + size_t new_size = report_filterlist.size * 2; + unsigned long *new_addrs = + krealloc(report_filterlist.addrs, + new_size * sizeof(unsigned long), GFP_ATOMIC); + + if (new_addrs == NULL) { + /* leave filterlist itself untouched */ + ret = -ENOMEM; + goto out; + } + + report_filterlist.size = new_size; + report_filterlist.addrs = new_addrs; + } + + /* Note: deduplicating should be done in userspace. */ + report_filterlist.addrs[report_filterlist.used++] = + kallsyms_lookup_name(func); + report_filterlist.sorted = false; + +out: + spin_unlock_irqrestore(&report_filterlist_lock, flags); + + return ret; +} + +static int show_info(struct seq_file *file, void *v) +{ + int i; + unsigned long flags; + + /* show stats */ + seq_printf(file, "enabled: %i\n", READ_ONCE(kcsan_enabled)); + for (i = 0; i < KCSAN_COUNTER_COUNT; ++i) + seq_printf(file, "%s: %ld\n", counter_to_name(i), + atomic_long_read(&counters[i])); + + /* show filter functions, and filter type */ + spin_lock_irqsave(&report_filterlist_lock, flags); + seq_printf(file, "\n%s functions: %s\n", + report_filterlist.whitelist ? "whitelisted" : "blacklisted", + report_filterlist.used == 0 ? "none" : ""); + for (i = 0; i < report_filterlist.used; ++i) + seq_printf(file, " %ps\n", (void *)report_filterlist.addrs[i]); + spin_unlock_irqrestore(&report_filterlist_lock, flags); + + return 0; +} + +static int debugfs_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_info, NULL); +} + +static ssize_t +debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *off) +{ + char kbuf[KSYM_NAME_LEN]; + char *arg; + int read_len = count < (sizeof(kbuf) - 1) ? count : (sizeof(kbuf) - 1); + + if (copy_from_user(kbuf, buf, read_len)) + return -EFAULT; + kbuf[read_len] = '\0'; + arg = strstrip(kbuf); + + if (!strcmp(arg, "on")) { + WRITE_ONCE(kcsan_enabled, true); + } else if (!strcmp(arg, "off")) { + WRITE_ONCE(kcsan_enabled, false); + } else if (!strncmp(arg, "microbench=", sizeof("microbench=") - 1)) { + unsigned long iters; + + if (kstrtoul(&arg[sizeof("microbench=") - 1], 0, &iters)) + return -EINVAL; + microbenchmark(iters); + } else if (!strncmp(arg, "test=", sizeof("test=") - 1)) { + unsigned long iters; + + if (kstrtoul(&arg[sizeof("test=") - 1], 0, &iters)) + return -EINVAL; + test_thread(iters); + } else if (!strcmp(arg, "whitelist")) { + set_report_filterlist_whitelist(true); + } else if (!strcmp(arg, "blacklist")) { + set_report_filterlist_whitelist(false); + } else if (arg[0] == '!') { + ssize_t ret = insert_report_filterlist(&arg[1]); + + if (ret < 0) + return ret; + } else { + return -EINVAL; + } + + return count; +} + +static const struct file_operations debugfs_ops = +{ + .read = seq_read, + .open = debugfs_open, + .write = debugfs_write, + .release = single_release +}; + +void __init kcsan_debugfs_init(void) +{ + debugfs_create_file("kcsan", 0644, NULL, NULL, &debugfs_ops); +} diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h new file mode 100644 index 000000000000..f03562aaf2eb --- /dev/null +++ b/kernel/kcsan/encoding.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _KERNEL_KCSAN_ENCODING_H +#define _KERNEL_KCSAN_ENCODING_H + +#include <linux/bits.h> +#include <linux/log2.h> +#include <linux/mm.h> + +#include "kcsan.h" + +#define SLOT_RANGE PAGE_SIZE + +#define INVALID_WATCHPOINT 0 +#define CONSUMED_WATCHPOINT 1 + +/* + * The maximum useful size of accesses for which we set up watchpoints is the + * max range of slots we check on an access. + */ +#define MAX_ENCODABLE_SIZE (SLOT_RANGE * (1 + KCSAN_CHECK_ADJACENT)) + +/* + * Number of bits we use to store size info. + */ +#define WATCHPOINT_SIZE_BITS bits_per(MAX_ENCODABLE_SIZE) +/* + * This encoding for addresses discards the upper (1 for is-write + SIZE_BITS); + * however, most 64-bit architectures do not use the full 64-bit address space. + * Also, in order for a false positive to be observable 2 things need to happen: + * + * 1. different addresses but with the same encoded address race; + * 2. and both map onto the same watchpoint slots; + * + * Both these are assumed to be very unlikely. However, in case it still happens + * happens, the report logic will filter out the false positive (see report.c). + */ +#define WATCHPOINT_ADDR_BITS (BITS_PER_LONG-1 - WATCHPOINT_SIZE_BITS) + +/* + * Masks to set/retrieve the encoded data. + */ +#define WATCHPOINT_WRITE_MASK BIT(BITS_PER_LONG-1) +#define WATCHPOINT_SIZE_MASK \ + GENMASK(BITS_PER_LONG-2, BITS_PER_LONG-2 - WATCHPOINT_SIZE_BITS) +#define WATCHPOINT_ADDR_MASK \ + GENMASK(BITS_PER_LONG-3 - WATCHPOINT_SIZE_BITS, 0) + +static inline bool check_encodable(unsigned long addr, size_t size) +{ + return size <= MAX_ENCODABLE_SIZE; +} + +static inline long +encode_watchpoint(unsigned long addr, size_t size, bool is_write) +{ + return (long)((is_write ? WATCHPOINT_WRITE_MASK : 0) | + (size << WATCHPOINT_ADDR_BITS) | + (addr & WATCHPOINT_ADDR_MASK)); +} + +static __always_inline bool decode_watchpoint(long watchpoint, + unsigned long *addr_masked, + size_t *size, + bool *is_write) +{ + if (watchpoint == INVALID_WATCHPOINT || + watchpoint == CONSUMED_WATCHPOINT) + return false; + + *addr_masked = (unsigned long)watchpoint & WATCHPOINT_ADDR_MASK; + *size = ((unsigned long)watchpoint & WATCHPOINT_SIZE_MASK) >> WATCHPOINT_ADDR_BITS; + *is_write = !!((unsigned long)watchpoint & WATCHPOINT_WRITE_MASK); + + return true; +} + +/* + * Return watchpoint slot for an address. + */ +static __always_inline int watchpoint_slot(unsigned long addr) +{ + return (addr / PAGE_SIZE) % CONFIG_KCSAN_NUM_WATCHPOINTS; +} + +static __always_inline bool matching_access(unsigned long addr1, size_t size1, + unsigned long addr2, size_t size2) +{ + unsigned long end_range1 = addr1 + size1 - 1; + unsigned long end_range2 = addr2 + size2 - 1; + + return addr1 <= end_range2 && addr2 <= end_range1; +} + +#endif /* _KERNEL_KCSAN_ENCODING_H */ diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c new file mode 100644 index 000000000000..fed6fcb5768c --- /dev/null +++ b/kernel/kcsan/kcsan-test.c @@ -0,0 +1,1107 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KCSAN test with various race scenarious to test runtime behaviour. Since the + * interface with which KCSAN's reports are obtained is via the console, this is + * the output we should verify. For each test case checks the presence (or + * absence) of generated reports. Relies on 'console' tracepoint to capture + * reports as they appear in the kernel log. + * + * Makes use of KUnit for test organization, and the Torture framework for test + * thread control. + * + * Copyright (C) 2020, Google LLC. + * Author: Marco Elver <elver@google.com> + */ + +#include <kunit/test.h> +#include <linux/jiffies.h> +#include <linux/kcsan-checks.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/seqlock.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/timer.h> +#include <linux/torture.h> +#include <linux/tracepoint.h> +#include <linux/types.h> +#include <trace/events/printk.h> + +/* Points to current test-case memory access "kernels". */ +static void (*access_kernels[2])(void); + +static struct task_struct **threads; /* Lists of threads. */ +static unsigned long end_time; /* End time of test. */ + +/* Report as observed from console. */ +static struct { + spinlock_t lock; + int nlines; + char lines[3][512]; +} observed = { + .lock = __SPIN_LOCK_UNLOCKED(observed.lock), +}; + +/* Setup test checking loop. */ +static __no_kcsan inline void +begin_test_checks(void (*func1)(void), void (*func2)(void)) +{ + kcsan_disable_current(); + + /* + * Require at least as long as KCSAN_REPORT_ONCE_IN_MS, to ensure at + * least one race is reported. + */ + end_time = jiffies + msecs_to_jiffies(CONFIG_KCSAN_REPORT_ONCE_IN_MS + 500); + + /* Signal start; release potential initialization of shared data. */ + smp_store_release(&access_kernels[0], func1); + smp_store_release(&access_kernels[1], func2); +} + +/* End test checking loop. */ +static __no_kcsan inline bool +end_test_checks(bool stop) +{ + if (!stop && time_before(jiffies, end_time)) { + /* Continue checking */ + might_sleep(); + return false; + } + + kcsan_enable_current(); + return true; +} + +/* + * Probe for console output: checks if a race was reported, and obtains observed + * lines of interest. + */ +__no_kcsan +static void probe_console(void *ignore, const char *buf, size_t len) +{ + unsigned long flags; + int nlines; + + /* + * Note that KCSAN reports under a global lock, so we do not risk the + * possibility of having multiple reports interleaved. If that were the + * case, we'd expect tests to fail. + */ + + spin_lock_irqsave(&observed.lock, flags); + nlines = observed.nlines; + + if (strnstr(buf, "BUG: KCSAN: ", len) && strnstr(buf, "test_", len)) { + /* + * KCSAN report and related to the test. + * + * The provided @buf is not NUL-terminated; copy no more than + * @len bytes and let strscpy() add the missing NUL-terminator. + */ + strscpy(observed.lines[0], buf, min(len + 1, sizeof(observed.lines[0]))); + nlines = 1; + } else if ((nlines == 1 || nlines == 2) && strnstr(buf, "bytes by", len)) { + strscpy(observed.lines[nlines++], buf, min(len + 1, sizeof(observed.lines[0]))); + + if (strnstr(buf, "race at unknown origin", len)) { + if (WARN_ON(nlines != 2)) + goto out; + + /* No second line of interest. */ + strcpy(observed.lines[nlines++], "<none>"); + } + } + +out: + WRITE_ONCE(observed.nlines, nlines); /* Publish new nlines. */ + spin_unlock_irqrestore(&observed.lock, flags); +} + +/* Check if a report related to the test exists. */ +__no_kcsan +static bool report_available(void) +{ + return READ_ONCE(observed.nlines) == ARRAY_SIZE(observed.lines); +} + +/* Report information we expect in a report. */ +struct expect_report { + /* Access information of both accesses. */ + struct { + void *fn; /* Function pointer to expected function of top frame. */ + void *addr; /* Address of access; unchecked if NULL. */ + size_t size; /* Size of access; unchecked if @addr is NULL. */ + int type; /* Access type, see KCSAN_ACCESS definitions. */ + } access[2]; +}; + +/* Check observed report matches information in @r. */ +__no_kcsan +static bool report_matches(const struct expect_report *r) +{ + const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT; + bool ret = false; + unsigned long flags; + typeof(observed.lines) expect; + const char *end; + char *cur; + int i; + + /* Doubled-checked locking. */ + if (!report_available()) + return false; + + /* Generate expected report contents. */ + + /* Title */ + cur = expect[0]; + end = &expect[0][sizeof(expect[0]) - 1]; + cur += scnprintf(cur, end - cur, "BUG: KCSAN: %s in ", + is_assert ? "assert: race" : "data-race"); + if (r->access[1].fn) { + char tmp[2][64]; + int cmp; + + /* Expect lexographically sorted function names in title. */ + scnprintf(tmp[0], sizeof(tmp[0]), "%pS", r->access[0].fn); + scnprintf(tmp[1], sizeof(tmp[1]), "%pS", r->access[1].fn); + cmp = strcmp(tmp[0], tmp[1]); + cur += scnprintf(cur, end - cur, "%ps / %ps", + cmp < 0 ? r->access[0].fn : r->access[1].fn, + cmp < 0 ? r->access[1].fn : r->access[0].fn); + } else { + scnprintf(cur, end - cur, "%pS", r->access[0].fn); + /* The exact offset won't match, remove it. */ + cur = strchr(expect[0], '+'); + if (cur) + *cur = '\0'; + } + + /* Access 1 */ + cur = expect[1]; + end = &expect[1][sizeof(expect[1]) - 1]; + if (!r->access[1].fn) + cur += scnprintf(cur, end - cur, "race at unknown origin, with "); + + /* Access 1 & 2 */ + for (i = 0; i < 2; ++i) { + const char *const access_type = + (r->access[i].type & KCSAN_ACCESS_ASSERT) ? + ((r->access[i].type & KCSAN_ACCESS_WRITE) ? + "assert no accesses" : + "assert no writes") : + ((r->access[i].type & KCSAN_ACCESS_WRITE) ? + "write" : + "read"); + const char *const access_type_aux = + (r->access[i].type & KCSAN_ACCESS_ATOMIC) ? + " (marked)" : + ((r->access[i].type & KCSAN_ACCESS_SCOPED) ? + " (scoped)" : + ""); + + if (i == 1) { + /* Access 2 */ + cur = expect[2]; + end = &expect[2][sizeof(expect[2]) - 1]; + + if (!r->access[1].fn) { + /* Dummy string if no second access is available. */ + strcpy(cur, "<none>"); + break; + } + } + + cur += scnprintf(cur, end - cur, "%s%s to ", access_type, + access_type_aux); + + if (r->access[i].addr) /* Address is optional. */ + cur += scnprintf(cur, end - cur, "0x%px of %zu bytes", + r->access[i].addr, r->access[i].size); + } + + spin_lock_irqsave(&observed.lock, flags); + if (!report_available()) + goto out; /* A new report is being captured. */ + + /* Finally match expected output to what we actually observed. */ + ret = strstr(observed.lines[0], expect[0]) && + /* Access info may appear in any order. */ + ((strstr(observed.lines[1], expect[1]) && + strstr(observed.lines[2], expect[2])) || + (strstr(observed.lines[1], expect[2]) && + strstr(observed.lines[2], expect[1]))); +out: + spin_unlock_irqrestore(&observed.lock, flags); + return ret; +} + +/* ===== Test kernels ===== */ + +static long test_sink; +static long test_var; +/* @test_array should be large enough to fall into multiple watchpoint slots. */ +static long test_array[3 * PAGE_SIZE / sizeof(long)]; +static struct { + long val[8]; +} test_struct; +static DEFINE_SEQLOCK(test_seqlock); + +/* + * Helper to avoid compiler optimizing out reads, and to generate source values + * for writes. + */ +__no_kcsan +static noinline void sink_value(long v) { WRITE_ONCE(test_sink, v); } + +static noinline void test_kernel_read(void) { sink_value(test_var); } + +static noinline void test_kernel_write(void) +{ + test_var = READ_ONCE_NOCHECK(test_sink) + 1; +} + +static noinline void test_kernel_write_nochange(void) { test_var = 42; } + +/* Suffixed by value-change exception filter. */ +static noinline void test_kernel_write_nochange_rcu(void) { test_var = 42; } + +static noinline void test_kernel_read_atomic(void) +{ + sink_value(READ_ONCE(test_var)); +} + +static noinline void test_kernel_write_atomic(void) +{ + WRITE_ONCE(test_var, READ_ONCE_NOCHECK(test_sink) + 1); +} + +__no_kcsan +static noinline void test_kernel_write_uninstrumented(void) { test_var++; } + +static noinline void test_kernel_data_race(void) { data_race(test_var++); } + +static noinline void test_kernel_assert_writer(void) +{ + ASSERT_EXCLUSIVE_WRITER(test_var); +} + +static noinline void test_kernel_assert_access(void) +{ + ASSERT_EXCLUSIVE_ACCESS(test_var); +} + +#define TEST_CHANGE_BITS 0xff00ff00 + +static noinline void test_kernel_change_bits(void) +{ + if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { + /* + * Avoid race of unknown origin for this test, just pretend they + * are atomic. + */ + kcsan_nestable_atomic_begin(); + test_var ^= TEST_CHANGE_BITS; + kcsan_nestable_atomic_end(); + } else + WRITE_ONCE(test_var, READ_ONCE(test_var) ^ TEST_CHANGE_BITS); +} + +static noinline void test_kernel_assert_bits_change(void) +{ + ASSERT_EXCLUSIVE_BITS(test_var, TEST_CHANGE_BITS); +} + +static noinline void test_kernel_assert_bits_nochange(void) +{ + ASSERT_EXCLUSIVE_BITS(test_var, ~TEST_CHANGE_BITS); +} + +/* To check that scoped assertions do trigger anywhere in scope. */ +static noinline void test_enter_scope(void) +{ + int x = 0; + + /* Unrelated accesses to scoped assert. */ + READ_ONCE(test_sink); + kcsan_check_read(&x, sizeof(x)); +} + +static noinline void test_kernel_assert_writer_scoped(void) +{ + ASSERT_EXCLUSIVE_WRITER_SCOPED(test_var); + test_enter_scope(); +} + +static noinline void test_kernel_assert_access_scoped(void) +{ + ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_var); + test_enter_scope(); +} + +static noinline void test_kernel_rmw_array(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(test_array); ++i) + test_array[i]++; +} + +static noinline void test_kernel_write_struct(void) +{ + kcsan_check_write(&test_struct, sizeof(test_struct)); + kcsan_disable_current(); + test_struct.val[3]++; /* induce value change */ + kcsan_enable_current(); +} + +static noinline void test_kernel_write_struct_part(void) +{ + test_struct.val[3] = 42; +} + +static noinline void test_kernel_read_struct_zero_size(void) +{ + kcsan_check_read(&test_struct.val[3], 0); +} + +static noinline void test_kernel_jiffies_reader(void) +{ + sink_value((long)jiffies); +} + +static noinline void test_kernel_seqlock_reader(void) +{ + unsigned int seq; + + do { + seq = read_seqbegin(&test_seqlock); + sink_value(test_var); + } while (read_seqretry(&test_seqlock, seq)); +} + +static noinline void test_kernel_seqlock_writer(void) +{ + unsigned long flags; + + write_seqlock_irqsave(&test_seqlock, flags); + test_var++; + write_sequnlock_irqrestore(&test_seqlock, flags); +} + +/* ===== Test cases ===== */ + +/* Simple test with normal data race. */ +__no_kcsan +static void test_basic(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + static const struct expect_report never = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + bool match_never = false; + + begin_test_checks(test_kernel_write, test_kernel_read); + do { + match_expect |= report_matches(&expect); + match_never = report_matches(&never); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_TRUE(test, match_expect); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* + * Stress KCSAN with lots of concurrent races on different addresses until + * timeout. + */ +__no_kcsan +static void test_concurrent_races(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + /* NULL will match any address. */ + { test_kernel_rmw_array, NULL, 0, KCSAN_ACCESS_WRITE }, + { test_kernel_rmw_array, NULL, 0, 0 }, + }, + }; + static const struct expect_report never = { + .access = { + { test_kernel_rmw_array, NULL, 0, 0 }, + { test_kernel_rmw_array, NULL, 0, 0 }, + }, + }; + bool match_expect = false; + bool match_never = false; + + begin_test_checks(test_kernel_rmw_array, test_kernel_rmw_array); + do { + match_expect |= report_matches(&expect); + match_never |= report_matches(&never); + } while (!end_test_checks(false)); + KUNIT_EXPECT_TRUE(test, match_expect); /* Sanity check matches exist. */ + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* Test the KCSAN_REPORT_VALUE_CHANGE_ONLY option. */ +__no_kcsan +static void test_novalue_change(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_nochange, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY)) + KUNIT_EXPECT_FALSE(test, match_expect); + else + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* + * Test that the rules where the KCSAN_REPORT_VALUE_CHANGE_ONLY option should + * never apply work. + */ +__no_kcsan +static void test_novalue_change_exception(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_nochange_rcu, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* Test that data races of unknown origin are reported. */ +__no_kcsan +static void test_unknown_origin(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { NULL }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_uninstrumented, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN)) + KUNIT_EXPECT_TRUE(test, match_expect); + else + KUNIT_EXPECT_FALSE(test, match_expect); +} + +/* Test KCSAN_ASSUME_PLAIN_WRITES_ATOMIC if it is selected. */ +__no_kcsan +static void test_write_write_assume_atomic(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write, test_kernel_write); + do { + sink_value(READ_ONCE(test_var)); /* induce value-change */ + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC)) + KUNIT_EXPECT_FALSE(test, match_expect); + else + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* + * Test that data races with writes larger than word-size are always reported, + * even if KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected. + */ +__no_kcsan +static void test_write_write_struct(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_struct, test_kernel_write_struct); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* + * Test that data races where only one write is larger than word-size are always + * reported, even if KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected. + */ +__no_kcsan +static void test_write_write_struct_part(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_write_struct_part, &test_struct.val[3], sizeof(test_struct.val[3]), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_struct, test_kernel_write_struct_part); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* Test that races with atomic accesses never result in reports. */ +__no_kcsan +static void test_read_atomic_write_atomic(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_read_atomic, test_kernel_write_atomic); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* Test that a race with an atomic and plain access result in reports. */ +__no_kcsan +static void test_read_plain_atomic_write(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { test_kernel_write_atomic, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC }, + }, + }; + bool match_expect = false; + + if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) + return; + + begin_test_checks(test_kernel_read, test_kernel_write_atomic); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* Zero-sized accesses should never cause data race reports. */ +__no_kcsan +static void test_zero_size_access(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + }, + }; + const struct expect_report never = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_read_struct_zero_size, &test_struct.val[3], 0, 0 }, + }, + }; + bool match_expect = false; + bool match_never = false; + + begin_test_checks(test_kernel_write_struct, test_kernel_read_struct_zero_size); + do { + match_expect |= report_matches(&expect); + match_never = report_matches(&never); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_TRUE(test, match_expect); /* Sanity check. */ + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* Test the data_race() macro. */ +__no_kcsan +static void test_data_race(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_data_race, test_kernel_data_race); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +__no_kcsan +static void test_assert_exclusive_writer(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_assert_writer, test_kernel_write_nochange); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +__no_kcsan +static void test_assert_exclusive_access(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_assert_access, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +__no_kcsan +static void test_assert_exclusive_access_writer(struct kunit *test) +{ + const struct expect_report expect_access_writer = { + .access = { + { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + }, + }; + const struct expect_report expect_access_access = { + .access = { + { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + }, + }; + const struct expect_report never = { + .access = { + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + }, + }; + bool match_expect_access_writer = false; + bool match_expect_access_access = false; + bool match_never = false; + + begin_test_checks(test_kernel_assert_access, test_kernel_assert_writer); + do { + match_expect_access_writer |= report_matches(&expect_access_writer); + match_expect_access_access |= report_matches(&expect_access_access); + match_never |= report_matches(&never); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_TRUE(test, match_expect_access_writer); + KUNIT_EXPECT_TRUE(test, match_expect_access_access); + KUNIT_EXPECT_FALSE(test, match_never); +} + +__no_kcsan +static void test_assert_exclusive_bits_change(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_assert_bits_change, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + { test_kernel_change_bits, &test_var, sizeof(test_var), + KCSAN_ACCESS_WRITE | (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) ? 0 : KCSAN_ACCESS_ATOMIC) }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_assert_bits_change, test_kernel_change_bits); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +__no_kcsan +static void test_assert_exclusive_bits_nochange(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_assert_bits_nochange, test_kernel_change_bits); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +__no_kcsan +static void test_assert_exclusive_writer_scoped(struct kunit *test) +{ + const struct expect_report expect_start = { + .access = { + { test_kernel_assert_writer_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED }, + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + const struct expect_report expect_anywhere = { + .access = { + { test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED }, + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect_start = false; + bool match_expect_anywhere = false; + + begin_test_checks(test_kernel_assert_writer_scoped, test_kernel_write_nochange); + do { + match_expect_start |= report_matches(&expect_start); + match_expect_anywhere |= report_matches(&expect_anywhere); + } while (!end_test_checks(match_expect_start && match_expect_anywhere)); + KUNIT_EXPECT_TRUE(test, match_expect_start); + KUNIT_EXPECT_TRUE(test, match_expect_anywhere); +} + +__no_kcsan +static void test_assert_exclusive_access_scoped(struct kunit *test) +{ + const struct expect_report expect_start1 = { + .access = { + { test_kernel_assert_access_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + const struct expect_report expect_start2 = { + .access = { expect_start1.access[0], expect_start1.access[0] }, + }; + const struct expect_report expect_inscope = { + .access = { + { test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect_start = false; + bool match_expect_inscope = false; + + begin_test_checks(test_kernel_assert_access_scoped, test_kernel_read); + end_time += msecs_to_jiffies(1000); /* This test requires a bit more time. */ + do { + match_expect_start |= report_matches(&expect_start1) || report_matches(&expect_start2); + match_expect_inscope |= report_matches(&expect_inscope); + } while (!end_test_checks(match_expect_start && match_expect_inscope)); + KUNIT_EXPECT_TRUE(test, match_expect_start); + KUNIT_EXPECT_TRUE(test, match_expect_inscope); +} + +/* + * jiffies is special (declared to be volatile) and its accesses are typically + * not marked; this test ensures that the compiler nor KCSAN gets confused about + * jiffies's declaration on different architectures. + */ +__no_kcsan +static void test_jiffies_noreport(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_jiffies_reader, test_kernel_jiffies_reader); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* Test that racing accesses in seqlock critical sections are not reported. */ +__no_kcsan +static void test_seqlock_noreport(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_seqlock_reader, test_kernel_seqlock_writer); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* + * Each test case is run with different numbers of threads. Until KUnit supports + * passing arguments for each test case, we encode #threads in the test case + * name (read by get_num_threads()). [The '-' was chosen as a stylistic + * preference to separate test name and #threads.] + * + * The thread counts are chosen to cover potentially interesting boundaries and + * corner cases (range 2-5), and then stress the system with larger counts. + */ +#define KCSAN_KUNIT_CASE(test_name) \ + { .run_case = test_name, .name = #test_name "-02" }, \ + { .run_case = test_name, .name = #test_name "-03" }, \ + { .run_case = test_name, .name = #test_name "-04" }, \ + { .run_case = test_name, .name = #test_name "-05" }, \ + { .run_case = test_name, .name = #test_name "-08" }, \ + { .run_case = test_name, .name = #test_name "-16" } + +static struct kunit_case kcsan_test_cases[] = { + KCSAN_KUNIT_CASE(test_basic), + KCSAN_KUNIT_CASE(test_concurrent_races), + KCSAN_KUNIT_CASE(test_novalue_change), + KCSAN_KUNIT_CASE(test_novalue_change_exception), + KCSAN_KUNIT_CASE(test_unknown_origin), + KCSAN_KUNIT_CASE(test_write_write_assume_atomic), + KCSAN_KUNIT_CASE(test_write_write_struct), + KCSAN_KUNIT_CASE(test_write_write_struct_part), + KCSAN_KUNIT_CASE(test_read_atomic_write_atomic), + KCSAN_KUNIT_CASE(test_read_plain_atomic_write), + KCSAN_KUNIT_CASE(test_zero_size_access), + KCSAN_KUNIT_CASE(test_data_race), + KCSAN_KUNIT_CASE(test_assert_exclusive_writer), + KCSAN_KUNIT_CASE(test_assert_exclusive_access), + KCSAN_KUNIT_CASE(test_assert_exclusive_access_writer), + KCSAN_KUNIT_CASE(test_assert_exclusive_bits_change), + KCSAN_KUNIT_CASE(test_assert_exclusive_bits_nochange), + KCSAN_KUNIT_CASE(test_assert_exclusive_writer_scoped), + KCSAN_KUNIT_CASE(test_assert_exclusive_access_scoped), + KCSAN_KUNIT_CASE(test_jiffies_noreport), + KCSAN_KUNIT_CASE(test_seqlock_noreport), + {}, +}; + +/* ===== End test cases ===== */ + +/* Get number of threads encoded in test name. */ +static bool __no_kcsan +get_num_threads(const char *test, int *nthreads) +{ + int len = strlen(test); + + if (WARN_ON(len < 3)) + return false; + + *nthreads = test[len - 1] - '0'; + *nthreads += (test[len - 2] - '0') * 10; + + if (WARN_ON(*nthreads < 0)) + return false; + + return true; +} + +/* Concurrent accesses from interrupts. */ +__no_kcsan +static void access_thread_timer(struct timer_list *timer) +{ + static atomic_t cnt = ATOMIC_INIT(0); + unsigned int idx; + void (*func)(void); + + idx = (unsigned int)atomic_inc_return(&cnt) % ARRAY_SIZE(access_kernels); + /* Acquire potential initialization. */ + func = smp_load_acquire(&access_kernels[idx]); + if (func) + func(); +} + +/* The main loop for each thread. */ +__no_kcsan +static int access_thread(void *arg) +{ + struct timer_list timer; + unsigned int cnt = 0; + unsigned int idx; + void (*func)(void); + + timer_setup_on_stack(&timer, access_thread_timer, 0); + do { + might_sleep(); + + if (!timer_pending(&timer)) + mod_timer(&timer, jiffies + 1); + else { + /* Iterate through all kernels. */ + idx = cnt++ % ARRAY_SIZE(access_kernels); + /* Acquire potential initialization. */ + func = smp_load_acquire(&access_kernels[idx]); + if (func) + func(); + } + } while (!torture_must_stop()); + del_timer_sync(&timer); + destroy_timer_on_stack(&timer); + + torture_kthread_stopping("access_thread"); + return 0; +} + +__no_kcsan +static int test_init(struct kunit *test) +{ + unsigned long flags; + int nthreads; + int i; + + spin_lock_irqsave(&observed.lock, flags); + for (i = 0; i < ARRAY_SIZE(observed.lines); ++i) + observed.lines[i][0] = '\0'; + observed.nlines = 0; + spin_unlock_irqrestore(&observed.lock, flags); + + if (!torture_init_begin((char *)test->name, 1)) + return -EBUSY; + + if (!get_num_threads(test->name, &nthreads)) + goto err; + + if (WARN_ON(threads)) + goto err; + + for (i = 0; i < ARRAY_SIZE(access_kernels); ++i) { + if (WARN_ON(access_kernels[i])) + goto err; + } + + if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) { + /* + * Without any preemption, keep 2 CPUs free for other tasks, one + * of which is the main test case function checking for + * completion or failure. + */ + const int min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 2 : 0; + const int min_required_cpus = 2 + min_unused_cpus; + + if (num_online_cpus() < min_required_cpus) { + pr_err("%s: too few online CPUs (%u < %d) for test", + test->name, num_online_cpus(), min_required_cpus); + goto err; + } else if (nthreads > num_online_cpus() - min_unused_cpus) { + nthreads = num_online_cpus() - min_unused_cpus; + pr_warn("%s: limiting number of threads to %d\n", + test->name, nthreads); + } + } + + if (nthreads) { + threads = kcalloc(nthreads + 1, sizeof(struct task_struct *), + GFP_KERNEL); + if (WARN_ON(!threads)) + goto err; + + threads[nthreads] = NULL; + for (i = 0; i < nthreads; ++i) { + if (torture_create_kthread(access_thread, NULL, + threads[i])) + goto err; + } + } + + torture_init_end(); + + return 0; + +err: + kfree(threads); + threads = NULL; + torture_init_end(); + return -EINVAL; +} + +__no_kcsan +static void test_exit(struct kunit *test) +{ + struct task_struct **stop_thread; + int i; + + if (torture_cleanup_begin()) + return; + + for (i = 0; i < ARRAY_SIZE(access_kernels); ++i) + WRITE_ONCE(access_kernels[i], NULL); + + if (threads) { + for (stop_thread = threads; *stop_thread; stop_thread++) + torture_stop_kthread(reader_thread, *stop_thread); + + kfree(threads); + threads = NULL; + } + + torture_cleanup_end(); +} + +static struct kunit_suite kcsan_test_suite = { + .name = "kcsan-test", + .test_cases = kcsan_test_cases, + .init = test_init, + .exit = test_exit, +}; +static struct kunit_suite *kcsan_test_suites[] = { &kcsan_test_suite, NULL }; + +__no_kcsan +static void register_tracepoints(struct tracepoint *tp, void *ignore) +{ + check_trace_callback_type_console(probe_console); + if (!strcmp(tp->name, "console")) + WARN_ON(tracepoint_probe_register(tp, probe_console, NULL)); +} + +__no_kcsan +static void unregister_tracepoints(struct tracepoint *tp, void *ignore) +{ + if (!strcmp(tp->name, "console")) + tracepoint_probe_unregister(tp, probe_console, NULL); +} + +/* + * We only want to do tracepoints setup and teardown once, therefore we have to + * customize the init and exit functions and cannot rely on kunit_test_suite(). + */ +static int __init kcsan_test_init(void) +{ + /* + * Because we want to be able to build the test as a module, we need to + * iterate through all known tracepoints, since the static registration + * won't work here. + */ + for_each_kernel_tracepoint(register_tracepoints, NULL); + return __kunit_test_suites_init(kcsan_test_suites); +} + +static void kcsan_test_exit(void) +{ + __kunit_test_suites_exit(kcsan_test_suites); + for_each_kernel_tracepoint(unregister_tracepoints, NULL); + tracepoint_synchronize_unregister(); +} + +late_initcall(kcsan_test_init); +module_exit(kcsan_test_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Marco Elver <elver@google.com>"); diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h new file mode 100644 index 000000000000..29480010dc30 --- /dev/null +++ b/kernel/kcsan/kcsan.h @@ -0,0 +1,149 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * The Kernel Concurrency Sanitizer (KCSAN) infrastructure. For more info please + * see Documentation/dev-tools/kcsan.rst. + */ + +#ifndef _KERNEL_KCSAN_KCSAN_H +#define _KERNEL_KCSAN_KCSAN_H + +#include <linux/kcsan.h> +#include <linux/sched.h> + +/* The number of adjacent watchpoints to check. */ +#define KCSAN_CHECK_ADJACENT 1 +#define NUM_SLOTS (1 + 2*KCSAN_CHECK_ADJACENT) + +extern unsigned int kcsan_udelay_task; +extern unsigned int kcsan_udelay_interrupt; + +/* + * Globally enable and disable KCSAN. + */ +extern bool kcsan_enabled; + +/* + * Save/restore IRQ flags state trace dirtied by KCSAN. + */ +void kcsan_save_irqtrace(struct task_struct *task); +void kcsan_restore_irqtrace(struct task_struct *task); + +/* + * Initialize debugfs file. + */ +void kcsan_debugfs_init(void); + +enum kcsan_counter_id { + /* + * Number of watchpoints currently in use. + */ + KCSAN_COUNTER_USED_WATCHPOINTS, + + /* + * Total number of watchpoints set up. + */ + KCSAN_COUNTER_SETUP_WATCHPOINTS, + + /* + * Total number of data races. + */ + KCSAN_COUNTER_DATA_RACES, + + /* + * Total number of ASSERT failures due to races. If the observed race is + * due to two conflicting ASSERT type accesses, then both will be + * counted. + */ + KCSAN_COUNTER_ASSERT_FAILURES, + + /* + * Number of times no watchpoints were available. + */ + KCSAN_COUNTER_NO_CAPACITY, + + /* + * A thread checking a watchpoint raced with another checking thread; + * only one will be reported. + */ + KCSAN_COUNTER_REPORT_RACES, + + /* + * Observed data value change, but writer thread unknown. + */ + KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN, + + /* + * The access cannot be encoded to a valid watchpoint. + */ + KCSAN_COUNTER_UNENCODABLE_ACCESSES, + + /* + * Watchpoint encoding caused a watchpoint to fire on mismatching + * accesses. + */ + KCSAN_COUNTER_ENCODING_FALSE_POSITIVES, + + KCSAN_COUNTER_COUNT, /* number of counters */ +}; + +/* + * Increment/decrement counter with given id; avoid calling these in fast-path. + */ +extern void kcsan_counter_inc(enum kcsan_counter_id id); +extern void kcsan_counter_dec(enum kcsan_counter_id id); + +/* + * Returns true if data races in the function symbol that maps to func_addr + * (offsets are ignored) should *not* be reported. + */ +extern bool kcsan_skip_report_debugfs(unsigned long func_addr); + +/* + * Value-change states. + */ +enum kcsan_value_change { + /* + * Did not observe a value-change, however, it is valid to report the + * race, depending on preferences. + */ + KCSAN_VALUE_CHANGE_MAYBE, + + /* + * Did not observe a value-change, and it is invalid to report the race. + */ + KCSAN_VALUE_CHANGE_FALSE, + + /* + * The value was observed to change, and the race should be reported. + */ + KCSAN_VALUE_CHANGE_TRUE, +}; + +enum kcsan_report_type { + /* + * The thread that set up the watchpoint and briefly stalled was + * signalled that another thread triggered the watchpoint. + */ + KCSAN_REPORT_RACE_SIGNAL, + + /* + * A thread found and consumed a matching watchpoint. + */ + KCSAN_REPORT_CONSUMED_WATCHPOINT, + + /* + * No other thread was observed to race with the access, but the data + * value before and after the stall differs. + */ + KCSAN_REPORT_RACE_UNKNOWN_ORIGIN, +}; + +/* + * Print a race report from thread that encountered the race. + */ +extern void kcsan_report(const volatile void *ptr, size_t size, int access_type, + enum kcsan_value_change value_change, + enum kcsan_report_type type, int watchpoint_idx); + +#endif /* _KERNEL_KCSAN_KCSAN_H */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c new file mode 100644 index 000000000000..9d07e175de0f --- /dev/null +++ b/kernel/kcsan/report.c @@ -0,0 +1,638 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/debug_locks.h> +#include <linux/delay.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/lockdep.h> +#include <linux/preempt.h> +#include <linux/printk.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/stacktrace.h> + +#include "kcsan.h" +#include "encoding.h" + +/* + * Max. number of stack entries to show in the report. + */ +#define NUM_STACK_ENTRIES 64 + +/* Common access info. */ +struct access_info { + const volatile void *ptr; + size_t size; + int access_type; + int task_pid; + int cpu_id; +}; + +/* + * Other thread info: communicated from other racing thread to thread that set + * up the watchpoint, which then prints the complete report atomically. + */ +struct other_info { + struct access_info ai; + unsigned long stack_entries[NUM_STACK_ENTRIES]; + int num_stack_entries; + + /* + * Optionally pass @current. Typically we do not need to pass @current + * via @other_info since just @task_pid is sufficient. Passing @current + * has additional overhead. + * + * To safely pass @current, we must either use get_task_struct/ + * put_task_struct, or stall the thread that populated @other_info. + * + * We cannot rely on get_task_struct/put_task_struct in case + * release_report() races with a task being released, and would have to + * free it in release_report(). This may result in deadlock if we want + * to use KCSAN on the allocators. + * + * Since we also want to reliably print held locks for + * CONFIG_KCSAN_VERBOSE, the current implementation stalls the thread + * that populated @other_info until it has been consumed. + */ + struct task_struct *task; +}; + +/* + * To never block any producers of struct other_info, we need as many elements + * as we have watchpoints (upper bound on concurrent races to report). + */ +static struct other_info other_infos[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1]; + +/* + * Information about reported races; used to rate limit reporting. + */ +struct report_time { + /* + * The last time the race was reported. + */ + unsigned long time; + + /* + * The frames of the 2 threads; if only 1 thread is known, one frame + * will be 0. + */ + unsigned long frame1; + unsigned long frame2; +}; + +/* + * Since we also want to be able to debug allocators with KCSAN, to avoid + * deadlock, report_times cannot be dynamically resized with krealloc in + * rate_limit_report. + * + * Therefore, we use a fixed-size array, which at most will occupy a page. This + * still adequately rate limits reports, assuming that a) number of unique data + * races is not excessive, and b) occurrence of unique races within the + * same time window is limited. + */ +#define REPORT_TIMES_MAX (PAGE_SIZE / sizeof(struct report_time)) +#define REPORT_TIMES_SIZE \ + (CONFIG_KCSAN_REPORT_ONCE_IN_MS > REPORT_TIMES_MAX ? \ + REPORT_TIMES_MAX : \ + CONFIG_KCSAN_REPORT_ONCE_IN_MS) +static struct report_time report_times[REPORT_TIMES_SIZE]; + +/* + * Spinlock serializing report generation, and access to @other_infos. Although + * it could make sense to have a finer-grained locking story for @other_infos, + * report generation needs to be serialized either way, so not much is gained. + */ +static DEFINE_RAW_SPINLOCK(report_lock); + +/* + * Checks if the race identified by thread frames frame1 and frame2 has + * been reported since (now - KCSAN_REPORT_ONCE_IN_MS). + */ +static bool rate_limit_report(unsigned long frame1, unsigned long frame2) +{ + struct report_time *use_entry = &report_times[0]; + unsigned long invalid_before; + int i; + + BUILD_BUG_ON(CONFIG_KCSAN_REPORT_ONCE_IN_MS != 0 && REPORT_TIMES_SIZE == 0); + + if (CONFIG_KCSAN_REPORT_ONCE_IN_MS == 0) + return false; + + invalid_before = jiffies - msecs_to_jiffies(CONFIG_KCSAN_REPORT_ONCE_IN_MS); + + /* Check if a matching race report exists. */ + for (i = 0; i < REPORT_TIMES_SIZE; ++i) { + struct report_time *rt = &report_times[i]; + + /* + * Must always select an entry for use to store info as we + * cannot resize report_times; at the end of the scan, use_entry + * will be the oldest entry, which ideally also happened before + * KCSAN_REPORT_ONCE_IN_MS ago. + */ + if (time_before(rt->time, use_entry->time)) + use_entry = rt; + + /* + * Initially, no need to check any further as this entry as well + * as following entries have never been used. + */ + if (rt->time == 0) + break; + + /* Check if entry expired. */ + if (time_before(rt->time, invalid_before)) + continue; /* before KCSAN_REPORT_ONCE_IN_MS ago */ + + /* Reported recently, check if race matches. */ + if ((rt->frame1 == frame1 && rt->frame2 == frame2) || + (rt->frame1 == frame2 && rt->frame2 == frame1)) + return true; + } + + use_entry->time = jiffies; + use_entry->frame1 = frame1; + use_entry->frame2 = frame2; + return false; +} + +/* + * Special rules to skip reporting. + */ +static bool +skip_report(enum kcsan_value_change value_change, unsigned long top_frame) +{ + /* Should never get here if value_change==FALSE. */ + WARN_ON_ONCE(value_change == KCSAN_VALUE_CHANGE_FALSE); + + /* + * The first call to skip_report always has value_change==TRUE, since we + * cannot know the value written of an instrumented access. For the 2nd + * call there are 6 cases with CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY: + * + * 1. read watchpoint, conflicting write (value_change==TRUE): report; + * 2. read watchpoint, conflicting write (value_change==MAYBE): skip; + * 3. write watchpoint, conflicting write (value_change==TRUE): report; + * 4. write watchpoint, conflicting write (value_change==MAYBE): skip; + * 5. write watchpoint, conflicting read (value_change==MAYBE): skip; + * 6. write watchpoint, conflicting read (value_change==TRUE): report; + * + * Cases 1-4 are intuitive and expected; case 5 ensures we do not report + * data races where the write may have rewritten the same value; case 6 + * is possible either if the size is larger than what we check value + * changes for or the access type is KCSAN_ACCESS_ASSERT. + */ + if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) && + value_change == KCSAN_VALUE_CHANGE_MAYBE) { + /* + * The access is a write, but the data value did not change. + * + * We opt-out of this filter for certain functions at request of + * maintainers. + */ + char buf[64]; + int len = scnprintf(buf, sizeof(buf), "%ps", (void *)top_frame); + + if (!strnstr(buf, "rcu_", len) && + !strnstr(buf, "_rcu", len) && + !strnstr(buf, "_srcu", len)) + return true; + } + + return kcsan_skip_report_debugfs(top_frame); +} + +static const char *get_access_type(int type) +{ + if (type & KCSAN_ACCESS_ASSERT) { + if (type & KCSAN_ACCESS_SCOPED) { + if (type & KCSAN_ACCESS_WRITE) + return "assert no accesses (scoped)"; + else + return "assert no writes (scoped)"; + } else { + if (type & KCSAN_ACCESS_WRITE) + return "assert no accesses"; + else + return "assert no writes"; + } + } + + switch (type) { + case 0: + return "read"; + case KCSAN_ACCESS_ATOMIC: + return "read (marked)"; + case KCSAN_ACCESS_WRITE: + return "write"; + case KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: + return "write (marked)"; + case KCSAN_ACCESS_SCOPED: + return "read (scoped)"; + case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_ATOMIC: + return "read (marked, scoped)"; + case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE: + return "write (scoped)"; + case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: + return "write (marked, scoped)"; + default: + BUG(); + } +} + +static const char *get_bug_type(int type) +{ + return (type & KCSAN_ACCESS_ASSERT) != 0 ? "assert: race" : "data-race"; +} + +/* Return thread description: in task or interrupt. */ +static const char *get_thread_desc(int task_id) +{ + if (task_id != -1) { + static char buf[32]; /* safe: protected by report_lock */ + + snprintf(buf, sizeof(buf), "task %i", task_id); + return buf; + } + return "interrupt"; +} + +/* Helper to skip KCSAN-related functions in stack-trace. */ +static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries) +{ + char buf[64]; + char *cur; + int len, skip; + + for (skip = 0; skip < num_entries; ++skip) { + len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skip]); + + /* Never show tsan_* or {read,write}_once_size. */ + if (strnstr(buf, "tsan_", len) || + strnstr(buf, "_once_size", len)) + continue; + + cur = strnstr(buf, "kcsan_", len); + if (cur) { + cur += sizeof("kcsan_") - 1; + if (strncmp(cur, "test", sizeof("test") - 1)) + continue; /* KCSAN runtime function. */ + /* KCSAN related test. */ + } + + /* + * No match for runtime functions -- @skip entries to skip to + * get to first frame of interest. + */ + break; + } + + return skip; +} + +/* Compares symbolized strings of addr1 and addr2. */ +static int sym_strcmp(void *addr1, void *addr2) +{ + char buf1[64]; + char buf2[64]; + + snprintf(buf1, sizeof(buf1), "%pS", addr1); + snprintf(buf2, sizeof(buf2), "%pS", addr2); + + return strncmp(buf1, buf2, sizeof(buf1)); +} + +static void print_verbose_info(struct task_struct *task) +{ + if (!task) + return; + + /* Restore IRQ state trace for printing. */ + kcsan_restore_irqtrace(task); + + pr_err("\n"); + debug_show_held_locks(task); + print_irqtrace_events(task); +} + +/* + * Returns true if a report was generated, false otherwise. + */ +static bool print_report(enum kcsan_value_change value_change, + enum kcsan_report_type type, + const struct access_info *ai, + const struct other_info *other_info) +{ + unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 }; + int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1); + int skipnr = get_stack_skipnr(stack_entries, num_stack_entries); + unsigned long this_frame = stack_entries[skipnr]; + unsigned long other_frame = 0; + int other_skipnr = 0; /* silence uninit warnings */ + + /* + * Must check report filter rules before starting to print. + */ + if (skip_report(KCSAN_VALUE_CHANGE_TRUE, stack_entries[skipnr])) + return false; + + if (type == KCSAN_REPORT_RACE_SIGNAL) { + other_skipnr = get_stack_skipnr(other_info->stack_entries, + other_info->num_stack_entries); + other_frame = other_info->stack_entries[other_skipnr]; + + /* @value_change is only known for the other thread */ + if (skip_report(value_change, other_frame)) + return false; + } + + if (rate_limit_report(this_frame, other_frame)) + return false; + + /* Print report header. */ + pr_err("==================================================================\n"); + switch (type) { + case KCSAN_REPORT_RACE_SIGNAL: { + int cmp; + + /* + * Order functions lexographically for consistent bug titles. + * Do not print offset of functions to keep title short. + */ + cmp = sym_strcmp((void *)other_frame, (void *)this_frame); + pr_err("BUG: KCSAN: %s in %ps / %ps\n", + get_bug_type(ai->access_type | other_info->ai.access_type), + (void *)(cmp < 0 ? other_frame : this_frame), + (void *)(cmp < 0 ? this_frame : other_frame)); + } break; + + case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN: + pr_err("BUG: KCSAN: %s in %pS\n", get_bug_type(ai->access_type), + (void *)this_frame); + break; + + default: + BUG(); + } + + pr_err("\n"); + + /* Print information about the racing accesses. */ + switch (type) { + case KCSAN_REPORT_RACE_SIGNAL: + pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n", + get_access_type(other_info->ai.access_type), other_info->ai.ptr, + other_info->ai.size, get_thread_desc(other_info->ai.task_pid), + other_info->ai.cpu_id); + + /* Print the other thread's stack trace. */ + stack_trace_print(other_info->stack_entries + other_skipnr, + other_info->num_stack_entries - other_skipnr, + 0); + + if (IS_ENABLED(CONFIG_KCSAN_VERBOSE)) + print_verbose_info(other_info->task); + + pr_err("\n"); + pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n", + get_access_type(ai->access_type), ai->ptr, ai->size, + get_thread_desc(ai->task_pid), ai->cpu_id); + break; + + case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN: + pr_err("race at unknown origin, with %s to 0x%px of %zu bytes by %s on cpu %i:\n", + get_access_type(ai->access_type), ai->ptr, ai->size, + get_thread_desc(ai->task_pid), ai->cpu_id); + break; + + default: + BUG(); + } + /* Print stack trace of this thread. */ + stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, + 0); + + if (IS_ENABLED(CONFIG_KCSAN_VERBOSE)) + print_verbose_info(current); + + /* Print report footer. */ + pr_err("\n"); + pr_err("Reported by Kernel Concurrency Sanitizer on:\n"); + dump_stack_print_info(KERN_DEFAULT); + pr_err("==================================================================\n"); + + return true; +} + +static void release_report(unsigned long *flags, struct other_info *other_info) +{ + if (other_info) + /* + * Use size to denote valid/invalid, since KCSAN entirely + * ignores 0-sized accesses. + */ + other_info->ai.size = 0; + + raw_spin_unlock_irqrestore(&report_lock, *flags); +} + +/* + * Sets @other_info->task and awaits consumption of @other_info. + * + * Precondition: report_lock is held. + * Postcondition: report_lock is held. + */ +static void set_other_info_task_blocking(unsigned long *flags, + const struct access_info *ai, + struct other_info *other_info) +{ + /* + * We may be instrumenting a code-path where current->state is already + * something other than TASK_RUNNING. + */ + const bool is_running = current->state == TASK_RUNNING; + /* + * To avoid deadlock in case we are in an interrupt here and this is a + * race with a task on the same CPU (KCSAN_INTERRUPT_WATCHER), provide a + * timeout to ensure this works in all contexts. + * + * Await approximately the worst case delay of the reporting thread (if + * we are not interrupted). + */ + int timeout = max(kcsan_udelay_task, kcsan_udelay_interrupt); + + other_info->task = current; + do { + if (is_running) { + /* + * Let lockdep know the real task is sleeping, to print + * the held locks (recall we turned lockdep off, so + * locking/unlocking @report_lock won't be recorded). + */ + set_current_state(TASK_UNINTERRUPTIBLE); + } + raw_spin_unlock_irqrestore(&report_lock, *flags); + /* + * We cannot call schedule() since we also cannot reliably + * determine if sleeping here is permitted -- see in_atomic(). + */ + + udelay(1); + raw_spin_lock_irqsave(&report_lock, *flags); + if (timeout-- < 0) { + /* + * Abort. Reset @other_info->task to NULL, since it + * appears the other thread is still going to consume + * it. It will result in no verbose info printed for + * this task. + */ + other_info->task = NULL; + break; + } + /* + * If invalid, or @ptr nor @current matches, then @other_info + * has been consumed and we may continue. If not, retry. + */ + } while (other_info->ai.size && other_info->ai.ptr == ai->ptr && + other_info->task == current); + if (is_running) + set_current_state(TASK_RUNNING); +} + +/* Populate @other_info; requires that the provided @other_info not in use. */ +static void prepare_report_producer(unsigned long *flags, + const struct access_info *ai, + struct other_info *other_info) +{ + raw_spin_lock_irqsave(&report_lock, *flags); + + /* + * The same @other_infos entry cannot be used concurrently, because + * there is a one-to-one mapping to watchpoint slots (@watchpoints in + * core.c), and a watchpoint is only released for reuse after reporting + * is done by the consumer of @other_info. Therefore, it is impossible + * for another concurrent prepare_report_producer() to set the same + * @other_info, and are guaranteed exclusivity for the @other_infos + * entry pointed to by @other_info. + * + * To check this property holds, size should never be non-zero here, + * because every consumer of struct other_info resets size to 0 in + * release_report(). + */ + WARN_ON(other_info->ai.size); + + other_info->ai = *ai; + other_info->num_stack_entries = stack_trace_save(other_info->stack_entries, NUM_STACK_ENTRIES, 2); + + if (IS_ENABLED(CONFIG_KCSAN_VERBOSE)) + set_other_info_task_blocking(flags, ai, other_info); + + raw_spin_unlock_irqrestore(&report_lock, *flags); +} + +/* Awaits producer to fill @other_info and then returns. */ +static bool prepare_report_consumer(unsigned long *flags, + const struct access_info *ai, + struct other_info *other_info) +{ + + raw_spin_lock_irqsave(&report_lock, *flags); + while (!other_info->ai.size) { /* Await valid @other_info. */ + raw_spin_unlock_irqrestore(&report_lock, *flags); + cpu_relax(); + raw_spin_lock_irqsave(&report_lock, *flags); + } + + /* Should always have a matching access based on watchpoint encoding. */ + if (WARN_ON(!matching_access((unsigned long)other_info->ai.ptr & WATCHPOINT_ADDR_MASK, other_info->ai.size, + (unsigned long)ai->ptr & WATCHPOINT_ADDR_MASK, ai->size))) + goto discard; + + if (!matching_access((unsigned long)other_info->ai.ptr, other_info->ai.size, + (unsigned long)ai->ptr, ai->size)) { + /* + * If the actual accesses to not match, this was a false + * positive due to watchpoint encoding. + */ + kcsan_counter_inc(KCSAN_COUNTER_ENCODING_FALSE_POSITIVES); + goto discard; + } + + return true; + +discard: + release_report(flags, other_info); + return false; +} + +/* + * Depending on the report type either sets @other_info and returns false, or + * awaits @other_info and returns true. If @other_info is not required for the + * report type, simply acquires @report_lock and returns true. + */ +static noinline bool prepare_report(unsigned long *flags, + enum kcsan_report_type type, + const struct access_info *ai, + struct other_info *other_info) +{ + switch (type) { + case KCSAN_REPORT_CONSUMED_WATCHPOINT: + prepare_report_producer(flags, ai, other_info); + return false; + case KCSAN_REPORT_RACE_SIGNAL: + return prepare_report_consumer(flags, ai, other_info); + default: + /* @other_info not required; just acquire @report_lock. */ + raw_spin_lock_irqsave(&report_lock, *flags); + return true; + } +} + +void kcsan_report(const volatile void *ptr, size_t size, int access_type, + enum kcsan_value_change value_change, + enum kcsan_report_type type, int watchpoint_idx) +{ + unsigned long flags = 0; + const struct access_info ai = { + .ptr = ptr, + .size = size, + .access_type = access_type, + .task_pid = in_task() ? task_pid_nr(current) : -1, + .cpu_id = raw_smp_processor_id() + }; + struct other_info *other_info = type == KCSAN_REPORT_RACE_UNKNOWN_ORIGIN + ? NULL : &other_infos[watchpoint_idx]; + + kcsan_disable_current(); + if (WARN_ON(watchpoint_idx < 0 || watchpoint_idx >= ARRAY_SIZE(other_infos))) + goto out; + + /* + * Because we may generate reports when we're in scheduler code, the use + * of printk() could deadlock. Until such time that all printing code + * called in print_report() is scheduler-safe, accept the risk, and just + * get our message out. As such, also disable lockdep to hide the + * warning, and avoid disabling lockdep for the rest of the kernel. + */ + lockdep_off(); + + if (prepare_report(&flags, type, &ai, other_info)) { + /* + * Never report if value_change is FALSE, only if we it is + * either TRUE or MAYBE. In case of MAYBE, further filtering may + * be done once we know the full stack trace in print_report(). + */ + bool reported = value_change != KCSAN_VALUE_CHANGE_FALSE && + print_report(value_change, type, &ai, other_info); + + if (reported && panic_on_warn) + panic("panic_on_warn set ...\n"); + + release_report(&flags, other_info); + } + + lockdep_on(); +out: + kcsan_enable_current(); +} diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c new file mode 100644 index 000000000000..d26a052d3383 --- /dev/null +++ b/kernel/kcsan/selftest.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/printk.h> +#include <linux/random.h> +#include <linux/types.h> + +#include "encoding.h" + +#define ITERS_PER_TEST 2000 + +/* Test requirements. */ +static bool test_requires(void) +{ + /* random should be initialized for the below tests */ + return prandom_u32() + prandom_u32() != 0; +} + +/* + * Test watchpoint encode and decode: check that encoding some access's info, + * and then subsequent decode preserves the access's info. + */ +static bool test_encode_decode(void) +{ + int i; + + for (i = 0; i < ITERS_PER_TEST; ++i) { + size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1; + bool is_write = !!prandom_u32_max(2); + unsigned long addr; + + prandom_bytes(&addr, sizeof(addr)); + if (WARN_ON(!check_encodable(addr, size))) + return false; + + /* Encode and decode */ + { + const long encoded_watchpoint = + encode_watchpoint(addr, size, is_write); + unsigned long verif_masked_addr; + size_t verif_size; + bool verif_is_write; + + /* Check special watchpoints */ + if (WARN_ON(decode_watchpoint( + INVALID_WATCHPOINT, &verif_masked_addr, + &verif_size, &verif_is_write))) + return false; + if (WARN_ON(decode_watchpoint( + CONSUMED_WATCHPOINT, &verif_masked_addr, + &verif_size, &verif_is_write))) + return false; + + /* Check decoding watchpoint returns same data */ + if (WARN_ON(!decode_watchpoint( + encoded_watchpoint, &verif_masked_addr, + &verif_size, &verif_is_write))) + return false; + if (WARN_ON(verif_masked_addr != + (addr & WATCHPOINT_ADDR_MASK))) + goto fail; + if (WARN_ON(verif_size != size)) + goto fail; + if (WARN_ON(is_write != verif_is_write)) + goto fail; + + continue; +fail: + pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n", + __func__, is_write ? "write" : "read", size, + addr, encoded_watchpoint, + verif_is_write ? "write" : "read", verif_size, + verif_masked_addr); + return false; + } + } + + return true; +} + +/* Test access matching function. */ +static bool test_matching_access(void) +{ + if (WARN_ON(!matching_access(10, 1, 10, 1))) + return false; + if (WARN_ON(!matching_access(10, 2, 11, 1))) + return false; + if (WARN_ON(!matching_access(10, 1, 9, 2))) + return false; + if (WARN_ON(matching_access(10, 1, 11, 1))) + return false; + if (WARN_ON(matching_access(9, 1, 10, 1))) + return false; + + /* + * An access of size 0 could match another access, as demonstrated here. + * Rather than add more comparisons to 'matching_access()', which would + * end up in the fast-path for *all* checks, check_access() simply + * returns for all accesses of size 0. + */ + if (WARN_ON(!matching_access(8, 8, 12, 0))) + return false; + + return true; +} + +static int __init kcsan_selftest(void) +{ + int passed = 0; + int total = 0; + +#define RUN_TEST(do_test) \ + do { \ + ++total; \ + if (do_test()) \ + ++passed; \ + else \ + pr_err("KCSAN selftest: " #do_test " failed"); \ + } while (0) + + RUN_TEST(test_requires); + RUN_TEST(test_encode_decode); + RUN_TEST(test_matching_access); + + pr_info("KCSAN selftest: %d/%d tests passed\n", passed, total); + if (passed != total) + panic("KCSAN selftests failed"); + return 0; +} +postcore_initcall(kcsan_selftest); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index faa74d5f6941..94661d2d13ad 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -181,34 +181,19 @@ void kimage_file_post_load_cleanup(struct kimage *image) static int kimage_validate_signature(struct kimage *image) { - const char *reason; int ret; ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, image->kernel_buf_len); - switch (ret) { - case 0: - break; + if (ret) { - /* Certain verification errors are non-fatal if we're not - * checking errors, provided we aren't mandating that there - * must be a valid signature. - */ - case -ENODATA: - reason = "kexec of unsigned image"; - goto decide; - case -ENOPKG: - reason = "kexec of image with unsupported crypto"; - goto decide; - case -ENOKEY: - reason = "kexec of image with unavailable key"; - decide: if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) { - pr_notice("%s rejected\n", reason); + pr_notice("Enforced kernel signature verification failed (%d).\n", ret); return ret; } - /* If IMA is guaranteed to appraise a signature on the kexec + /* + * If IMA is guaranteed to appraise a signature on the kexec * image, permit it even if the kernel is otherwise locked * down. */ @@ -216,17 +201,10 @@ kimage_validate_signature(struct kimage *image) security_locked_down(LOCKDOWN_KEXEC)) return -EPERM; - return 0; - - /* All other errors are fatal, including nomem, unparseable - * signatures and signature check failures - even if signatures - * aren't required. - */ - default: - pr_notice("kernel signature verification failed (%d).\n", ret); + pr_debug("kernel signature verification failed (%d).\n", ret); } - return ret; + return 0; } #endif @@ -287,7 +265,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, goto out; } - ima_kexec_cmdline(image->cmdline_buf, + ima_kexec_cmdline(kernel_fd, image->cmdline_buf, image->cmdline_buf_len - 1); } @@ -540,6 +518,11 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) unsigned long sz = end - start + 1; /* Returning 0 will take to next memory range */ + + /* Don't use memory that will be detected and handled by a driver. */ + if (res->flags & IORESOURCE_MEM_DRIVER_MANAGED) + return 0; + if (sz < kbuf->memsz) return 0; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2625c241ac00..e87679a48ba2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -35,6 +35,7 @@ #include <linux/ftrace.h> #include <linux/cpu.h> #include <linux/jump_label.h> +#include <linux/perf_event.h> #include <asm/sections.h> #include <asm/cacheflush.h> @@ -46,6 +47,11 @@ static int kprobes_initialized; +/* kprobe_table can be accessed by + * - Normal hlist traversal and RCU add/del under kprobe_mutex is held. + * Or + * - RCU hlist traversal under disabling preempt (breakpoint handlers) + */ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; @@ -118,6 +124,7 @@ struct kprobe_insn_cache kprobe_insn_slots = { .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex), .alloc = alloc_insn_page, .free = free_insn_page, + .sym = KPROBE_INSN_PAGE_SYM, .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), .insn_size = MAX_INSN_SIZE, .nr_garbage = 0, @@ -183,6 +190,10 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) kip->cache = c; list_add_rcu(&kip->list, &c->pages); slot = kip->insns; + + /* Record the perf ksymbol register event after adding the page */ + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns, + PAGE_SIZE, false, c->sym); out: mutex_unlock(&c->mutex); return slot; @@ -201,6 +212,13 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx) * next time somebody inserts a probe. */ if (!list_is_singular(&kip->list)) { + /* + * Record perf ksymbol unregister event before removing + * the page. + */ + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + (unsigned long)kip->insns, PAGE_SIZE, true, + kip->cache->sym); list_del_rcu(&kip->list); synchronize_rcu(); kip->cache->free(kip->insns); @@ -290,12 +308,34 @@ bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr) return ret; } +int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum, + unsigned long *value, char *type, char *sym) +{ + struct kprobe_insn_page *kip; + int ret = -ERANGE; + + rcu_read_lock(); + list_for_each_entry_rcu(kip, &c->pages, list) { + if ((*symnum)--) + continue; + strlcpy(sym, c->sym, KSYM_NAME_LEN); + *type = 't'; + *value = (unsigned long)kip->insns; + ret = 0; + break; + } + rcu_read_unlock(); + + return ret; +} + #ifdef CONFIG_OPTPROBES /* For optimized_kprobe buffer */ struct kprobe_insn_cache kprobe_optinsn_slots = { .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex), .alloc = alloc_insn_page, .free = free_insn_page, + .sym = KPROBE_OPTINSN_PAGE_SYM, .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), /* .insn_size is initialized later */ .nr_garbage = 0, @@ -326,7 +366,8 @@ struct kprobe *get_kprobe(void *addr) struct kprobe *p; head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; - hlist_for_each_entry_rcu(p, head, hlist) { + hlist_for_each_entry_rcu(p, head, hlist, + lockdep_is_held(&kprobe_mutex)) { if (p->addr == addr) return p; } @@ -557,8 +598,6 @@ static void kprobe_optimizer(struct work_struct *work) mutex_lock(&kprobe_mutex); cpus_read_lock(); mutex_lock(&text_mutex); - /* Lock modules while optimizing kprobes */ - mutex_lock(&module_mutex); /* * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) @@ -583,14 +622,14 @@ static void kprobe_optimizer(struct work_struct *work) /* Step 4: Free cleaned kprobes after quiesence period */ do_free_cleaned_kprobes(); - mutex_unlock(&module_mutex); mutex_unlock(&text_mutex); cpus_read_unlock(); - mutex_unlock(&kprobe_mutex); /* Step 5: Kick optimizer again if needed */ if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) kick_kprobe_optimizer(); + + mutex_unlock(&kprobe_mutex); } /* Wait for completing optimization and unoptimization */ @@ -668,8 +707,6 @@ static void force_unoptimize_kprobe(struct optimized_kprobe *op) lockdep_assert_cpus_held(); arch_unoptimize_kprobe(op); op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; - if (kprobe_disabled(&op->kp)) - arch_disarm_kprobe(&op->kp); } /* Unoptimize a kprobe if p is optimized */ @@ -849,7 +886,7 @@ static void optimize_all_kprobes(void) kprobes_allow_optimization = true; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, head, hlist) + hlist_for_each_entry(p, head, hlist) if (!kprobe_disabled(p)) optimize_kprobe(p); } @@ -876,7 +913,7 @@ static void unoptimize_all_kprobes(void) kprobes_allow_optimization = false; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, head, hlist) { + hlist_for_each_entry(p, head, hlist) { if (!kprobe_disabled(p)) unoptimize_kprobe(p, false); } @@ -892,7 +929,7 @@ static void unoptimize_all_kprobes(void) static DEFINE_MUTEX(kprobe_sysctl_mutex); int sysctl_kprobes_optimization; int proc_kprobes_optimization_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, + void *buffer, size_t *length, loff_t *ppos) { int ret; @@ -1236,6 +1273,26 @@ __releases(hlist_lock) } NOKPROBE_SYMBOL(kretprobe_table_unlock); +struct kprobe kprobe_busy = { + .addr = (void *) get_kprobe, +}; + +void kprobe_busy_begin(void) +{ + struct kprobe_ctlblk *kcb; + + preempt_disable(); + __this_cpu_write(current_kprobe, &kprobe_busy); + kcb = get_kprobe_ctlblk(); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; +} + +void kprobe_busy_end(void) +{ + __this_cpu_write(current_kprobe, NULL); + preempt_enable(); +} + /* * This function is called from finish_task_switch when task tk becomes dead, * so that we can recycle any function-return probe instances associated @@ -1253,6 +1310,8 @@ void kprobe_flush_task(struct task_struct *tk) /* Early boot. kretprobe_table_locks not yet initialized. */ return; + kprobe_busy_begin(); + INIT_HLIST_HEAD(&empty_rp); hash = hash_ptr(tk, KPROBE_HASH_BITS); head = &kretprobe_inst_table[hash]; @@ -1266,6 +1325,8 @@ void kprobe_flush_task(struct task_struct *tk) hlist_del(&ri->hlist); kfree(ri); } + + kprobe_busy_end(); } NOKPROBE_SYMBOL(kprobe_flush_task); @@ -1499,12 +1560,14 @@ static struct kprobe *__get_valid_kprobe(struct kprobe *p) { struct kprobe *ap, *list_p; + lockdep_assert_held(&kprobe_mutex); + ap = get_kprobe(p->addr); if (unlikely(!ap)) return NULL; if (p != ap) { - list_for_each_entry_rcu(list_p, &ap->list, list) + list_for_each_entry(list_p, &ap->list, list) if (list_p == p) /* kprobe p is a valid probe */ goto valid; @@ -1669,7 +1732,9 @@ static int aggr_kprobe_disabled(struct kprobe *ap) { struct kprobe *kp; - list_for_each_entry_rcu(kp, &ap->list, list) + lockdep_assert_held(&kprobe_mutex); + + list_for_each_entry(kp, &ap->list, list) if (!kprobe_disabled(kp)) /* * There is an active probe on the list. @@ -1748,7 +1813,7 @@ static int __unregister_kprobe_top(struct kprobe *p) else { /* If disabling probe has special handlers, update aggrprobe */ if (p->post_handler && !kprobe_gone(p)) { - list_for_each_entry_rcu(list_p, &ap->list, list) { + list_for_each_entry(list_p, &ap->list, list) { if ((list_p != p) && (list_p->post_handler)) goto noclean; } @@ -2062,13 +2127,15 @@ static void kill_kprobe(struct kprobe *p) { struct kprobe *kp; + lockdep_assert_held(&kprobe_mutex); + p->flags |= KPROBE_FLAG_GONE; if (kprobe_aggrprobe(p)) { /* * If this is an aggr_kprobe, we have to list all the * chained probes and mark them GONE. */ - list_for_each_entry_rcu(kp, &p->list, list) + list_for_each_entry(kp, &p->list, list) kp->flags |= KPROBE_FLAG_GONE; p->post_handler = NULL; kill_optimized_kprobe(p); @@ -2179,6 +2246,46 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end) return 0; } +/* Remove all symbols in given area from kprobe blacklist */ +static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) +{ + struct kprobe_blacklist_entry *ent, *n; + + list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) { + if (ent->start_addr < start || ent->start_addr >= end) + continue; + list_del(&ent->list); + kfree(ent); + } +} + +static void kprobe_remove_ksym_blacklist(unsigned long entry) +{ + kprobe_remove_area_blacklist(entry, entry + 1); +} + +int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, + char *type, char *sym) +{ + return -ERANGE; +} + +int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *sym) +{ +#ifdef __ARCH_WANT_KPROBES_INSN_SLOT + if (!kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym)) + return 0; +#ifdef CONFIG_OPTPROBES + if (!kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym)) + return 0; +#endif +#endif + if (!arch_kprobe_get_kallsym(&symnum, value, type, sym)) + return 0; + return -ERANGE; +} + int __init __weak arch_populate_kprobe_blacklist(void) { return 0; @@ -2211,10 +2318,62 @@ static int __init populate_kprobe_blacklist(unsigned long *start, /* Symbols in __kprobes_text are blacklisted */ ret = kprobe_add_area_blacklist((unsigned long)__kprobes_text_start, (unsigned long)__kprobes_text_end); + if (ret) + return ret; + + /* Symbols in noinstr section are blacklisted */ + ret = kprobe_add_area_blacklist((unsigned long)__noinstr_text_start, + (unsigned long)__noinstr_text_end); return ret ? : arch_populate_kprobe_blacklist(); } +static void add_module_kprobe_blacklist(struct module *mod) +{ + unsigned long start, end; + int i; + + if (mod->kprobe_blacklist) { + for (i = 0; i < mod->num_kprobe_blacklist; i++) + kprobe_add_ksym_blacklist(mod->kprobe_blacklist[i]); + } + + start = (unsigned long)mod->kprobes_text_start; + if (start) { + end = start + mod->kprobes_text_size; + kprobe_add_area_blacklist(start, end); + } + + start = (unsigned long)mod->noinstr_text_start; + if (start) { + end = start + mod->noinstr_text_size; + kprobe_add_area_blacklist(start, end); + } +} + +static void remove_module_kprobe_blacklist(struct module *mod) +{ + unsigned long start, end; + int i; + + if (mod->kprobe_blacklist) { + for (i = 0; i < mod->num_kprobe_blacklist; i++) + kprobe_remove_ksym_blacklist(mod->kprobe_blacklist[i]); + } + + start = (unsigned long)mod->kprobes_text_start; + if (start) { + end = start + mod->kprobes_text_size; + kprobe_remove_area_blacklist(start, end); + } + + start = (unsigned long)mod->noinstr_text_start; + if (start) { + end = start + mod->noinstr_text_size; + kprobe_remove_area_blacklist(start, end); + } +} + /* Module notifier call back, checking kprobes on the module */ static int kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -2225,6 +2384,11 @@ static int kprobes_module_callback(struct notifier_block *nb, unsigned int i; int checkcore = (val == MODULE_STATE_GOING); + if (val == MODULE_STATE_COMING) { + mutex_lock(&kprobe_mutex); + add_module_kprobe_blacklist(mod); + mutex_unlock(&kprobe_mutex); + } if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) return NOTIFY_DONE; @@ -2237,7 +2401,7 @@ static int kprobes_module_callback(struct notifier_block *nb, mutex_lock(&kprobe_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, head, hlist) + hlist_for_each_entry(p, head, hlist) if (within_module_init((unsigned long)p->addr, mod) || (checkcore && within_module_core((unsigned long)p->addr, mod))) { @@ -2255,6 +2419,8 @@ static int kprobes_module_callback(struct notifier_block *nb, kill_kprobe(p); } } + if (val == MODULE_STATE_GOING) + remove_module_kprobe_blacklist(mod); mutex_unlock(&kprobe_mutex); return NOTIFY_DONE; } @@ -2336,7 +2502,7 @@ static void report_probe(struct seq_file *pi, struct kprobe *p, else kprobe_type = "k"; - if (!kallsyms_show_value()) + if (!kallsyms_show_value(pi->file->f_cred)) addr = NULL; if (sym) @@ -2398,28 +2564,19 @@ static int show_kprobe_addr(struct seq_file *pi, void *v) return 0; } -static const struct seq_operations kprobes_seq_ops = { +static const struct seq_operations kprobes_sops = { .start = kprobe_seq_start, .next = kprobe_seq_next, .stop = kprobe_seq_stop, .show = show_kprobe_addr }; -static int kprobes_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &kprobes_seq_ops); -} - -static const struct file_operations debugfs_kprobes_operations = { - .open = kprobes_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(kprobes); /* kprobes/blacklist -- shows which functions can not be probed */ static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos) { + mutex_lock(&kprobe_mutex); return seq_list_start(&kprobe_blacklist, *pos); } @@ -2437,7 +2594,7 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) * If /proc/kallsyms is not showing kernel address, we won't * show them here either. */ - if (!kallsyms_show_value()) + if (!kallsyms_show_value(m->file->f_cred)) seq_printf(m, "0x%px-0x%px\t%ps\n", NULL, NULL, (void *)ent->start_addr); else @@ -2446,24 +2603,18 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) return 0; } -static const struct seq_operations kprobe_blacklist_seq_ops = { - .start = kprobe_blacklist_seq_start, - .next = kprobe_blacklist_seq_next, - .stop = kprobe_seq_stop, /* Reuse void function */ - .show = kprobe_blacklist_seq_show, -}; - -static int kprobe_blacklist_open(struct inode *inode, struct file *filp) +static void kprobe_blacklist_seq_stop(struct seq_file *f, void *v) { - return seq_open(filp, &kprobe_blacklist_seq_ops); + mutex_unlock(&kprobe_mutex); } -static const struct file_operations debugfs_kprobe_blacklist_ops = { - .open = kprobe_blacklist_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, +static const struct seq_operations kprobe_blacklist_sops = { + .start = kprobe_blacklist_seq_start, + .next = kprobe_blacklist_seq_next, + .stop = kprobe_blacklist_seq_stop, + .show = kprobe_blacklist_seq_show, }; +DEFINE_SEQ_ATTRIBUTE(kprobe_blacklist); static int arm_all_kprobes(void) { @@ -2488,7 +2639,7 @@ static int arm_all_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; /* Arm all kprobes on a best-effort basis */ - hlist_for_each_entry_rcu(p, head, hlist) { + hlist_for_each_entry(p, head, hlist) { if (!kprobe_disabled(p)) { err = arm_kprobe(p); if (err) { @@ -2531,7 +2682,7 @@ static int disarm_all_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; /* Disarm all kprobes on a best-effort basis */ - hlist_for_each_entry_rcu(p, head, hlist) { + hlist_for_each_entry(p, head, hlist) { if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) { err = disarm_kprobe(p, false); if (err) { @@ -2622,13 +2773,12 @@ static int __init debugfs_kprobe_init(void) dir = debugfs_create_dir("kprobes", NULL); - debugfs_create_file("list", 0400, dir, NULL, - &debugfs_kprobes_operations); + debugfs_create_file("list", 0400, dir, NULL, &kprobes_fops); debugfs_create_file("enabled", 0600, dir, &value, &fops_kp); debugfs_create_file("blacklist", 0400, dir, NULL, - &debugfs_kprobe_blacklist_ops); + &kprobe_blacklist_fops); return 0; } diff --git a/kernel/kthread.c b/kernel/kthread.c index bfbfa481be3a..1d9e2fdfd67a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1,13 +1,17 @@ // SPDX-License-Identifier: GPL-2.0-only /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. + * Copyright (C) 2009 Red Hat, Inc. * * Creation is done via kthreadd, so that we get a clean environment * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.). */ #include <uapi/linux/sched/types.h> +#include <linux/mm.h> +#include <linux/mmu_context.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/kthread.h> #include <linux/completion.h> @@ -23,8 +27,10 @@ #include <linux/ptrace.h> #include <linux/uaccess.h> #include <linux/numa.h> +#include <linux/sched/isolation.h> #include <trace/events/sched.h> + static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; @@ -46,7 +52,9 @@ struct kthread_create_info struct kthread { unsigned long flags; unsigned int cpu; + int (*threadfn)(void *); void *data; + mm_segment_t oldfs; struct completion parked; struct completion exited; #ifdef CONFIG_BLK_CGROUP @@ -153,6 +161,20 @@ bool kthread_freezable_should_stop(bool *was_frozen) EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); /** + * kthread_func - return the function specified on kthread creation + * @task: kthread task in question + * + * Returns NULL if the task is not a kthread. + */ +void *kthread_func(struct task_struct *task) +{ + if (task->flags & PF_KTHREAD) + return to_kthread(task)->threadfn; + return NULL; +} +EXPORT_SYMBOL_GPL(kthread_func); + +/** * kthread_data - return data value specified on kthread creation * @task: kthread task in question * @@ -164,6 +186,7 @@ void *kthread_data(struct task_struct *task) { return to_kthread(task)->data; } +EXPORT_SYMBOL_GPL(kthread_data); /** * kthread_probe_data - speculative version of kthread_data() @@ -179,7 +202,7 @@ void *kthread_probe_data(struct task_struct *task) struct kthread *kthread = to_kthread(task); void *data = NULL; - probe_kernel_read(&data, &kthread->data, sizeof(data)); + copy_from_kernel_nofault(&data, &kthread->data, sizeof(data)); return data; } @@ -244,6 +267,7 @@ static int kthread(void *_create) do_exit(-ENOMEM); } + self->threadfn = threadfn; self->data = data; init_completion(&self->exited); init_completion(&self->parked); @@ -360,7 +384,8 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), * The kernel thread should not inherit these properties. */ sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m); - set_cpus_allowed_ptr(task, cpu_all_mask); + set_cpus_allowed_ptr(task, + housekeeping_cpumask(HK_FLAG_KTHREAD)); } kfree(create); return task; @@ -585,7 +610,7 @@ int kthreadd(void *unused) /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); - set_cpus_allowed_ptr(tsk, cpu_all_mask); + set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; @@ -1203,6 +1228,61 @@ void kthread_destroy_worker(struct kthread_worker *worker) } EXPORT_SYMBOL(kthread_destroy_worker); +/** + * kthread_use_mm - make the calling kthread operate on an address space + * @mm: address space to operate on + */ +void kthread_use_mm(struct mm_struct *mm) +{ + struct mm_struct *active_mm; + struct task_struct *tsk = current; + + WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); + WARN_ON_ONCE(tsk->mm); + + task_lock(tsk); + active_mm = tsk->active_mm; + if (active_mm != mm) { + mmgrab(mm); + tsk->active_mm = mm; + } + tsk->mm = mm; + switch_mm(active_mm, mm, tsk); + task_unlock(tsk); +#ifdef finish_arch_post_lock_switch + finish_arch_post_lock_switch(); +#endif + + if (active_mm != mm) + mmdrop(active_mm); + + to_kthread(tsk)->oldfs = get_fs(); + set_fs(USER_DS); +} +EXPORT_SYMBOL_GPL(kthread_use_mm); + +/** + * kthread_unuse_mm - reverse the effect of kthread_use_mm() + * @mm: address space to operate on + */ +void kthread_unuse_mm(struct mm_struct *mm) +{ + struct task_struct *tsk = current; + + WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); + WARN_ON_ONCE(!tsk->mm); + + set_fs(to_kthread(tsk)->oldfs); + + task_lock(tsk); + sync_mm_rss(mm); + tsk->mm = NULL; + /* active_mm is still 'mm' */ + enter_lazy_tlb(mm, tsk); + task_unlock(tsk); +} +EXPORT_SYMBOL_GPL(kthread_unuse_mm); + #ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 8d1c15832e55..166d7bf49666 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -269,8 +269,8 @@ static int __init init_lstats_procfs(void) return 0; } -int sysctl_latencytop(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int err; diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index c3512e7e0801..f76fdb925532 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -191,18 +191,21 @@ static int klp_find_object_symbol(const char *objname, const char *name, return -EINVAL; } -static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) +static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab, + unsigned int symndx, Elf_Shdr *relasec, + const char *sec_objname) { - int i, cnt, vmlinux, ret; - char objname[MODULE_NAME_LEN]; - char symname[KSYM_NAME_LEN]; - char *strtab = pmod->core_kallsyms.strtab; + int i, cnt, ret; + char sym_objname[MODULE_NAME_LEN]; + char sym_name[KSYM_NAME_LEN]; Elf_Rela *relas; Elf_Sym *sym; unsigned long sympos, addr; + bool sym_vmlinux; + bool sec_vmlinux = !strcmp(sec_objname, "vmlinux"); /* - * Since the field widths for objname and symname in the sscanf() + * Since the field widths for sym_objname and sym_name in the sscanf() * call are hard-coded and correspond to MODULE_NAME_LEN and * KSYM_NAME_LEN respectively, we must make sure that MODULE_NAME_LEN * and KSYM_NAME_LEN have the values we expect them to have. @@ -216,27 +219,40 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) relas = (Elf_Rela *) relasec->sh_addr; /* For each rela in this klp relocation section */ for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) { - sym = pmod->core_kallsyms.symtab + ELF_R_SYM(relas[i].r_info); + sym = (Elf64_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info); if (sym->st_shndx != SHN_LIVEPATCH) { pr_err("symbol %s is not marked as a livepatch symbol\n", strtab + sym->st_name); return -EINVAL; } - /* Format: .klp.sym.objname.symname,sympos */ + /* Format: .klp.sym.sym_objname.sym_name,sympos */ cnt = sscanf(strtab + sym->st_name, ".klp.sym.%55[^.].%127[^,],%lu", - objname, symname, &sympos); + sym_objname, sym_name, &sympos); if (cnt != 3) { pr_err("symbol %s has an incorrectly formatted name\n", strtab + sym->st_name); return -EINVAL; } + sym_vmlinux = !strcmp(sym_objname, "vmlinux"); + + /* + * Prevent module-specific KLP rela sections from referencing + * vmlinux symbols. This helps prevent ordering issues with + * module special section initializations. Presumably such + * symbols are exported and normal relas can be used instead. + */ + if (!sec_vmlinux && sym_vmlinux) { + pr_err("invalid access to vmlinux symbol '%s' from module-specific livepatch relocation section", + sym_name); + return -EINVAL; + } + /* klp_find_object_symbol() treats a NULL objname as vmlinux */ - vmlinux = !strcmp(objname, "vmlinux"); - ret = klp_find_object_symbol(vmlinux ? NULL : objname, - symname, sympos, &addr); + ret = klp_find_object_symbol(sym_vmlinux ? NULL : sym_objname, + sym_name, sympos, &addr); if (ret) return ret; @@ -246,54 +262,59 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) return 0; } -static int klp_write_object_relocations(struct module *pmod, - struct klp_object *obj) +/* + * At a high-level, there are two types of klp relocation sections: those which + * reference symbols which live in vmlinux; and those which reference symbols + * which live in other modules. This function is called for both types: + * + * 1) When a klp module itself loads, the module code calls this function to + * write vmlinux-specific klp relocations (.klp.rela.vmlinux.* sections). + * These relocations are written to the klp module text to allow the patched + * code/data to reference unexported vmlinux symbols. They're written as + * early as possible to ensure that other module init code (.e.g., + * jump_label_apply_nops) can access any unexported vmlinux symbols which + * might be referenced by the klp module's special sections. + * + * 2) When a to-be-patched module loads -- or is already loaded when a + * corresponding klp module loads -- klp code calls this function to write + * module-specific klp relocations (.klp.rela.{module}.* sections). These + * are written to the klp module text to allow the patched code/data to + * reference symbols which live in the to-be-patched module or one of its + * module dependencies. Exported symbols are supported, in addition to + * unexported symbols, in order to enable late module patching, which allows + * the to-be-patched module to be loaded and patched sometime *after* the + * klp module is loaded. + */ +int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, + const char *shstrtab, const char *strtab, + unsigned int symndx, unsigned int secndx, + const char *objname) { - int i, cnt, ret = 0; - const char *objname, *secname; + int cnt, ret; char sec_objname[MODULE_NAME_LEN]; - Elf_Shdr *sec; + Elf_Shdr *sec = sechdrs + secndx; - if (WARN_ON(!klp_is_object_loaded(obj))) + /* + * Format: .klp.rela.sec_objname.section_name + * See comment in klp_resolve_symbols() for an explanation + * of the selected field width value. + */ + cnt = sscanf(shstrtab + sec->sh_name, ".klp.rela.%55[^.]", + sec_objname); + if (cnt != 1) { + pr_err("section %s has an incorrectly formatted name\n", + shstrtab + sec->sh_name); return -EINVAL; + } - objname = klp_is_module(obj) ? obj->name : "vmlinux"; - - /* For each klp relocation section */ - for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) { - sec = pmod->klp_info->sechdrs + i; - secname = pmod->klp_info->secstrings + sec->sh_name; - if (!(sec->sh_flags & SHF_RELA_LIVEPATCH)) - continue; - - /* - * Format: .klp.rela.sec_objname.section_name - * See comment in klp_resolve_symbols() for an explanation - * of the selected field width value. - */ - cnt = sscanf(secname, ".klp.rela.%55[^.]", sec_objname); - if (cnt != 1) { - pr_err("section %s has an incorrectly formatted name\n", - secname); - ret = -EINVAL; - break; - } - - if (strcmp(objname, sec_objname)) - continue; - - ret = klp_resolve_symbols(sec, pmod); - if (ret) - break; + if (strcmp(objname ? objname : "vmlinux", sec_objname)) + return 0; - ret = apply_relocate_add(pmod->klp_info->sechdrs, - pmod->core_kallsyms.strtab, - pmod->klp_info->symndx, i, pmod); - if (ret) - break; - } + ret = klp_resolve_symbols(sechdrs, strtab, symndx, sec, sec_objname); + if (ret) + return ret; - return ret; + return apply_relocate_add(sechdrs, strtab, symndx, secndx, pmod); } /* @@ -724,10 +745,27 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) func->old_sympos ? func->old_sympos : 1); } -/* Arches may override this to finish any remaining arch-specific tasks */ -void __weak arch_klp_init_object_loaded(struct klp_patch *patch, - struct klp_object *obj) +static int klp_apply_object_relocs(struct klp_patch *patch, + struct klp_object *obj) { + int i, ret; + struct klp_modinfo *info = patch->mod->klp_info; + + for (i = 1; i < info->hdr.e_shnum; i++) { + Elf_Shdr *sec = info->sechdrs + i; + + if (!(sec->sh_flags & SHF_RELA_LIVEPATCH)) + continue; + + ret = klp_apply_section_relocs(patch->mod, info->sechdrs, + info->secstrings, + patch->mod->core_kallsyms.strtab, + info->symndx, i, obj->name); + if (ret) + return ret; + } + + return 0; } /* parts of the initialization that is done only when the object is loaded */ @@ -737,21 +775,18 @@ static int klp_init_object_loaded(struct klp_patch *patch, struct klp_func *func; int ret; - mutex_lock(&text_mutex); - - module_disable_ro(patch->mod); - ret = klp_write_object_relocations(patch->mod, obj); - if (ret) { - module_enable_ro(patch->mod, true); - mutex_unlock(&text_mutex); - return ret; + if (klp_is_module(obj)) { + /* + * Only write module-specific relocations here + * (.klp.rela.{module}.*). vmlinux-specific relocations were + * written earlier during the initialization of the klp module + * itself. + */ + ret = klp_apply_object_relocs(patch, obj); + if (ret) + return ret; } - arch_klp_init_object_loaded(patch, obj); - module_enable_ro(patch->mod, true); - - mutex_unlock(&text_mutex); - klp_for_each_func(obj, func) { ret = klp_find_object_symbol(obj->name, func->old_name, func->old_sympos, @@ -1139,6 +1174,11 @@ int klp_module_coming(struct module *mod) if (WARN_ON(mod->state != MODULE_STATE_COMING)) return -EINVAL; + if (!strcmp(mod->name, "vmlinux")) { + pr_err("vmlinux.ko: invalid module name"); + return -EINVAL; + } + mutex_lock(&klp_mutex); /* * Each module has to know that klp_module_coming() diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 45452facff3b..6d11cfb9b41f 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -5,6 +5,9 @@ KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o +# Avoid recursion lockdep -> KCSAN -> ... -> lockdep. +KCSAN_SANITIZE_lockdep.o := n + ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index ac10db66cc63..2fad21d345b0 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -393,28 +393,9 @@ void lockdep_init_task(struct task_struct *task) task->lockdep_recursion = 0; } -/* - * Split the recrursion counter in two to readily detect 'off' vs recursion. - */ -#define LOCKDEP_RECURSION_BITS 16 -#define LOCKDEP_OFF (1U << LOCKDEP_RECURSION_BITS) -#define LOCKDEP_RECURSION_MASK (LOCKDEP_OFF - 1) - -void lockdep_off(void) -{ - current->lockdep_recursion += LOCKDEP_OFF; -} -EXPORT_SYMBOL(lockdep_off); - -void lockdep_on(void) +static __always_inline void lockdep_recursion_finish(void) { - current->lockdep_recursion -= LOCKDEP_OFF; -} -EXPORT_SYMBOL(lockdep_on); - -static inline void lockdep_recursion_finish(void) -{ - if (WARN_ON_ONCE(--current->lockdep_recursion)) + if (WARN_ON_ONCE((--current->lockdep_recursion) & LOCKDEP_RECURSION_MASK)) current->lockdep_recursion = 0; } @@ -489,7 +470,7 @@ struct lock_trace { struct hlist_node hash_entry; u32 hash; u32 nr_entries; - unsigned long entries[0] __aligned(sizeof(unsigned long)); + unsigned long entries[] __aligned(sizeof(unsigned long)); }; #define LOCK_TRACE_SIZE_IN_LONGS \ (sizeof(struct lock_trace) / sizeof(unsigned long)) @@ -820,7 +801,7 @@ static int count_matching_names(struct lock_class *new_class) } /* used from NMI context -- must be lockless */ -static inline struct lock_class * +static __always_inline struct lock_class * look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; @@ -1742,7 +1723,7 @@ static int noop_count(struct lock_list *entry, void *data) static unsigned long __lockdep_count_forward_deps(struct lock_list *this) { unsigned long count = 0; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; __bfs_forwards(this, (void *)&count, noop_count, &target_entry); @@ -1768,7 +1749,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) static unsigned long __lockdep_count_backward_deps(struct lock_list *this) { unsigned long count = 0; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; __bfs_backwards(this, (void *)&count, noop_count, &target_entry); @@ -1823,7 +1804,7 @@ check_noncircular(struct held_lock *src, struct held_lock *target, struct lock_trace **const trace) { int ret; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; struct lock_list src_entry = { .class = hlock_class(src), .parent = NULL, @@ -1861,7 +1842,7 @@ static noinline int check_redundant(struct held_lock *src, struct held_lock *target) { int ret; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; struct lock_list src_entry = { .class = hlock_class(src), .parent = NULL, @@ -2081,9 +2062,9 @@ print_bad_irq_dependency(struct task_struct *curr, pr_warn("-----------------------------------------------------\n"); pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), - curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, + lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT, curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, - curr->hardirqs_enabled, + lockdep_hardirqs_enabled(), curr->softirqs_enabled); print_lock(next); @@ -2263,8 +2244,8 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, { unsigned long usage_mask = 0, forward_mask, backward_mask; enum lock_usage_bit forward_bit = 0, backward_bit = 0; - struct lock_list *uninitialized_var(target_entry1); - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry1; + struct lock_list *target_entry; struct lock_list this, that; int ret; @@ -3350,9 +3331,9 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", curr->comm, task_pid_nr(curr), - lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, + lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT, lockdep_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, - lockdep_hardirqs_enabled(curr), + lockdep_hardirqs_enabled(), lockdep_softirqs_enabled(curr)); print_lock(this); @@ -3457,7 +3438,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, { int ret; struct lock_list root; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; root.parent = NULL; root.class = hlock_class(this); @@ -3484,7 +3465,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, { int ret; struct lock_list root; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; root.parent = NULL; root.class = hlock_class(this); @@ -3503,19 +3484,21 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, void print_irqtrace_events(struct task_struct *curr) { - printk("irq event stamp: %u\n", curr->irq_events); + const struct irqtrace_events *trace = &curr->irqtrace; + + printk("irq event stamp: %u\n", trace->irq_events); printk("hardirqs last enabled at (%u): [<%px>] %pS\n", - curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip, - (void *)curr->hardirq_enable_ip); + trace->hardirq_enable_event, (void *)trace->hardirq_enable_ip, + (void *)trace->hardirq_enable_ip); printk("hardirqs last disabled at (%u): [<%px>] %pS\n", - curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip, - (void *)curr->hardirq_disable_ip); + trace->hardirq_disable_event, (void *)trace->hardirq_disable_ip, + (void *)trace->hardirq_disable_ip); printk("softirqs last enabled at (%u): [<%px>] %pS\n", - curr->softirq_enable_event, (void *)curr->softirq_enable_ip, - (void *)curr->softirq_enable_ip); + trace->softirq_enable_event, (void *)trace->softirq_enable_ip, + (void *)trace->softirq_enable_ip); printk("softirqs last disabled at (%u): [<%px>] %pS\n", - curr->softirq_disable_event, (void *)curr->softirq_disable_ip, - (void *)curr->softirq_disable_ip); + trace->softirq_disable_event, (void *)trace->softirq_disable_ip, + (void *)trace->softirq_disable_ip); } static int HARDIRQ_verbose(struct lock_class *class) @@ -3635,13 +3618,10 @@ mark_held_locks(struct task_struct *curr, enum lock_usage_bit base_bit) /* * Hardirqs will be enabled: */ -static void __trace_hardirqs_on_caller(unsigned long ip) +static void __trace_hardirqs_on_caller(void) { struct task_struct *curr = current; - /* we'll do an OFF -> ON transition: */ - curr->hardirqs_enabled = 1; - /* * We are going to turn hardirqs on, so set the * usage bit for all held locks: @@ -3654,20 +3634,33 @@ static void __trace_hardirqs_on_caller(unsigned long ip) * this bit from being set before) */ if (curr->softirqs_enabled) - if (!mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ)) - return; - - curr->hardirq_enable_ip = ip; - curr->hardirq_enable_event = ++curr->irq_events; - debug_atomic_inc(hardirqs_on_events); + mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); } -void lockdep_hardirqs_on(unsigned long ip) +/** + * lockdep_hardirqs_on_prepare - Prepare for enabling interrupts + * @ip: Caller address + * + * Invoked before a possible transition to RCU idle from exit to user or + * guest mode. This ensures that all RCU operations are done before RCU + * stops watching. After the RCU transition lockdep_hardirqs_on() has to be + * invoked to set the final state. + */ +void lockdep_hardirqs_on_prepare(unsigned long ip) { - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!debug_locks)) return; - if (unlikely(current->hardirqs_enabled)) { + /* + * NMIs do not (and cannot) track lock dependencies, nothing to do. + */ + if (unlikely(in_nmi())) + return; + + if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) + return; + + if (unlikely(lockdep_hardirqs_enabled())) { /* * Neither irq nor preemption are disabled here * so this is racy by nature but losing one hit @@ -3695,23 +3688,98 @@ void lockdep_hardirqs_on(unsigned long ip) * Can't allow enabling interrupts while in an interrupt handler, * that's general bad form and such. Recursion, limited stack etc.. */ - if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context())) return; + current->hardirq_chain_key = current->curr_chain_key; + current->lockdep_recursion++; - __trace_hardirqs_on_caller(ip); + __trace_hardirqs_on_caller(); lockdep_recursion_finish(); } -NOKPROBE_SYMBOL(lockdep_hardirqs_on); +EXPORT_SYMBOL_GPL(lockdep_hardirqs_on_prepare); + +void noinstr lockdep_hardirqs_on(unsigned long ip) +{ + struct irqtrace_events *trace = ¤t->irqtrace; + + if (unlikely(!debug_locks)) + return; + + /* + * NMIs can happen in the middle of local_irq_{en,dis}able() where the + * tracking state and hardware state are out of sync. + * + * NMIs must save lockdep_hardirqs_enabled() to restore IRQ state from, + * and not rely on hardware state like normal interrupts. + */ + if (unlikely(in_nmi())) { + if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI)) + return; + + /* + * Skip: + * - recursion check, because NMI can hit lockdep; + * - hardware state check, because above; + * - chain_key check, see lockdep_hardirqs_on_prepare(). + */ + goto skip_checks; + } + + if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) + return; + + if (lockdep_hardirqs_enabled()) { + /* + * Neither irq nor preemption are disabled here + * so this is racy by nature but losing one hit + * in a stat is not a big deal. + */ + __debug_atomic_inc(redundant_hardirqs_on); + return; + } + + /* + * We're enabling irqs and according to our state above irqs weren't + * already enabled, yet we find the hardware thinks they are in fact + * enabled.. someone messed up their IRQ state tracing. + */ + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + /* + * Ensure the lock stack remained unchanged between + * lockdep_hardirqs_on_prepare() and lockdep_hardirqs_on(). + */ + DEBUG_LOCKS_WARN_ON(current->hardirq_chain_key != + current->curr_chain_key); + +skip_checks: + /* we'll do an OFF -> ON transition: */ + this_cpu_write(hardirqs_enabled, 1); + trace->hardirq_enable_ip = ip; + trace->hardirq_enable_event = ++trace->irq_events; + debug_atomic_inc(hardirqs_on_events); +} +EXPORT_SYMBOL_GPL(lockdep_hardirqs_on); /* * Hardirqs were disabled: */ -void lockdep_hardirqs_off(unsigned long ip) +void noinstr lockdep_hardirqs_off(unsigned long ip) { - struct task_struct *curr = current; + if (unlikely(!debug_locks)) + return; - if (unlikely(!debug_locks || current->lockdep_recursion)) + /* + * Matching lockdep_hardirqs_on(), allow NMIs in the middle of lockdep; + * they will restore the software state. This ensures the software + * state is consistent inside NMIs as well. + */ + if (in_nmi()) { + if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI)) + return; + } else if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK) return; /* @@ -3721,25 +3789,28 @@ void lockdep_hardirqs_off(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->hardirqs_enabled) { + if (lockdep_hardirqs_enabled()) { + struct irqtrace_events *trace = ¤t->irqtrace; + /* * We have done an ON -> OFF transition: */ - curr->hardirqs_enabled = 0; - curr->hardirq_disable_ip = ip; - curr->hardirq_disable_event = ++curr->irq_events; + this_cpu_write(hardirqs_enabled, 0); + trace->hardirq_disable_ip = ip; + trace->hardirq_disable_event = ++trace->irq_events; debug_atomic_inc(hardirqs_off_events); - } else + } else { debug_atomic_inc(redundant_hardirqs_off); + } } -NOKPROBE_SYMBOL(lockdep_hardirqs_off); +EXPORT_SYMBOL_GPL(lockdep_hardirqs_off); /* * Softirqs will be enabled: */ void lockdep_softirqs_on(unsigned long ip) { - struct task_struct *curr = current; + struct irqtrace_events *trace = ¤t->irqtrace; if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -3751,7 +3822,7 @@ void lockdep_softirqs_on(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->softirqs_enabled) { + if (current->softirqs_enabled) { debug_atomic_inc(redundant_softirqs_on); return; } @@ -3760,17 +3831,17 @@ void lockdep_softirqs_on(unsigned long ip) /* * We'll do an OFF -> ON transition: */ - curr->softirqs_enabled = 1; - curr->softirq_enable_ip = ip; - curr->softirq_enable_event = ++curr->irq_events; + current->softirqs_enabled = 1; + trace->softirq_enable_ip = ip; + trace->softirq_enable_event = ++trace->irq_events; debug_atomic_inc(softirqs_on_events); /* * We are going to turn softirqs on, so set the * usage bit for all held locks, if hardirqs are * enabled too: */ - if (curr->hardirqs_enabled) - mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); + if (lockdep_hardirqs_enabled()) + mark_held_locks(current, LOCK_ENABLED_SOFTIRQ); lockdep_recursion_finish(); } @@ -3779,8 +3850,6 @@ void lockdep_softirqs_on(unsigned long ip) */ void lockdep_softirqs_off(unsigned long ip) { - struct task_struct *curr = current; - if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -3790,13 +3859,15 @@ void lockdep_softirqs_off(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->softirqs_enabled) { + if (current->softirqs_enabled) { + struct irqtrace_events *trace = ¤t->irqtrace; + /* * We have done an ON -> OFF transition: */ - curr->softirqs_enabled = 0; - curr->softirq_disable_ip = ip; - curr->softirq_disable_event = ++curr->irq_events; + current->softirqs_enabled = 0; + trace->softirq_disable_ip = ip; + trace->softirq_disable_event = ++trace->irq_events; debug_atomic_inc(softirqs_off_events); /* * Whoops, we wanted softirqs off, so why aren't they? @@ -3818,7 +3889,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) */ if (!hlock->trylock) { if (hlock->read) { - if (curr->hardirq_context) + if (lockdep_hardirq_context()) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ_READ)) return 0; @@ -3827,7 +3898,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) LOCK_USED_IN_SOFTIRQ_READ)) return 0; } else { - if (curr->hardirq_context) + if (lockdep_hardirq_context()) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) return 0; if (curr->softirq_context) @@ -3865,7 +3936,7 @@ lock_used: static inline unsigned int task_irq_context(struct task_struct *task) { - return LOCK_CHAIN_HARDIRQ_CONTEXT * !!task->hardirq_context + + return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context() + LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context; } @@ -3958,7 +4029,7 @@ static inline short task_wait_context(struct task_struct *curr) * Set appropriate wait type for the context; for IRQs we have to take * into account force_irqthread as that is implied by PREEMPT_RT. */ - if (curr->hardirq_context) { + if (lockdep_hardirq_context()) { /* * Check if force_irqthreads will run us threaded. */ @@ -4399,7 +4470,7 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); pr_cont(") at:\n"); - print_ip_sym(ip); + print_ip_sym(KERN_WARNING, ip); pr_warn("but there are no more locks to release!\n"); pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); @@ -4408,8 +4479,8 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, dump_stack(); } -static int match_held_lock(const struct held_lock *hlock, - const struct lockdep_map *lock) +static noinstr int match_held_lock(const struct held_lock *hlock, + const struct lockdep_map *lock) { if (hlock->instance == lock) return 1; @@ -4696,7 +4767,7 @@ __lock_release(struct lockdep_map *lock, unsigned long ip) return 0; } -static nokprobe_inline +static __always_inline int __lock_is_held(const struct lockdep_map *lock, int read) { struct task_struct *curr = current; @@ -4801,11 +4872,11 @@ static void check_flags(unsigned long flags) return; if (irqs_disabled_flags(flags)) { - if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) { + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled())) { printk("possible reason: unannotated irqs-off.\n"); } } else { - if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) { + if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled())) { printk("possible reason: unannotated irqs-on.\n"); } } @@ -4956,7 +5027,7 @@ void lock_release(struct lockdep_map *lock, unsigned long ip) } EXPORT_SYMBOL_GPL(lock_release); -int lock_is_held_type(const struct lockdep_map *lock, int read) +noinstr int lock_is_held_type(const struct lockdep_map *lock, int read) { unsigned long flags; int ret = 0; @@ -5050,7 +5121,7 @@ static void print_lock_contention_bug(struct task_struct *curr, curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); pr_cont(") at:\n"); - print_ip_sym(ip); + print_ip_sym(KERN_WARNING, ip); pr_warn("but there are no locks held!\n"); pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); @@ -5826,9 +5897,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n", !rcu_lockdep_current_cpu_online() ? "RCU used illegally from offline CPU!\n" - : !rcu_is_watching() - ? "RCU used illegally from idle CPU!\n" - : "", + : "", rcu_scheduler_active, debug_locks); /* diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 5efbfc68ce99..9cfa5e89cff7 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -436,8 +436,6 @@ static int torture_rtmutex_lock(void) __acquires(torture_rtmutex) static void torture_rtmutex_boost(struct torture_random_state *trsp) { - int policy; - struct sched_param param; const unsigned int factor = 50000; /* yes, quite arbitrary */ if (!rt_task(current)) { @@ -448,8 +446,7 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp) */ if (trsp && !(torture_random(trsp) % (cxt.nrealwriters_stress * factor))) { - policy = SCHED_FIFO; - param.sched_priority = MAX_RT_PRIO - 1; + sched_set_fifo(current); } else /* common case, do nothing */ return; } else { @@ -462,13 +459,10 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp) */ if (!trsp || !(torture_random(trsp) % (cxt.nrealwriters_stress * factor * 2))) { - policy = SCHED_NORMAL; - param.sched_priority = 0; + sched_set_normal(current, 0); } else /* common case, do nothing */ return; } - - sched_setscheduler_nocheck(current, policy, ¶m); } static void torture_rtmutex_delay(struct torture_random_state *trsp) @@ -631,13 +625,13 @@ static int lock_torture_writer(void *arg) cxt.cur_ops->writelock(); if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_lock_fail++; - lock_is_write_held = 1; + lock_is_write_held = true; if (WARN_ON_ONCE(lock_is_read_held)) lwsp->n_lock_fail++; /* rare, but... */ lwsp->n_lock_acquired++; cxt.cur_ops->write_delay(&rand); - lock_is_write_held = 0; + lock_is_write_held = false; cxt.cur_ops->writeunlock(); stutter_wait("lock_torture_writer"); @@ -665,13 +659,13 @@ static int lock_torture_reader(void *arg) schedule_timeout_uninterruptible(1); cxt.cur_ops->readlock(); - lock_is_read_held = 1; + lock_is_read_held = true; if (WARN_ON_ONCE(lock_is_write_held)) lrsp->n_lock_fail++; /* rare, but... */ lrsp->n_lock_acquired++; cxt.cur_ops->read_delay(&rand); - lock_is_read_held = 0; + lock_is_read_held = false; cxt.cur_ops->readunlock(); stutter_wait("lock_torture_reader"); @@ -686,7 +680,7 @@ static int lock_torture_reader(void *arg) static void __torture_print_stats(char *page, struct lock_stress_stats *statp, bool write) { - bool fail = 0; + bool fail = false; int i, n_stress; long max = 0, min = statp ? statp[0].n_lock_acquired : 0; long long sum = 0; @@ -904,7 +898,7 @@ static int __init lock_torture_init(void) /* Initialize the statistics so that each run gets its own numbers. */ if (nwriters_stress) { - lock_is_write_held = 0; + lock_is_write_held = false; cxt.lwsa = kmalloc_array(cxt.nrealwriters_stress, sizeof(*cxt.lwsa), GFP_KERNEL); @@ -935,7 +929,7 @@ static int __init lock_torture_init(void) } if (nreaders_stress) { - lock_is_read_held = 0; + lock_is_read_held = false; cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress, sizeof(*cxt.lrsa), GFP_KERNEL); diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 1f7734949ac8..1de006ed3aa8 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -154,7 +154,11 @@ bool osq_lock(struct optimistic_spin_queue *lock) */ for (;;) { - if (prev->next == node && + /* + * cpu_relax() below implies a compiler barrier which would + * prevent this comparison being optimized away. + */ + if (data_race(prev->next) == node && cmpxchg(&prev->next, node, NULL) == node) break; diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index b9515fcc9b29..cbff6ba53d56 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -581,4 +581,11 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath); #include "qspinlock_paravirt.h" #include "qspinlock.c" +bool nopvspin __initdata; +static __init int parse_nopvspin(char *arg) +{ + nopvspin = true; + return 0; +} +early_param("nopvspin", parse_nopvspin); #endif diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index fd4fe1f5b458..36e69100e8e0 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -125,7 +125,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task_pid_nr(task)); - show_stack(task, NULL); + show_stack(task, NULL, KERN_DEFAULT); printk("\n%s/%d's [current] stackdump:\n\n", current->comm, task_pid_nr(current)); dump_stack(); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index c9f090d64f00..cfdd5b93264d 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -141,7 +141,6 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) * set up. */ #ifndef CONFIG_DEBUG_RT_MUTEXES -# define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c) # define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c) # define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c) @@ -202,7 +201,6 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, } #else -# define rt_mutex_cmpxchg_relaxed(l,c,n) (0) # define rt_mutex_cmpxchg_acquire(l,c,n) (0) # define rt_mutex_cmpxchg_release(l,c,n) (0) diff --git a/kernel/module.c b/kernel/module.c index 646f1e2330d2..e7b4ff7e4fd0 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4,6 +4,9 @@ Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. */ + +#define INCLUDE_VERMAGIC + #include <linux/export.h> #include <linux/extable.h> #include <linux/moduleloader.h> @@ -1507,8 +1510,7 @@ static inline bool sect_empty(const Elf_Shdr *sect) } struct module_sect_attr { - struct module_attribute mattr; - char *name; + struct bin_attribute battr; unsigned long address; }; @@ -1518,13 +1520,18 @@ struct module_sect_attrs { struct module_sect_attr attrs[]; }; -static ssize_t module_sect_show(struct module_attribute *mattr, - struct module_kobject *mk, char *buf) +static ssize_t module_sect_read(struct file *file, struct kobject *kobj, + struct bin_attribute *battr, + char *buf, loff_t pos, size_t count) { struct module_sect_attr *sattr = - container_of(mattr, struct module_sect_attr, mattr); - return sprintf(buf, "0x%px\n", kptr_restrict < 2 ? - (void *)sattr->address : NULL); + container_of(battr, struct module_sect_attr, battr); + + if (pos != 0) + return -EINVAL; + + return sprintf(buf, "0x%px\n", + kallsyms_show_value(file->f_cred) ? (void *)sattr->address : NULL); } static void free_sect_attrs(struct module_sect_attrs *sect_attrs) @@ -1532,7 +1539,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs) unsigned int section; for (section = 0; section < sect_attrs->nsections; section++) - kfree(sect_attrs->attrs[section].name); + kfree(sect_attrs->attrs[section].battr.attr.name); kfree(sect_attrs); } @@ -1541,42 +1548,41 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info) unsigned int nloaded = 0, i, size[2]; struct module_sect_attrs *sect_attrs; struct module_sect_attr *sattr; - struct attribute **gattr; + struct bin_attribute **gattr; /* Count loaded sections and allocate structures */ for (i = 0; i < info->hdr->e_shnum; i++) if (!sect_empty(&info->sechdrs[i])) nloaded++; size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded), - sizeof(sect_attrs->grp.attrs[0])); - size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]); + sizeof(sect_attrs->grp.bin_attrs[0])); + size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]); sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); if (sect_attrs == NULL) return; /* Setup section attributes. */ sect_attrs->grp.name = "sections"; - sect_attrs->grp.attrs = (void *)sect_attrs + size[0]; + sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0]; sect_attrs->nsections = 0; sattr = §_attrs->attrs[0]; - gattr = §_attrs->grp.attrs[0]; + gattr = §_attrs->grp.bin_attrs[0]; for (i = 0; i < info->hdr->e_shnum; i++) { Elf_Shdr *sec = &info->sechdrs[i]; if (sect_empty(sec)) continue; + sysfs_bin_attr_init(&sattr->battr); sattr->address = sec->sh_addr; - sattr->name = kstrdup(info->secstrings + sec->sh_name, - GFP_KERNEL); - if (sattr->name == NULL) + sattr->battr.attr.name = + kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); + if (sattr->battr.attr.name == NULL) goto out; sect_attrs->nsections++; - sysfs_attr_init(&sattr->mattr.attr); - sattr->mattr.show = module_sect_show; - sattr->mattr.store = NULL; - sattr->mattr.attr.name = sattr->name; - sattr->mattr.attr.mode = S_IRUSR; - *(gattr++) = &(sattr++)->mattr.attr; + sattr->battr.read = module_sect_read; + sattr->battr.size = 3 /* "0x", "\n" */ + (BITS_PER_LONG / 4); + sattr->battr.attr.mode = 0400; + *(gattr++) = &(sattr++)->battr; } *gattr = NULL; @@ -1666,7 +1672,7 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info) continue; if (info->sechdrs[i].sh_type == SHT_NOTE) { sysfs_bin_attr_init(nattr); - nattr->attr.name = mod->sect_attrs->attrs[loaded].name; + nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name; nattr->attr.mode = S_IRUGO; nattr->size = info->sechdrs[i].sh_size; nattr->private = (void *) info->sechdrs[i].sh_addr; @@ -1943,7 +1949,6 @@ static void mod_sysfs_teardown(struct module *mod) mod_sysfs_fini(mod); } -#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX /* * LKM RO/NX protection: protect module's text/ro-data * from modification and any data from execution. @@ -1957,6 +1962,14 @@ static void mod_sysfs_teardown(struct module *mod) * * These values are always page-aligned (as is base) */ + +/* + * Since some arches are moving towards PAGE_KERNEL module allocations instead + * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() outside of the + * CONFIG_STRICT_MODULE_RWX block below because they are needed regardless of + * whether we are strict. + */ +#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX static void frob_text(const struct module_layout *layout, int (*set_memory)(unsigned long start, int num_pages)) { @@ -1966,6 +1979,15 @@ static void frob_text(const struct module_layout *layout, layout->text_size >> PAGE_SHIFT); } +static void module_enable_x(const struct module *mod) +{ + frob_text(&mod->core_layout, set_memory_x); + frob_text(&mod->init_layout, set_memory_x); +} +#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ +static void module_enable_x(const struct module *mod) { } +#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ + #ifdef CONFIG_STRICT_MODULE_RWX static void frob_rodata(const struct module_layout *layout, int (*set_memory)(unsigned long start, int num_pages)) @@ -1997,20 +2019,7 @@ static void frob_writable_data(const struct module_layout *layout, (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); } -/* livepatching wants to disable read-only so it can frob module. */ -void module_disable_ro(const struct module *mod) -{ - if (!rodata_enabled) - return; - - frob_text(&mod->core_layout, set_memory_rw); - frob_rodata(&mod->core_layout, set_memory_rw); - frob_ro_after_init(&mod->core_layout, set_memory_rw); - frob_text(&mod->init_layout, set_memory_rw); - frob_rodata(&mod->init_layout, set_memory_rw); -} - -void module_enable_ro(const struct module *mod, bool after_init) +static void module_enable_ro(const struct module *mod, bool after_init) { if (!rodata_enabled) return; @@ -2036,19 +2045,29 @@ static void module_enable_nx(const struct module *mod) frob_writable_data(&mod->init_layout, set_memory_nx); } +static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod) +{ + const unsigned long shf_wx = SHF_WRITE|SHF_EXECINSTR; + int i; + + for (i = 0; i < hdr->e_shnum; i++) { + if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) + return -ENOEXEC; + } + + return 0; +} + #else /* !CONFIG_STRICT_MODULE_RWX */ static void module_enable_nx(const struct module *mod) { } -#endif /* CONFIG_STRICT_MODULE_RWX */ -static void module_enable_x(const struct module *mod) +static void module_enable_ro(const struct module *mod, bool after_init) {} +static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod) { - frob_text(&mod->core_layout, set_memory_x); - frob_text(&mod->init_layout, set_memory_x); + return 0; } -#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ -static void module_enable_nx(const struct module *mod) { } -static void module_enable_x(const struct module *mod) { } -#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ - +#endif /* CONFIG_STRICT_MODULE_RWX */ #ifdef CONFIG_LIVEPATCH /* @@ -2334,11 +2353,13 @@ static int apply_relocations(struct module *mod, const struct load_info *info) if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC)) continue; - /* Livepatch relocation sections are applied by livepatch */ if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH) - continue; - - if (info->sechdrs[i].sh_type == SHT_REL) + err = klp_apply_section_relocs(mod, info->sechdrs, + info->secstrings, + info->strtab, + info->index.sym, i, + NULL); + else if (info->sechdrs[i].sh_type == SHT_REL) err = apply_relocate(info->sechdrs, info->strtab, info->index.sym, i, mod); else if (info->sechdrs[i].sh_type == SHT_RELA) @@ -2400,7 +2421,7 @@ static void layout_sections(struct module *mod, struct load_info *info) if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || strstarts(sname, ".init")) + || module_init_section(sname)) continue; s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i); pr_debug("\t%s\n", sname); @@ -2433,7 +2454,7 @@ static void layout_sections(struct module *mod, struct load_info *info) if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || !strstarts(sname, ".init")) + || !module_init_section(sname)) continue; s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i) | INIT_OFFSET_MASK); @@ -2765,7 +2786,14 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) void * __weak module_alloc(unsigned long size) { - return vmalloc_exec(size); + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, + NUMA_NO_NODE, __builtin_return_address(0)); +} + +bool __weak module_init_section(const char *name) +{ + return strstarts(name, ".init"); } bool __weak module_exit_section(const char *name) @@ -2946,8 +2974,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, return err; /* Suck in entire file: we'll want most of it. */ - info->hdr = __vmalloc(info->len, - GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL); + info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN); if (!info->hdr) return -ENOMEM; @@ -3150,6 +3177,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) } #endif + mod->noinstr_text_start = section_objs(info, ".noinstr.text", 1, + &mod->noinstr_text_size); + #ifdef CONFIG_TRACEPOINTS mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs", sizeof(*mod->tracepoints_ptrs), @@ -3194,13 +3224,20 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->ei_funcs), &mod->num_ei_funcs); #endif +#ifdef CONFIG_KPROBES + mod->kprobes_text_start = section_objs(info, ".kprobes.text", 1, + &mod->kprobes_text_size); + mod->kprobe_blacklist = section_objs(info, "_kprobe_blacklist", + sizeof(unsigned long), + &mod->num_kprobe_blacklist); +#endif mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); if (section_addr(info, "__obsparm")) pr_warn("%s: Ignoring obsolete parameters\n", mod->name); - info->debug = section_objs(info, "__verbose", + info->debug = section_objs(info, "__dyndbg", sizeof(*info->debug), &info->num_debug); return 0; @@ -3312,12 +3349,6 @@ static int check_module_license_and_versions(struct module *mod) static void flush_module_icache(const struct module *mod) { - mm_segment_t old_fs; - - /* flush the icache in correct context */ - old_fs = get_fs(); - set_fs(KERNEL_DS); - /* * Flush the instruction cache, since we've played with text. * Do it before processing of module parameters, so the module @@ -3329,8 +3360,6 @@ static void flush_module_icache(const struct module *mod) + mod->init_layout.size); flush_icache_range((unsigned long)mod->core_layout.base, (unsigned long)mod->core_layout.base + mod->core_layout.size); - - set_fs(old_fs); } int __weak module_frob_arch_sections(Elf_Ehdr *hdr, @@ -3378,6 +3407,11 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) if (err < 0) return ERR_PTR(err); + err = module_enforce_rwx_sections(info->hdr, info->sechdrs, + info->secstrings, info->mod); + if (err < 0) + return ERR_PTR(err); + /* We will do a special allocation for per-cpu sections later. */ info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC; @@ -4348,7 +4382,7 @@ static int modules_open(struct inode *inode, struct file *file) if (!err) { struct seq_file *m = file->private_data; - m->private = kallsyms_show_value() ? NULL : (void *)8ul; + m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul; } return err; diff --git a/kernel/notifier.c b/kernel/notifier.c index 5989bbb93039..84c987dfbe03 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -519,7 +519,6 @@ NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { - vmalloc_sync_mappings(); return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index ed9882108cd2..12dd41b39a7f 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -19,6 +19,8 @@ #include <net/net_namespace.h> #include <linux/ipc_namespace.h> #include <linux/time_namespace.h> +#include <linux/fs_struct.h> +#include <linux/proc_fs.h> #include <linux/proc_ns.h> #include <linux/file.h> #include <linux/syscalls.h> @@ -257,37 +259,313 @@ void exit_task_namespaces(struct task_struct *p) switch_task_namespaces(p, NULL); } -SYSCALL_DEFINE2(setns, int, fd, int, nstype) +static int check_setns_flags(unsigned long flags) { - struct task_struct *tsk = current; - struct nsproxy *new_nsproxy; - struct file *file; - struct ns_common *ns; - int err; + if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | + CLONE_NEWNET | CLONE_NEWTIME | CLONE_NEWUSER | + CLONE_NEWPID | CLONE_NEWCGROUP))) + return -EINVAL; + +#ifndef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) + return -EINVAL; +#endif +#ifndef CONFIG_PID_NS + if (flags & CLONE_NEWPID) + return -EINVAL; +#endif +#ifndef CONFIG_UTS_NS + if (flags & CLONE_NEWUTS) + return -EINVAL; +#endif +#ifndef CONFIG_IPC_NS + if (flags & CLONE_NEWIPC) + return -EINVAL; +#endif +#ifndef CONFIG_CGROUPS + if (flags & CLONE_NEWCGROUP) + return -EINVAL; +#endif +#ifndef CONFIG_NET_NS + if (flags & CLONE_NEWNET) + return -EINVAL; +#endif +#ifndef CONFIG_TIME_NS + if (flags & CLONE_NEWTIME) + return -EINVAL; +#endif - file = proc_ns_fget(fd); - if (IS_ERR(file)) - return PTR_ERR(file); + return 0; +} - err = -EINVAL; - ns = get_proc_ns(file_inode(file)); - if (nstype && (ns->ops->type != nstype)) - goto out; +static void put_nsset(struct nsset *nsset) +{ + unsigned flags = nsset->flags; + + if (flags & CLONE_NEWUSER) + put_cred(nsset_cred(nsset)); + /* + * We only created a temporary copy if we attached to more than just + * the mount namespace. + */ + if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) + free_fs_struct(nsset->fs); + if (nsset->nsproxy) + free_nsproxy(nsset->nsproxy); +} + +static int prepare_nsset(unsigned flags, struct nsset *nsset) +{ + struct task_struct *me = current; - new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); - if (IS_ERR(new_nsproxy)) { - err = PTR_ERR(new_nsproxy); + nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs); + if (IS_ERR(nsset->nsproxy)) + return PTR_ERR(nsset->nsproxy); + + if (flags & CLONE_NEWUSER) + nsset->cred = prepare_creds(); + else + nsset->cred = current_cred(); + if (!nsset->cred) goto out; + + /* Only create a temporary copy of fs_struct if we really need to. */ + if (flags == CLONE_NEWNS) { + nsset->fs = me->fs; + } else if (flags & CLONE_NEWNS) { + nsset->fs = copy_fs_struct(me->fs); + if (!nsset->fs) + goto out; } - err = ns->ops->install(new_nsproxy, ns); - if (err) { - free_nsproxy(new_nsproxy); - goto out; + nsset->flags = flags; + return 0; + +out: + put_nsset(nsset); + return -ENOMEM; +} + +static inline int validate_ns(struct nsset *nsset, struct ns_common *ns) +{ + return ns->ops->install(nsset, ns); +} + +/* + * This is the inverse operation to unshare(). + * Ordering is equivalent to the standard ordering used everywhere else + * during unshare and process creation. The switch to the new set of + * namespaces occurs at the point of no return after installation of + * all requested namespaces was successful in commit_nsset(). + */ +static int validate_nsset(struct nsset *nsset, struct pid *pid) +{ + int ret = 0; + unsigned flags = nsset->flags; + struct user_namespace *user_ns = NULL; + struct pid_namespace *pid_ns = NULL; + struct nsproxy *nsp; + struct task_struct *tsk; + + /* Take a "snapshot" of the target task's namespaces. */ + rcu_read_lock(); + tsk = pid_task(pid, PIDTYPE_PID); + if (!tsk) { + rcu_read_unlock(); + return -ESRCH; } - switch_task_namespaces(tsk, new_nsproxy); - perf_event_namespaces(tsk); + if (!ptrace_may_access(tsk, PTRACE_MODE_READ_REALCREDS)) { + rcu_read_unlock(); + return -EPERM; + } + + task_lock(tsk); + nsp = tsk->nsproxy; + if (nsp) + get_nsproxy(nsp); + task_unlock(tsk); + if (!nsp) { + rcu_read_unlock(); + return -ESRCH; + } + +#ifdef CONFIG_PID_NS + if (flags & CLONE_NEWPID) { + pid_ns = task_active_pid_ns(tsk); + if (unlikely(!pid_ns)) { + rcu_read_unlock(); + ret = -ESRCH; + goto out; + } + get_pid_ns(pid_ns); + } +#endif + +#ifdef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) + user_ns = get_user_ns(__task_cred(tsk)->user_ns); +#endif + rcu_read_unlock(); + + /* + * Install requested namespaces. The caller will have + * verified earlier that the requested namespaces are + * supported on this kernel. We don't report errors here + * if a namespace is requested that isn't supported. + */ +#ifdef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) { + ret = validate_ns(nsset, &user_ns->ns); + if (ret) + goto out; + } +#endif + + if (flags & CLONE_NEWNS) { + ret = validate_ns(nsset, from_mnt_ns(nsp->mnt_ns)); + if (ret) + goto out; + } + +#ifdef CONFIG_UTS_NS + if (flags & CLONE_NEWUTS) { + ret = validate_ns(nsset, &nsp->uts_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_IPC_NS + if (flags & CLONE_NEWIPC) { + ret = validate_ns(nsset, &nsp->ipc_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_PID_NS + if (flags & CLONE_NEWPID) { + ret = validate_ns(nsset, &pid_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_CGROUPS + if (flags & CLONE_NEWCGROUP) { + ret = validate_ns(nsset, &nsp->cgroup_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_NET_NS + if (flags & CLONE_NEWNET) { + ret = validate_ns(nsset, &nsp->net_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_TIME_NS + if (flags & CLONE_NEWTIME) { + ret = validate_ns(nsset, &nsp->time_ns->ns); + if (ret) + goto out; + } +#endif + +out: + if (pid_ns) + put_pid_ns(pid_ns); + if (nsp) + put_nsproxy(nsp); + put_user_ns(user_ns); + + return ret; +} + +/* + * This is the point of no return. There are just a few namespaces + * that do some actual work here and it's sufficiently minimal that + * a separate ns_common operation seems unnecessary for now. + * Unshare is doing the same thing. If we'll end up needing to do + * more in a given namespace or a helper here is ultimately not + * exported anymore a simple commit handler for each namespace + * should be added to ns_common. + */ +static void commit_nsset(struct nsset *nsset) +{ + unsigned flags = nsset->flags; + struct task_struct *me = current; + +#ifdef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) { + /* transfer ownership */ + commit_creds(nsset_cred(nsset)); + nsset->cred = NULL; + } +#endif + + /* We only need to commit if we have used a temporary fs_struct. */ + if ((flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) { + set_fs_root(me->fs, &nsset->fs->root); + set_fs_pwd(me->fs, &nsset->fs->pwd); + } + +#ifdef CONFIG_IPC_NS + if (flags & CLONE_NEWIPC) + exit_sem(me); +#endif + +#ifdef CONFIG_TIME_NS + if (flags & CLONE_NEWTIME) + timens_commit(me, nsset->nsproxy->time_ns); +#endif + + /* transfer ownership */ + switch_task_namespaces(me, nsset->nsproxy); + nsset->nsproxy = NULL; +} + +SYSCALL_DEFINE2(setns, int, fd, int, flags) +{ + struct file *file; + struct ns_common *ns = NULL; + struct nsset nsset = {}; + int err = 0; + + file = fget(fd); + if (!file) + return -EBADF; + + if (proc_ns_file(file)) { + ns = get_proc_ns(file_inode(file)); + if (flags && (ns->ops->type != flags)) + err = -EINVAL; + flags = ns->ops->type; + } else if (!IS_ERR(pidfd_pid(file))) { + err = check_setns_flags(flags); + } else { + err = -EINVAL; + } + if (err) + goto out; + + err = prepare_nsset(flags, &nsset); + if (err) + goto out; + + if (proc_ns_file(file)) + err = validate_ns(&nsset, ns); + else + err = validate_nsset(&nsset, file->private_data); + if (!err) { + commit_nsset(&nsset); + perf_event_namespaces(current); + } + put_nsset(&nsset); out: fput(file); return err; diff --git a/kernel/padata.c b/kernel/padata.c index a6afa12fb75e..16cb894dc272 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -7,6 +7,9 @@ * Copyright (C) 2008, 2009 secunet Security Networks AG * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com> * + * Copyright (c) 2020 Oracle and/or its affiliates. + * Author: Daniel Jordan <daniel.m.jordan@oracle.com> + * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. @@ -21,6 +24,7 @@ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ +#include <linux/completion.h> #include <linux/export.h> #include <linux/cpumask.h> #include <linux/err.h> @@ -31,11 +35,30 @@ #include <linux/slab.h> #include <linux/sysfs.h> #include <linux/rcupdate.h> -#include <linux/module.h> -#define MAX_OBJ_NUM 1000 +#define PADATA_WORK_ONSTACK 1 /* Work's memory is on stack */ + +struct padata_work { + struct work_struct pw_work; + struct list_head pw_list; /* padata_free_works linkage */ + void *pw_data; +}; + +static DEFINE_SPINLOCK(padata_works_lock); +static struct padata_work *padata_works; +static LIST_HEAD(padata_free_works); + +struct padata_mt_job_state { + spinlock_t lock; + struct completion completion; + struct padata_mt_job *job; + int nworks; + int nworks_fini; + unsigned long chunk_size; +}; static void padata_free_pd(struct parallel_data *pd); +static void __init padata_mt_helper(struct work_struct *work); static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) { @@ -59,30 +82,82 @@ static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr) return padata_index_to_cpu(pd, cpu_index); } -static void padata_parallel_worker(struct work_struct *parallel_work) +static struct padata_work *padata_work_alloc(void) { - struct padata_parallel_queue *pqueue; - LIST_HEAD(local_list); + struct padata_work *pw; - local_bh_disable(); - pqueue = container_of(parallel_work, - struct padata_parallel_queue, work); + lockdep_assert_held(&padata_works_lock); - spin_lock(&pqueue->parallel.lock); - list_replace_init(&pqueue->parallel.list, &local_list); - spin_unlock(&pqueue->parallel.lock); + if (list_empty(&padata_free_works)) + return NULL; /* No more work items allowed to be queued. */ - while (!list_empty(&local_list)) { - struct padata_priv *padata; + pw = list_first_entry(&padata_free_works, struct padata_work, pw_list); + list_del(&pw->pw_list); + return pw; +} - padata = list_entry(local_list.next, - struct padata_priv, list); +static void padata_work_init(struct padata_work *pw, work_func_t work_fn, + void *data, int flags) +{ + if (flags & PADATA_WORK_ONSTACK) + INIT_WORK_ONSTACK(&pw->pw_work, work_fn); + else + INIT_WORK(&pw->pw_work, work_fn); + pw->pw_data = data; +} - list_del_init(&padata->list); +static int __init padata_work_alloc_mt(int nworks, void *data, + struct list_head *head) +{ + int i; - padata->parallel(padata); + spin_lock(&padata_works_lock); + /* Start at 1 because the current task participates in the job. */ + for (i = 1; i < nworks; ++i) { + struct padata_work *pw = padata_work_alloc(); + + if (!pw) + break; + padata_work_init(pw, padata_mt_helper, data, 0); + list_add(&pw->pw_list, head); + } + spin_unlock(&padata_works_lock); + + return i; +} + +static void padata_work_free(struct padata_work *pw) +{ + lockdep_assert_held(&padata_works_lock); + list_add(&pw->pw_list, &padata_free_works); +} + +static void __init padata_works_free(struct list_head *works) +{ + struct padata_work *cur, *next; + + if (list_empty(works)) + return; + + spin_lock(&padata_works_lock); + list_for_each_entry_safe(cur, next, works, pw_list) { + list_del(&cur->pw_list); + padata_work_free(cur); } + spin_unlock(&padata_works_lock); +} +static void padata_parallel_worker(struct work_struct *parallel_work) +{ + struct padata_work *pw = container_of(parallel_work, struct padata_work, + pw_work); + struct padata_priv *padata = pw->pw_data; + + local_bh_disable(); + padata->parallel(padata); + spin_lock(&padata_works_lock); + padata_work_free(pw); + spin_unlock(&padata_works_lock); local_bh_enable(); } @@ -106,9 +181,9 @@ int padata_do_parallel(struct padata_shell *ps, struct padata_priv *padata, int *cb_cpu) { struct padata_instance *pinst = ps->pinst; - int i, cpu, cpu_index, target_cpu, err; - struct padata_parallel_queue *queue; + int i, cpu, cpu_index, err; struct parallel_data *pd; + struct padata_work *pw; rcu_read_lock_bh(); @@ -136,25 +211,25 @@ int padata_do_parallel(struct padata_shell *ps, if ((pinst->flags & PADATA_RESET)) goto out; - if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) - goto out; - - err = 0; atomic_inc(&pd->refcnt); padata->pd = pd; padata->cb_cpu = *cb_cpu; - padata->seq_nr = atomic_inc_return(&pd->seq_nr); - target_cpu = padata_cpu_hash(pd, padata->seq_nr); - padata->cpu = target_cpu; - queue = per_cpu_ptr(pd->pqueue, target_cpu); - - spin_lock(&queue->parallel.lock); - list_add_tail(&padata->list, &queue->parallel.list); - spin_unlock(&queue->parallel.lock); + rcu_read_unlock_bh(); - queue_work(pinst->parallel_wq, &queue->work); + spin_lock(&padata_works_lock); + padata->seq_nr = ++pd->seq_nr; + pw = padata_work_alloc(); + spin_unlock(&padata_works_lock); + if (pw) { + padata_work_init(pw, padata_parallel_worker, padata, 0); + queue_work(pinst->parallel_wq, &pw->pw_work); + } else { + /* Maximum works limit exceeded, run in the current task. */ + padata->parallel(padata); + } + return 0; out: rcu_read_unlock_bh(); @@ -175,13 +250,11 @@ EXPORT_SYMBOL(padata_do_parallel); static struct padata_priv *padata_find_next(struct parallel_data *pd, bool remove_object) { - struct padata_parallel_queue *next_queue; struct padata_priv *padata; struct padata_list *reorder; int cpu = pd->cpu; - next_queue = per_cpu_ptr(pd->pqueue, cpu); - reorder = &next_queue->reorder; + reorder = per_cpu_ptr(pd->reorder_list, cpu); spin_lock(&reorder->lock); if (list_empty(&reorder->list)) { @@ -216,7 +289,7 @@ static void padata_reorder(struct parallel_data *pd) int cb_cpu; struct padata_priv *padata; struct padata_serial_queue *squeue; - struct padata_parallel_queue *next_queue; + struct padata_list *reorder; /* * We need to ensure that only one cpu can work on dequeueing of @@ -260,13 +333,12 @@ static void padata_reorder(struct parallel_data *pd) * * Ensure reorder queue is read after pd->lock is dropped so we see * new objects from another task in padata_do_serial. Pairs with - * smp_mb__after_atomic in padata_do_serial. + * smp_mb in padata_do_serial. */ smp_mb(); - next_queue = per_cpu_ptr(pd->pqueue, pd->cpu); - if (!list_empty(&next_queue->reorder.list) && - padata_find_next(pd, false)) + reorder = per_cpu_ptr(pd->reorder_list, pd->cpu); + if (!list_empty(&reorder->list) && padata_find_next(pd, false)) queue_work(pinst->serial_wq, &pd->reorder_work); } @@ -325,24 +397,24 @@ static void padata_serial_worker(struct work_struct *serial_work) void padata_do_serial(struct padata_priv *padata) { struct parallel_data *pd = padata->pd; - struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue, - padata->cpu); + int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr); + struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu); struct padata_priv *cur; - spin_lock(&pqueue->reorder.lock); + spin_lock(&reorder->lock); /* Sort in ascending order of sequence number. */ - list_for_each_entry_reverse(cur, &pqueue->reorder.list, list) + list_for_each_entry_reverse(cur, &reorder->list, list) if (cur->seq_nr < padata->seq_nr) break; list_add(&padata->list, &cur->list); - spin_unlock(&pqueue->reorder.lock); + spin_unlock(&reorder->lock); /* * Ensure the addition to the reorder list is ordered correctly * with the trylock of pd->lock in padata_reorder. Pairs with smp_mb * in padata_reorder. */ - smp_mb__after_atomic(); + smp_mb(); padata_reorder(pd); } @@ -365,26 +437,96 @@ static int padata_setup_cpumasks(struct padata_instance *pinst) return err; } -static int pd_setup_cpumasks(struct parallel_data *pd, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask) +static void __init padata_mt_helper(struct work_struct *w) { - int err = -ENOMEM; + struct padata_work *pw = container_of(w, struct padata_work, pw_work); + struct padata_mt_job_state *ps = pw->pw_data; + struct padata_mt_job *job = ps->job; + bool done; - if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) - goto out; - if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) - goto free_pcpu_mask; + spin_lock(&ps->lock); - cpumask_copy(pd->cpumask.pcpu, pcpumask); - cpumask_copy(pd->cpumask.cbcpu, cbcpumask); + while (job->size > 0) { + unsigned long start, size, end; - return 0; + start = job->start; + /* So end is chunk size aligned if enough work remains. */ + size = roundup(start + 1, ps->chunk_size) - start; + size = min(size, job->size); + end = start + size; -free_pcpu_mask: - free_cpumask_var(pd->cpumask.pcpu); -out: - return err; + job->start = end; + job->size -= size; + + spin_unlock(&ps->lock); + job->thread_fn(start, end, job->fn_arg); + spin_lock(&ps->lock); + } + + ++ps->nworks_fini; + done = (ps->nworks_fini == ps->nworks); + spin_unlock(&ps->lock); + + if (done) + complete(&ps->completion); +} + +/** + * padata_do_multithreaded - run a multithreaded job + * @job: Description of the job. + * + * See the definition of struct padata_mt_job for more details. + */ +void __init padata_do_multithreaded(struct padata_mt_job *job) +{ + /* In case threads finish at different times. */ + static const unsigned long load_balance_factor = 4; + struct padata_work my_work, *pw; + struct padata_mt_job_state ps; + LIST_HEAD(works); + int nworks; + + if (job->size == 0) + return; + + /* Ensure at least one thread when size < min_chunk. */ + nworks = max(job->size / job->min_chunk, 1ul); + nworks = min(nworks, job->max_threads); + + if (nworks == 1) { + /* Single thread, no coordination needed, cut to the chase. */ + job->thread_fn(job->start, job->start + job->size, job->fn_arg); + return; + } + + spin_lock_init(&ps.lock); + init_completion(&ps.completion); + ps.job = job; + ps.nworks = padata_work_alloc_mt(nworks, &ps, &works); + ps.nworks_fini = 0; + + /* + * Chunk size is the amount of work a helper does per call to the + * thread function. Load balance large jobs between threads by + * increasing the number of chunks, guarantee at least the minimum + * chunk size from the caller, and honor the caller's alignment. + */ + ps.chunk_size = job->size / (ps.nworks * load_balance_factor); + ps.chunk_size = max(ps.chunk_size, job->min_chunk); + ps.chunk_size = roundup(ps.chunk_size, job->align); + + list_for_each_entry(pw, &works, pw_list) + queue_work(system_unbound_wq, &pw->pw_work); + + /* Use the current thread, which saves starting a workqueue worker. */ + padata_work_init(&my_work, padata_mt_helper, &ps, PADATA_WORK_ONSTACK); + padata_mt_helper(&my_work.pw_work); + + /* Wait for all the helpers to finish. */ + wait_for_completion(&ps.completion); + + destroy_work_on_stack(&my_work.pw_work); + padata_works_free(&works); } static void __padata_list_init(struct padata_list *pd_list) @@ -407,19 +549,15 @@ static void padata_init_squeues(struct parallel_data *pd) } } -/* Initialize all percpu queues used by parallel workers */ -static void padata_init_pqueues(struct parallel_data *pd) +/* Initialize per-CPU reorder lists */ +static void padata_init_reorder_list(struct parallel_data *pd) { int cpu; - struct padata_parallel_queue *pqueue; + struct padata_list *list; for_each_cpu(cpu, pd->cpumask.pcpu) { - pqueue = per_cpu_ptr(pd->pqueue, cpu); - - __padata_list_init(&pqueue->reorder); - __padata_list_init(&pqueue->parallel); - INIT_WORK(&pqueue->work, padata_parallel_worker); - atomic_set(&pqueue->num_obj, 0); + list = per_cpu_ptr(pd->reorder_list, cpu); + __padata_list_init(list); } } @@ -427,32 +565,33 @@ static void padata_init_pqueues(struct parallel_data *pd) static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) { struct padata_instance *pinst = ps->pinst; - const struct cpumask *cbcpumask; - const struct cpumask *pcpumask; struct parallel_data *pd; - cbcpumask = pinst->rcpumask.cbcpu; - pcpumask = pinst->rcpumask.pcpu; - pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); if (!pd) goto err; - pd->pqueue = alloc_percpu(struct padata_parallel_queue); - if (!pd->pqueue) + pd->reorder_list = alloc_percpu(struct padata_list); + if (!pd->reorder_list) goto err_free_pd; pd->squeue = alloc_percpu(struct padata_serial_queue); if (!pd->squeue) - goto err_free_pqueue; + goto err_free_reorder_list; pd->ps = ps; - if (pd_setup_cpumasks(pd, pcpumask, cbcpumask)) + + if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) goto err_free_squeue; + if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) + goto err_free_pcpu; + + cpumask_and(pd->cpumask.pcpu, pinst->cpumask.pcpu, cpu_online_mask); + cpumask_and(pd->cpumask.cbcpu, pinst->cpumask.cbcpu, cpu_online_mask); - padata_init_pqueues(pd); + padata_init_reorder_list(pd); padata_init_squeues(pd); - atomic_set(&pd->seq_nr, -1); + pd->seq_nr = -1; atomic_set(&pd->refcnt, 1); spin_lock_init(&pd->lock); pd->cpu = cpumask_first(pd->cpumask.pcpu); @@ -460,10 +599,12 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) return pd; +err_free_pcpu: + free_cpumask_var(pd->cpumask.pcpu); err_free_squeue: free_percpu(pd->squeue); -err_free_pqueue: - free_percpu(pd->pqueue); +err_free_reorder_list: + free_percpu(pd->reorder_list); err_free_pd: kfree(pd); err: @@ -474,7 +615,7 @@ static void padata_free_pd(struct parallel_data *pd) { free_cpumask_var(pd->cpumask.pcpu); free_cpumask_var(pd->cpumask.cbcpu); - free_percpu(pd->pqueue); + free_percpu(pd->reorder_list); free_percpu(pd->squeue); kfree(pd); } @@ -516,12 +657,6 @@ static int padata_replace(struct padata_instance *pinst) pinst->flags |= PADATA_RESET; - cpumask_and(pinst->rcpumask.pcpu, pinst->cpumask.pcpu, - cpu_online_mask); - - cpumask_and(pinst->rcpumask.cbcpu, pinst->cpumask.cbcpu, - cpu_online_mask); - list_for_each_entry(ps, &pinst->pslist, list) { err = padata_replace_one(ps); if (err) @@ -623,43 +758,6 @@ out: } EXPORT_SYMBOL(padata_set_cpumask); -/** - * padata_start - start the parallel processing - * - * @pinst: padata instance to start - * - * Return: 0 on success or negative error code - */ -int padata_start(struct padata_instance *pinst) -{ - int err = 0; - - mutex_lock(&pinst->lock); - - if (pinst->flags & PADATA_INVALID) - err = -EINVAL; - - __padata_start(pinst); - - mutex_unlock(&pinst->lock); - - return err; -} -EXPORT_SYMBOL(padata_start); - -/** - * padata_stop - stop the parallel processing - * - * @pinst: padata instance to stop - */ -void padata_stop(struct padata_instance *pinst) -{ - mutex_lock(&pinst->lock); - __padata_stop(pinst); - mutex_unlock(&pinst->lock); -} -EXPORT_SYMBOL(padata_stop); - #ifdef CONFIG_HOTPLUG_CPU static int __padata_add_cpu(struct padata_instance *pinst, int cpu) @@ -703,7 +801,7 @@ static int padata_cpu_online(unsigned int cpu, struct hlist_node *node) struct padata_instance *pinst; int ret; - pinst = hlist_entry_safe(node, struct padata_instance, node); + pinst = hlist_entry_safe(node, struct padata_instance, cpu_online_node); if (!pinst_has_cpu(pinst, cpu)) return 0; @@ -718,7 +816,7 @@ static int padata_cpu_dead(unsigned int cpu, struct hlist_node *node) struct padata_instance *pinst; int ret; - pinst = hlist_entry_safe(node, struct padata_instance, node); + pinst = hlist_entry_safe(node, struct padata_instance, cpu_dead_node); if (!pinst_has_cpu(pinst, cpu)) return 0; @@ -734,15 +832,13 @@ static enum cpuhp_state hp_online; static void __padata_free(struct padata_instance *pinst) { #ifdef CONFIG_HOTPLUG_CPU - cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD, &pinst->node); - cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node); + cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD, + &pinst->cpu_dead_node); + cpuhp_state_remove_instance_nocalls(hp_online, &pinst->cpu_online_node); #endif WARN_ON(!list_empty(&pinst->pslist)); - padata_stop(pinst); - free_cpumask_var(pinst->rcpumask.cbcpu); - free_cpumask_var(pinst->rcpumask.pcpu); free_cpumask_var(pinst->cpumask.pcpu); free_cpumask_var(pinst->cpumask.cbcpu); destroy_workqueue(pinst->serial_wq); @@ -877,18 +973,12 @@ static struct kobj_type padata_attr_type = { }; /** - * padata_alloc - allocate and initialize a padata instance and specify - * cpumasks for serial and parallel workers. - * + * padata_alloc - allocate and initialize a padata instance * @name: used to identify the instance - * @pcpumask: cpumask that will be used for padata parallelization - * @cbcpumask: cpumask that will be used for padata serialization * * Return: new instance on success, NULL on error */ -static struct padata_instance *padata_alloc(const char *name, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask) +struct padata_instance *padata_alloc(const char *name) { struct padata_instance *pinst; @@ -914,44 +1004,31 @@ static struct padata_instance *padata_alloc(const char *name, free_cpumask_var(pinst->cpumask.pcpu); goto err_free_serial_wq; } - if (!padata_validate_cpumask(pinst, pcpumask) || - !padata_validate_cpumask(pinst, cbcpumask)) - goto err_free_masks; - - if (!alloc_cpumask_var(&pinst->rcpumask.pcpu, GFP_KERNEL)) - goto err_free_masks; - if (!alloc_cpumask_var(&pinst->rcpumask.cbcpu, GFP_KERNEL)) - goto err_free_rcpumask_pcpu; INIT_LIST_HEAD(&pinst->pslist); - cpumask_copy(pinst->cpumask.pcpu, pcpumask); - cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); - cpumask_and(pinst->rcpumask.pcpu, pcpumask, cpu_online_mask); - cpumask_and(pinst->rcpumask.cbcpu, cbcpumask, cpu_online_mask); + cpumask_copy(pinst->cpumask.pcpu, cpu_possible_mask); + cpumask_copy(pinst->cpumask.cbcpu, cpu_possible_mask); if (padata_setup_cpumasks(pinst)) - goto err_free_rcpumask_cbcpu; + goto err_free_masks; - pinst->flags = 0; + __padata_start(pinst); kobject_init(&pinst->kobj, &padata_attr_type); mutex_init(&pinst->lock); #ifdef CONFIG_HOTPLUG_CPU - cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node); + cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, + &pinst->cpu_online_node); cpuhp_state_add_instance_nocalls_cpuslocked(CPUHP_PADATA_DEAD, - &pinst->node); + &pinst->cpu_dead_node); #endif put_online_cpus(); return pinst; -err_free_rcpumask_cbcpu: - free_cpumask_var(pinst->rcpumask.cbcpu); -err_free_rcpumask_pcpu: - free_cpumask_var(pinst->rcpumask.pcpu); err_free_masks: free_cpumask_var(pinst->cpumask.pcpu); free_cpumask_var(pinst->cpumask.cbcpu); @@ -965,21 +1042,7 @@ err_free_inst: err: return NULL; } - -/** - * padata_alloc_possible - Allocate and initialize padata instance. - * Use the cpu_possible_mask for serial and - * parallel workers. - * - * @name: used to identify the instance - * - * Return: new instance on success, NULL on error - */ -struct padata_instance *padata_alloc_possible(const char *name) -{ - return padata_alloc(name, cpu_possible_mask, cpu_possible_mask); -} -EXPORT_SYMBOL(padata_alloc_possible); +EXPORT_SYMBOL(padata_alloc); /** * padata_free - free a padata instance @@ -1050,32 +1113,41 @@ void padata_free_shell(struct padata_shell *ps) } EXPORT_SYMBOL(padata_free_shell); -#ifdef CONFIG_HOTPLUG_CPU - -static __init int padata_driver_init(void) +void __init padata_init(void) { + unsigned int i, possible_cpus; +#ifdef CONFIG_HOTPLUG_CPU int ret; ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online", padata_cpu_online, NULL); if (ret < 0) - return ret; + goto err; hp_online = ret; ret = cpuhp_setup_state_multi(CPUHP_PADATA_DEAD, "padata:dead", NULL, padata_cpu_dead); - if (ret < 0) { - cpuhp_remove_multi_state(hp_online); - return ret; - } - return 0; -} -module_init(padata_driver_init); + if (ret < 0) + goto remove_online_state; +#endif -static __exit void padata_driver_exit(void) -{ + possible_cpus = num_possible_cpus(); + padata_works = kmalloc_array(possible_cpus, sizeof(struct padata_work), + GFP_KERNEL); + if (!padata_works) + goto remove_dead_state; + + for (i = 0; i < possible_cpus; ++i) + list_add(&padata_works[i].pw_list, &padata_free_works); + + return; + +remove_dead_state: +#ifdef CONFIG_HOTPLUG_CPU cpuhp_remove_multi_state(CPUHP_PADATA_DEAD); +remove_online_state: cpuhp_remove_multi_state(hp_online); -} -module_exit(padata_driver_exit); +err: #endif + pr_warn("padata: initialization failed\n"); +} diff --git a/kernel/panic.c b/kernel/panic.c index b69ee9e76cb2..e2157ca387c8 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -36,6 +36,14 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 +#ifdef CONFIG_SMP +/* + * Should we dump all CPUs backtraces in an oops event? + * Defaults to 0, can be changed via sysctl. + */ +unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; +#endif /* CONFIG_SMP */ + int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask = IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0; @@ -44,6 +52,8 @@ static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); bool crash_kexec_post_notifiers; int panic_on_warn __read_mostly; +unsigned long panic_on_taint; +bool panic_on_taint_nousertaint = false; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -434,6 +444,11 @@ void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) pr_warn("Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); + + if (tainted_mask & panic_on_taint) { + panic_on_taint = 0; + panic("panic_on_taint set ..."); + } } EXPORT_SYMBOL(add_taint); @@ -515,6 +530,9 @@ void oops_enter(void) /* can't trust the integrity of the kernel anymore: */ debug_locks_off(); do_oops_enter_exit(); + + if (sysctl_oops_all_cpu_backtrace) + trigger_all_cpu_backtrace(); } /* @@ -662,10 +680,12 @@ device_initcall(register_warn_debugfs); * Called when gcc's -fstack-protector feature is used, and * gcc detects corruption of the on-stack canary value */ -__visible void __stack_chk_fail(void) +__visible noinstr void __stack_chk_fail(void) { + instrumentation_begin(); panic("stack-protector: Kernel stack is corrupted in: %pB", __builtin_return_address(0)); + instrumentation_end(); } EXPORT_SYMBOL(__stack_chk_fail); @@ -686,3 +706,30 @@ static int __init oops_setup(char *s) return 0; } early_param("oops", oops_setup); + +static int __init panic_on_taint_setup(char *s) +{ + char *taint_str; + + if (!s) + return -EINVAL; + + taint_str = strsep(&s, ","); + if (kstrtoul(taint_str, 16, &panic_on_taint)) + return -EINVAL; + + /* make sure panic_on_taint doesn't hold out-of-range TAINT flags */ + panic_on_taint &= TAINT_FLAGS_MAX; + + if (!panic_on_taint) + return -EINVAL; + + if (s && !strcmp(s, "nousertaint")) + panic_on_taint_nousertaint = true; + + pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%sabled\n", + panic_on_taint, panic_on_taint_nousertaint ? "en" : "dis"); + + return 0; +} +early_param("panic_on_taint", panic_on_taint_setup); diff --git a/kernel/pid.c b/kernel/pid.c index c835b844aca7..b2562a7ce525 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -42,6 +42,7 @@ #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/idr.h> +#include <net/sock.h> struct pid init_struct_pid = { .count = REFCOUNT_INIT(1), @@ -198,7 +199,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, if (tid != 1 && !tmp->child_reaper) goto out_free; retval = -EPERM; - if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN)) + if (!checkpoint_restore_ns_capable(tmp->user_ns)) goto out_free; set_tid_size--; } @@ -363,6 +364,25 @@ void change_pid(struct task_struct *task, enum pid_type type, attach_pid(task, type); } +void exchange_tids(struct task_struct *left, struct task_struct *right) +{ + struct pid *pid1 = left->thread_pid; + struct pid *pid2 = right->thread_pid; + struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID]; + struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID]; + + /* Swap the single entry tid lists */ + hlists_swap_heads_rcu(head1, head2); + + /* Swap the per task_struct pid */ + rcu_assign_pointer(left->thread_pid, pid2); + rcu_assign_pointer(right->thread_pid, pid1); + + /* Swap the cached value */ + WRITE_ONCE(left->pid, pid_nr(pid2)); + WRITE_ONCE(right->pid, pid_nr(pid1)); +} + /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) @@ -476,8 +496,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) ns = task_active_pid_ns(current); - if (likely(pid_alive(task))) - nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); + nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); rcu_read_unlock(); return nr; @@ -617,17 +636,8 @@ static int pidfd_getfd(struct pid *pid, int fd) if (IS_ERR(file)) return PTR_ERR(file); - ret = security_file_receive(file); - if (ret) { - fput(file); - return ret; - } - - ret = get_unused_fd_flags(O_CLOEXEC); - if (ret < 0) - fput(file); - else - fd_install(ret, file); + ret = receive_fd(file, O_CLOEXEC); + fput(file); return ret; } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 01f8ba32cc0c..ac135bd600eb 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -263,13 +263,13 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) #ifdef CONFIG_CHECKPOINT_RESTORE static int pid_ns_ctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; int ret, next; - if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) + if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns)) return -EPERM; /* @@ -378,13 +378,14 @@ static void pidns_put(struct ns_common *ns) put_pid_ns(to_pid_ns(ns)); } -static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int pidns_install(struct nsset *nsset, struct ns_common *ns) { + struct nsproxy *nsproxy = nsset->nsproxy; struct pid_namespace *active = task_active_pid_ns(current); struct pid_namespace *ancestor, *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index c208566c844b..a7320f07689d 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -3,7 +3,7 @@ config SUSPEND bool "Suspend to RAM and standby" depends on ARCH_SUSPEND_POSSIBLE default y - ---help--- + help Allow the system to enter sleep states in which main memory is powered and thus its contents are preserved, such as the suspend-to-RAM state (e.g. the ACPI S3 state). @@ -42,7 +42,7 @@ config HIBERNATION select LZO_COMPRESS select LZO_DECOMPRESS select CRC32 - ---help--- + help Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the system and powers it off; and restores that checkpoint on reboot. @@ -80,11 +80,23 @@ config HIBERNATION For more information take a look at <file:Documentation/power/swsusp.rst>. +config HIBERNATION_SNAPSHOT_DEV + bool "Userspace snapshot device" + depends on HIBERNATION + default y + help + Device used by the uswsusp tools. + + Say N if no snapshotting from userspace is needed, this also + reduces the attack surface of the kernel. + + If in doubt, say Y. + config PM_STD_PARTITION string "Default resume partition" depends on HIBERNATION default "" - ---help--- + help The default resume partition is the partition that the suspend- to-disk implementation will look for a suspended disk image. @@ -119,7 +131,7 @@ config PM_SLEEP_SMP_NONZERO_CPU def_bool y depends on PM_SLEEP_SMP depends on ARCH_SUSPEND_NONZERO_CPU - ---help--- + help If an arch can suspend (for suspend, hibernate, kexec, etc) on a non-zero numbered CPU, it may define ARCH_SUSPEND_NONZERO_CPU. This will allow nohz_full mask to include CPU0. @@ -128,7 +140,7 @@ config PM_AUTOSLEEP bool "Opportunistic sleep" depends on PM_SLEEP default n - ---help--- + help Allow the kernel to trigger a system transition into a global sleep state automatically whenever there are no active wakeup sources. @@ -136,7 +148,7 @@ config PM_WAKELOCKS bool "User space wakeup sources interface" depends on PM_SLEEP default n - ---help--- + help Allow user space to create, activate and deactivate wakeup source objects with the help of a sysfs-based interface. @@ -153,7 +165,7 @@ config PM_WAKELOCKS_GC config PM bool "Device power management core functionality" - ---help--- + help Enable functionality allowing I/O devices to be put into energy-saving (low power) states, for example after a specified period of inactivity (autosuspended), and woken up in response to a hardware-generated @@ -167,7 +179,7 @@ config PM config PM_DEBUG bool "Power Management Debug Support" depends on PM - ---help--- + help This option enables various debugging support in the Power Management code. This is helpful when debugging and reporting PM bugs, like suspend support. @@ -175,7 +187,7 @@ config PM_DEBUG config PM_ADVANCED_DEBUG bool "Extra PM attributes in sysfs for low-level debugging/testing" depends on PM_DEBUG - ---help--- + help Add extra sysfs attributes allowing one to access some Power Management fields of device objects from user space. If you are not a kernel developer interested in debugging/testing Power Management, say "no". @@ -183,7 +195,7 @@ config PM_ADVANCED_DEBUG config PM_TEST_SUSPEND bool "Test suspend/resume and wakealarm during bootup" depends on SUSPEND && PM_DEBUG && RTC_CLASS=y - ---help--- + help This option will let you suspend your machine during bootup, and make it wake up a few seconds later using an RTC wakeup alarm. Enable this with a kernel parameter like "test_suspend=mem". @@ -198,7 +210,7 @@ config PM_SLEEP_DEBUG config DPM_WATCHDOG bool "Device suspend/resume watchdog" depends on PM_DEBUG && PSTORE && EXPERT - ---help--- + help Sets up a watchdog timer to capture drivers that are locked up attempting to suspend/resume a device. A detected lockup causes system panic with message @@ -231,7 +243,7 @@ config PM_TRACE_RTC depends on PM_SLEEP_DEBUG depends on X86 select PM_TRACE - ---help--- + help This enables some cheesy code to save the last PM event point in the RTC across reboots, so that you can debug a machine that just hangs during suspend (or more commonly, during resume). diff --git a/kernel/power/Makefile b/kernel/power/Makefile index e7e47d9be1e5..5899260a8bef 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -10,7 +10,8 @@ obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o -obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o +obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o +obj-$(CONFIG_HIBERNATION_SNAPSHOT_DEV) += user.o obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 0a9326f5f421..c1ff7fa030ab 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -1,9 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Energy Model of CPUs + * Energy Model of devices * - * Copyright (c) 2018, Arm ltd. + * Copyright (c) 2018-2020, Arm ltd. * Written by: Quentin Perret, Arm ltd. + * Improvements provided by: Lukasz Luba, Arm ltd. */ #define pr_fmt(fmt) "energy_model: " fmt @@ -15,30 +16,32 @@ #include <linux/sched/topology.h> #include <linux/slab.h> -/* Mapping of each CPU to the performance domain to which it belongs. */ -static DEFINE_PER_CPU(struct em_perf_domain *, em_data); - /* * Mutex serializing the registrations of performance domains and letting * callbacks defined by drivers sleep. */ static DEFINE_MUTEX(em_pd_mutex); +static bool _is_cpu_device(struct device *dev) +{ + return (dev->bus == &cpu_subsys); +} + #ifdef CONFIG_DEBUG_FS static struct dentry *rootdir; -static void em_debug_create_cs(struct em_cap_state *cs, struct dentry *pd) +static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd) { struct dentry *d; char name[24]; - snprintf(name, sizeof(name), "cs:%lu", cs->frequency); + snprintf(name, sizeof(name), "ps:%lu", ps->frequency); - /* Create per-cs directory */ + /* Create per-ps directory */ d = debugfs_create_dir(name, pd); - debugfs_create_ulong("frequency", 0444, d, &cs->frequency); - debugfs_create_ulong("power", 0444, d, &cs->power); - debugfs_create_ulong("cost", 0444, d, &cs->cost); + debugfs_create_ulong("frequency", 0444, d, &ps->frequency); + debugfs_create_ulong("power", 0444, d, &ps->power); + debugfs_create_ulong("cost", 0444, d, &ps->cost); } static int em_debug_cpus_show(struct seq_file *s, void *unused) @@ -49,22 +52,30 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); -static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) +static void em_debug_create_pd(struct device *dev) { struct dentry *d; - char name[8]; int i; - snprintf(name, sizeof(name), "pd%d", cpu); - /* Create the directory of the performance domain */ - d = debugfs_create_dir(name, rootdir); + d = debugfs_create_dir(dev_name(dev), rootdir); - debugfs_create_file("cpus", 0444, d, pd->cpus, &em_debug_cpus_fops); + if (_is_cpu_device(dev)) + debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus, + &em_debug_cpus_fops); + + /* Create a sub-directory for each performance state */ + for (i = 0; i < dev->em_pd->nr_perf_states; i++) + em_debug_create_ps(&dev->em_pd->table[i], d); - /* Create a sub-directory for each capacity state */ - for (i = 0; i < pd->nr_cap_states; i++) - em_debug_create_cs(&pd->table[i], d); +} + +static void em_debug_remove_pd(struct device *dev) +{ + struct dentry *debug_dir; + + debug_dir = debugfs_lookup(dev_name(dev), rootdir); + debugfs_remove_recursive(debug_dir); } static int __init em_debug_init(void) @@ -76,58 +87,55 @@ static int __init em_debug_init(void) } core_initcall(em_debug_init); #else /* CONFIG_DEBUG_FS */ -static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) {} +static void em_debug_create_pd(struct device *dev) {} +static void em_debug_remove_pd(struct device *dev) {} #endif -static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, - struct em_data_callback *cb) + +static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, + int nr_states, struct em_data_callback *cb) { unsigned long opp_eff, prev_opp_eff = ULONG_MAX; unsigned long power, freq, prev_freq = 0; - int i, ret, cpu = cpumask_first(span); - struct em_cap_state *table; - struct em_perf_domain *pd; + struct em_perf_state *table; + int i, ret; u64 fmax; - if (!cb->active_power) - return NULL; - - pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); - if (!pd) - return NULL; - table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL); if (!table) - goto free_pd; + return -ENOMEM; - /* Build the list of capacity states for this performance domain */ + /* Build the list of performance states for this performance domain */ for (i = 0, freq = 0; i < nr_states; i++, freq++) { /* * active_power() is a driver callback which ceils 'freq' to - * lowest capacity state of 'cpu' above 'freq' and updates + * lowest performance state of 'dev' above 'freq' and updates * 'power' and 'freq' accordingly. */ - ret = cb->active_power(&power, &freq, cpu); + ret = cb->active_power(&power, &freq, dev); if (ret) { - pr_err("pd%d: invalid cap. state: %d\n", cpu, ret); - goto free_cs_table; + dev_err(dev, "EM: invalid perf. state: %d\n", + ret); + goto free_ps_table; } /* * We expect the driver callback to increase the frequency for - * higher capacity states. + * higher performance states. */ if (freq <= prev_freq) { - pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq); - goto free_cs_table; + dev_err(dev, "EM: non-increasing freq: %lu\n", + freq); + goto free_ps_table; } /* * The power returned by active_state() is expected to be * positive, in milli-watts and to fit into 16 bits. */ - if (!power || power > EM_CPU_MAX_POWER) { - pr_err("pd%d: invalid power: %lu\n", cpu, power); - goto free_cs_table; + if (!power || power > EM_MAX_POWER) { + dev_err(dev, "EM: invalid power: %lu\n", + power); + goto free_ps_table; } table[i].power = power; @@ -141,12 +149,12 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, */ opp_eff = freq / power; if (opp_eff >= prev_opp_eff) - pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_cap_state %d >= em_cap_state%d\n", - cpu, i, i - 1); + dev_dbg(dev, "EM: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n", + i, i - 1); prev_opp_eff = opp_eff; } - /* Compute the cost of each capacity_state. */ + /* Compute the cost of each performance state. */ fmax = (u64) table[nr_states - 1].frequency; for (i = 0; i < nr_states; i++) { table[i].cost = div64_u64(fmax * table[i].power, @@ -154,39 +162,94 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, } pd->table = table; - pd->nr_cap_states = nr_states; - cpumask_copy(to_cpumask(pd->cpus), span); - - em_debug_create_pd(pd, cpu); + pd->nr_perf_states = nr_states; - return pd; + return 0; -free_cs_table: +free_ps_table: kfree(table); -free_pd: - kfree(pd); + return -EINVAL; +} + +static int em_create_pd(struct device *dev, int nr_states, + struct em_data_callback *cb, cpumask_t *cpus) +{ + struct em_perf_domain *pd; + struct device *cpu_dev; + int cpu, ret; + + if (_is_cpu_device(dev)) { + pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); + if (!pd) + return -ENOMEM; + + cpumask_copy(em_span_cpus(pd), cpus); + } else { + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return -ENOMEM; + } + + ret = em_create_perf_table(dev, pd, nr_states, cb); + if (ret) { + kfree(pd); + return ret; + } + + if (_is_cpu_device(dev)) + for_each_cpu(cpu, cpus) { + cpu_dev = get_cpu_device(cpu); + cpu_dev->em_pd = pd; + } + + dev->em_pd = pd; + + return 0; +} + +/** + * em_pd_get() - Return the performance domain for a device + * @dev : Device to find the performance domain for + * + * Returns the performance domain to which @dev belongs, or NULL if it doesn't + * exist. + */ +struct em_perf_domain *em_pd_get(struct device *dev) +{ + if (IS_ERR_OR_NULL(dev)) + return NULL; - return NULL; + return dev->em_pd; } +EXPORT_SYMBOL_GPL(em_pd_get); /** * em_cpu_get() - Return the performance domain for a CPU * @cpu : CPU to find the performance domain for * - * Return: the performance domain to which 'cpu' belongs, or NULL if it doesn't + * Returns the performance domain to which @cpu belongs, or NULL if it doesn't * exist. */ struct em_perf_domain *em_cpu_get(int cpu) { - return READ_ONCE(per_cpu(em_data, cpu)); + struct device *cpu_dev; + + cpu_dev = get_cpu_device(cpu); + if (!cpu_dev) + return NULL; + + return em_pd_get(cpu_dev); } EXPORT_SYMBOL_GPL(em_cpu_get); /** - * em_register_perf_domain() - Register the Energy Model of a performance domain - * @span : Mask of CPUs in the performance domain - * @nr_states : Number of capacity states to register + * em_dev_register_perf_domain() - Register the Energy Model (EM) for a device + * @dev : Device for which the EM is to register + * @nr_states : Number of performance states to register * @cb : Callback functions providing the data of the Energy Model + * @cpus : Pointer to cpumask_t, which in case of a CPU device is + * obligatory. It can be taken from i.e. 'policy->cpus'. For other + * type of devices this should be set to NULL. * * Create Energy Model tables for a performance domain using the callbacks * defined in cb. @@ -196,14 +259,13 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * * Return 0 on success */ -int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, - struct em_data_callback *cb) +int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, + struct em_data_callback *cb, cpumask_t *cpus) { unsigned long cap, prev_cap = 0; - struct em_perf_domain *pd; - int cpu, ret = 0; + int cpu, ret; - if (!span || !nr_states || !cb) + if (!dev || !nr_states || !cb) return -EINVAL; /* @@ -212,47 +274,79 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, */ mutex_lock(&em_pd_mutex); - for_each_cpu(cpu, span) { - /* Make sure we don't register again an existing domain. */ - if (READ_ONCE(per_cpu(em_data, cpu))) { - ret = -EEXIST; - goto unlock; - } + if (dev->em_pd) { + ret = -EEXIST; + goto unlock; + } - /* - * All CPUs of a domain must have the same micro-architecture - * since they all share the same table. - */ - cap = arch_scale_cpu_capacity(cpu); - if (prev_cap && prev_cap != cap) { - pr_err("CPUs of %*pbl must have the same capacity\n", - cpumask_pr_args(span)); + if (_is_cpu_device(dev)) { + if (!cpus) { + dev_err(dev, "EM: invalid CPU mask\n"); ret = -EINVAL; goto unlock; } - prev_cap = cap; + + for_each_cpu(cpu, cpus) { + if (em_cpu_get(cpu)) { + dev_err(dev, "EM: exists for CPU%d\n", cpu); + ret = -EEXIST; + goto unlock; + } + /* + * All CPUs of a domain must have the same + * micro-architecture since they all share the same + * table. + */ + cap = arch_scale_cpu_capacity(cpu); + if (prev_cap && prev_cap != cap) { + dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n", + cpumask_pr_args(cpus)); + + ret = -EINVAL; + goto unlock; + } + prev_cap = cap; + } } - /* Create the performance domain and add it to the Energy Model. */ - pd = em_create_pd(span, nr_states, cb); - if (!pd) { - ret = -EINVAL; + ret = em_create_pd(dev, nr_states, cb, cpus); + if (ret) goto unlock; - } - for_each_cpu(cpu, span) { - /* - * The per-cpu array can be read concurrently from em_cpu_get(). - * The barrier enforces the ordering needed to make sure readers - * can only access well formed em_perf_domain structs. - */ - smp_store_release(per_cpu_ptr(&em_data, cpu), pd); - } + em_debug_create_pd(dev); + dev_info(dev, "EM: created perf domain\n"); - pr_debug("Created perf domain %*pbl\n", cpumask_pr_args(span)); unlock: mutex_unlock(&em_pd_mutex); - return ret; } -EXPORT_SYMBOL_GPL(em_register_perf_domain); +EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); + +/** + * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device + * @dev : Device for which the EM is registered + * + * Unregister the EM for the specified @dev (but not a CPU device). + */ +void em_dev_unregister_perf_domain(struct device *dev) +{ + if (IS_ERR_OR_NULL(dev) || !dev->em_pd) + return; + + if (_is_cpu_device(dev)) + return; + + /* + * The mutex separates all register/unregister requests and protects + * from potential clean-up/setup issues in the debugfs directories. + * The debugfs directory name is the same as device's name. + */ + mutex_lock(&em_pd_mutex); + em_debug_remove_pd(dev); + + kfree(dev->em_pd->table); + kfree(dev->em_pd); + dev->em_pd = NULL; + mutex_unlock(&em_pd_mutex); +} +EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 30bd28d1d418..5714f51ba9f8 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -67,6 +67,18 @@ bool freezer_test_done; static const struct platform_hibernation_ops *hibernation_ops; +static atomic_t hibernate_atomic = ATOMIC_INIT(1); + +bool hibernate_acquire(void) +{ + return atomic_add_unless(&hibernate_atomic, -1, 0); +} + +void hibernate_release(void) +{ + atomic_inc(&hibernate_atomic); +} + bool hibernation_available(void) { return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION); @@ -704,7 +716,7 @@ int hibernate(void) lock_system_sleep(); /* The snapshot device should not be opened while we're running */ - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + if (!hibernate_acquire()) { error = -EBUSY; goto Unlock; } @@ -775,7 +787,7 @@ int hibernate(void) Exit: __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL); pm_restore_console(); - atomic_inc(&snapshot_device_available); + hibernate_release(); Unlock: unlock_system_sleep(); pr_info("hibernation exit\n"); @@ -880,7 +892,7 @@ static int software_resume(void) goto Unlock; /* The snapshot device should not be opened while we're running */ - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + if (!hibernate_acquire()) { error = -EBUSY; swsusp_close(FMODE_READ); goto Unlock; @@ -911,7 +923,7 @@ static int software_resume(void) __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); pm_restore_console(); pr_info("resume failed (%d)\n", error); - atomic_inc(&snapshot_device_available); + hibernate_release(); /* For success case, the suspend path will release the lock */ Unlock: mutex_unlock(&system_transition_mutex); @@ -1050,7 +1062,7 @@ power_attr(disk); static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), + return sprintf(buf, "%d:%d\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); } @@ -1150,7 +1162,7 @@ static ssize_t reserved_size_store(struct kobject *kobj, power_attr(reserved_size); -static struct attribute * g[] = { +static struct attribute *g[] = { &disk_attr.attr, &resume_offset_attr.attr, &resume_attr.attr, @@ -1178,7 +1190,7 @@ static int __init resume_setup(char *str) if (noresume) return 1; - strncpy( resume_file, str, 255 ); + strncpy(resume_file, str, 255); return 1; } diff --git a/kernel/power/power.h b/kernel/power/power.h index 7cdc64dc2373..32fc89ac96c3 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -32,7 +32,7 @@ static inline int init_header_complete(struct swsusp_info *info) return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE); } -static inline char *check_image_kernel(struct swsusp_info *info) +static inline const char *check_image_kernel(struct swsusp_info *info) { return arch_hibernation_header_restore(info) ? "architecture specific data" : NULL; @@ -154,8 +154,8 @@ extern int snapshot_write_next(struct snapshot_handle *handle); extern void snapshot_write_finalize(struct snapshot_handle *handle); extern int snapshot_image_loaded(struct snapshot_handle *handle); -/* If unset, the snapshot device cannot be open. */ -extern atomic_t snapshot_device_available; +extern bool hibernate_acquire(void); +extern void hibernate_release(void); extern sector_t alloc_swapdev_block(int swap); extern void free_all_swap_pages(int swap); diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 6d475281c730..562aa0e450ed 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -29,7 +29,7 @@ static void handle_poweroff(int key) schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); } -static struct sysrq_key_op sysrq_poweroff_op = { +static const struct sysrq_key_op sysrq_poweroff_op = { .handler = handle_poweroff, .help_msg = "poweroff(o)", .action_msg = "Power Off", diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 659800157b17..cef154261fe2 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -34,7 +34,6 @@ #include <linux/uaccess.h> #include <asm/mmu_context.h> -#include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/io.h> @@ -2024,7 +2023,7 @@ static int init_header_complete(struct swsusp_info *info) return 0; } -static char *check_image_kernel(struct swsusp_info *info) +static const char *check_image_kernel(struct swsusp_info *info) { if (info->version_code != LINUX_VERSION_CODE) return "kernel version"; @@ -2177,7 +2176,7 @@ static void mark_unsafe_pages(struct memory_bitmap *bm) static int check_header(struct swsusp_info *info) { - char *reason; + const char *reason; reason = check_image_kernel(info); if (!reason && info->num_physpages != get_num_physpages()) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index ca0fcb5ced71..01e2858b5fe3 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1590,7 +1590,7 @@ int swsusp_unmark(void) } #endif -static int swsusp_header_init(void) +static int __init swsusp_header_init(void) { swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); if (!swsusp_header) diff --git a/kernel/power/user.c b/kernel/power/user.c index 7959449765d9..d5eedc2baa2a 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -35,9 +35,13 @@ static struct snapshot_data { bool ready; bool platform_support; bool free_bitmaps; + struct inode *bd_inode; } snapshot_state; -atomic_t snapshot_device_available = ATOMIC_INIT(1); +int is_hibernate_resume_dev(const struct inode *bd_inode) +{ + return hibernation_available() && snapshot_state.bd_inode == bd_inode; +} static int snapshot_open(struct inode *inode, struct file *filp) { @@ -49,13 +53,13 @@ static int snapshot_open(struct inode *inode, struct file *filp) lock_system_sleep(); - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + if (!hibernate_acquire()) { error = -EBUSY; goto Unlock; } if ((filp->f_flags & O_ACCMODE) == O_RDWR) { - atomic_inc(&snapshot_device_available); + hibernate_release(); error = -ENOSYS; goto Unlock; } @@ -92,11 +96,12 @@ static int snapshot_open(struct inode *inode, struct file *filp) __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); } if (error) - atomic_inc(&snapshot_device_available); + hibernate_release(); data->frozen = false; data->ready = false; data->platform_support = false; + data->bd_inode = NULL; Unlock: unlock_system_sleep(); @@ -112,6 +117,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) swsusp_free(); data = filp->private_data; + data->bd_inode = NULL; free_all_swap_pages(data->swap); if (data->frozen) { pm_restore_gfp_mask(); @@ -122,7 +128,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) } pm_notifier_call_chain(data->mode == O_RDONLY ? PM_POST_HIBERNATION : PM_POST_RESTORE); - atomic_inc(&snapshot_device_available); + hibernate_release(); unlock_system_sleep(); @@ -204,6 +210,7 @@ struct compat_resume_swap_area { static int snapshot_set_swap_area(struct snapshot_data *data, void __user *argp) { + struct block_device *bdev; sector_t offset; dev_t swdev; @@ -234,9 +241,12 @@ static int snapshot_set_swap_area(struct snapshot_data *data, data->swap = -1; return -EINVAL; } - data->swap = swap_type_of(swdev, offset, NULL); + data->swap = swap_type_of(swdev, offset, &bdev); if (data->swap < 0) return -ENODEV; + + data->bd_inode = bdev->bd_inode; + bdput(bdev); return 0; } diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h index 11f19c466af5..3ca74ad391d6 100644 --- a/kernel/printk/console_cmdline.h +++ b/kernel/printk/console_cmdline.h @@ -6,6 +6,7 @@ struct console_cmdline { char name[16]; /* Name of the driver */ int index; /* Minor dev. to use */ + bool user_specified; /* Specified by command line vs. platform */ char *options; /* Options for the driver */ #ifdef CONFIG_A11Y_BRAILLE_CONSOLE char *brl_options; /* Options for braille driver */ diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index b2b0f526f249..660f9a6bf73a 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -6,9 +6,11 @@ #ifdef CONFIG_PRINTK -#define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff -#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x40000000 -#define PRINTK_NMI_CONTEXT_MASK 0x80000000 +#define PRINTK_SAFE_CONTEXT_MASK 0x007ffffff +#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x008000000 +#define PRINTK_NMI_CONTEXT_MASK 0xff0000000 + +#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000 extern raw_spinlock_t logbuf_lock; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9a9b6156270b..9b75f6bfc333 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -35,7 +35,6 @@ #include <linux/memblock.h> #include <linux/syscalls.h> #include <linux/crash_core.h> -#include <linux/kdb.h> #include <linux/ratelimit.h> #include <linux/kmsg_dump.h> #include <linux/syslog.h> @@ -173,7 +172,7 @@ __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char old_str[DEVKMSG_STR_MAX_SIZE]; unsigned int old; @@ -280,6 +279,7 @@ static struct console *exclusive_console; static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; +static bool has_preferred_console; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); @@ -943,6 +943,14 @@ out: return ret; } +/* + * Be careful when modifying this function!!! + * + * Only few operations are supported because the device works only with the + * entire variable length messages (records). Non-standard values are + * returned in the other cases and has been this way for quite some time. + * User space applications might depend on this behavior. + */ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) { struct devkmsg_user *user = file->private_data; @@ -2036,18 +2044,7 @@ EXPORT_SYMBOL(vprintk); int vprintk_default(const char *fmt, va_list args) { - int r; - -#ifdef CONFIG_KGDB_KDB - /* Allow to pass printk() to kdb but avoid a recursion. */ - if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) { - r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); - return r; - } -#endif - r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); - - return r; + return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); } EXPORT_SYMBOL_GPL(vprintk_default); @@ -2140,7 +2137,7 @@ asmlinkage __visible void early_printk(const char *fmt, ...) #endif static int __add_preferred_console(char *name, int idx, char *options, - char *brl_options) + char *brl_options, bool user_specified) { struct console_cmdline *c; int i; @@ -2155,6 +2152,8 @@ static int __add_preferred_console(char *name, int idx, char *options, if (strcmp(c->name, name) == 0 && c->index == idx) { if (!brl_options) preferred_console = i; + if (user_specified) + c->user_specified = true; return 0; } } @@ -2164,6 +2163,7 @@ static int __add_preferred_console(char *name, int idx, char *options, preferred_console = i; strlcpy(c->name, name, sizeof(c->name)); c->options = options; + c->user_specified = user_specified; braille_set_options(c, brl_options); c->index = idx; @@ -2190,6 +2190,9 @@ static int __init console_setup(char *str) char *s, *options, *brl_options = NULL; int idx; + if (str[0] == 0) + return 1; + if (_braille_console_setup(&str, &brl_options)) return 1; @@ -2218,7 +2221,7 @@ static int __init console_setup(char *str) idx = simple_strtoul(s, NULL, 10); *s = 0; - __add_preferred_console(buf, idx, options, brl_options); + __add_preferred_console(buf, idx, options, brl_options, true); console_set_on_cmdline = 1; return 1; } @@ -2239,7 +2242,7 @@ __setup("console=", console_setup); */ int add_preferred_console(char *name, int idx, char *options) { - return __add_preferred_console(name, idx, options, NULL); + return __add_preferred_console(name, idx, options, NULL, false); } bool console_suspend_enabled = true; @@ -2438,9 +2441,9 @@ again: printk_safe_enter_irqsave(flags); raw_spin_lock(&logbuf_lock); if (console_seq < log_first_seq) { - len = sprintf(text, - "** %llu printk messages dropped **\n", - log_first_seq - console_seq); + len = snprintf(text, sizeof(text), + "** %llu printk messages dropped **\n", + log_first_seq - console_seq); /* messages are gone, move to first one */ console_seq = log_first_seq; @@ -2652,6 +2655,63 @@ static int __init keep_bootcon_setup(char *str) early_param("keep_bootcon", keep_bootcon_setup); /* + * This is called by register_console() to try to match + * the newly registered console with any of the ones selected + * by either the command line or add_preferred_console() and + * setup/enable it. + * + * Care need to be taken with consoles that are statically + * enabled such as netconsole + */ +static int try_enable_new_console(struct console *newcon, bool user_specified) +{ + struct console_cmdline *c; + int i, err; + + for (i = 0, c = console_cmdline; + i < MAX_CMDLINECONSOLES && c->name[0]; + i++, c++) { + if (c->user_specified != user_specified) + continue; + if (!newcon->match || + newcon->match(newcon, c->name, c->index, c->options) != 0) { + /* default matching */ + BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); + if (strcmp(c->name, newcon->name) != 0) + continue; + if (newcon->index >= 0 && + newcon->index != c->index) + continue; + if (newcon->index < 0) + newcon->index = c->index; + + if (_braille_register_console(newcon, c)) + return 0; + + if (newcon->setup && + (err = newcon->setup(newcon, c->options)) != 0) + return err; + } + newcon->flags |= CON_ENABLED; + if (i == preferred_console) { + newcon->flags |= CON_CONSDEV; + has_preferred_console = true; + } + return 0; + } + + /* + * Some consoles, such as pstore and netconsole, can be enabled even + * without matching. Accept the pre-enabled consoles only when match() + * and setup() had a chance to be called. + */ + if (newcon->flags & CON_ENABLED && c->user_specified == user_specified) + return 0; + + return -ENOENT; +} + +/* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to * print any messages that were printed by the kernel before the @@ -2672,11 +2732,9 @@ early_param("keep_bootcon", keep_bootcon_setup); */ void register_console(struct console *newcon) { - int i; unsigned long flags; struct console *bcon = NULL; - struct console_cmdline *c; - static bool has_preferred; + int err; for_each_console(bcon) { if (WARN(bcon == newcon, "console '%s%d' already registered\n", @@ -2701,15 +2759,15 @@ void register_console(struct console *newcon) if (console_drivers && console_drivers->flags & CON_BOOT) bcon = console_drivers; - if (!has_preferred || bcon || !console_drivers) - has_preferred = preferred_console >= 0; + if (!has_preferred_console || bcon || !console_drivers) + has_preferred_console = preferred_console >= 0; /* * See if we want to use this console driver. If we * didn't select a console we take the first one * that registers here. */ - if (!has_preferred) { + if (!has_preferred_console) { if (newcon->index < 0) newcon->index = 0; if (newcon->setup == NULL || @@ -2717,47 +2775,20 @@ void register_console(struct console *newcon) newcon->flags |= CON_ENABLED; if (newcon->device) { newcon->flags |= CON_CONSDEV; - has_preferred = true; + has_preferred_console = true; } } } - /* - * See if this console matches one we selected on - * the command line. - */ - for (i = 0, c = console_cmdline; - i < MAX_CMDLINECONSOLES && c->name[0]; - i++, c++) { - if (!newcon->match || - newcon->match(newcon, c->name, c->index, c->options) != 0) { - /* default matching */ - BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); - if (strcmp(c->name, newcon->name) != 0) - continue; - if (newcon->index >= 0 && - newcon->index != c->index) - continue; - if (newcon->index < 0) - newcon->index = c->index; - - if (_braille_register_console(newcon, c)) - return; + /* See if this console matches one we selected on the command line */ + err = try_enable_new_console(newcon, true); - if (newcon->setup && - newcon->setup(newcon, c->options) != 0) - break; - } + /* If not, try to match against the platform default(s) */ + if (err == -ENOENT) + err = try_enable_new_console(newcon, false); - newcon->flags |= CON_ENABLED; - if (i == preferred_console) { - newcon->flags |= CON_CONSDEV; - has_preferred = true; - } - break; - } - - if (!(newcon->flags & CON_ENABLED)) + /* printk() messages are not printed to the Braille console. */ + if (err || newcon->flags & CON_BRL) return; /* @@ -2779,6 +2810,8 @@ void register_console(struct console *newcon) console_drivers = newcon; if (newcon->next) newcon->next->flags &= ~CON_CONSDEV; + /* Ensure this flag is always set for the head of the list */ + newcon->flags |= CON_CONSDEV; } else { newcon->next = console_drivers->next; console_drivers->next = newcon; @@ -3144,6 +3177,23 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister); static bool always_kmsg_dump; module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); +const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason) +{ + switch (reason) { + case KMSG_DUMP_PANIC: + return "Panic"; + case KMSG_DUMP_OOPS: + return "Oops"; + case KMSG_DUMP_EMERG: + return "Emergency"; + case KMSG_DUMP_SHUTDOWN: + return "Shutdown"; + default: + return "Unknown"; + } +} +EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); + /** * kmsg_dump - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping @@ -3157,12 +3207,19 @@ void kmsg_dump(enum kmsg_dump_reason reason) struct kmsg_dumper *dumper; unsigned long flags; - if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) - return; - rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) { - if (dumper->max_reason && reason > dumper->max_reason) + enum kmsg_dump_reason max_reason = dumper->max_reason; + + /* + * If client has not provided a specific max_reason, default + * to KMSG_DUMP_OOPS, unless always_kmsg_dump was set. + */ + if (max_reason == KMSG_DUMP_UNDEF) { + max_reason = always_kmsg_dump ? KMSG_DUMP_MAX : + KMSG_DUMP_OOPS; + } + if (reason > max_reason) continue; /* initialize iterator with data about the stored records */ @@ -3360,7 +3417,7 @@ out: EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); /** - * kmsg_dump_rewind_nolock - reset the interator (unlocked version) + * kmsg_dump_rewind_nolock - reset the iterator (unlocked version) * @dumper: registered kmsg dumper * * Reset the dumper's iterator so that kmsg_dump_get_line() and @@ -3378,7 +3435,7 @@ void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) } /** - * kmsg_dump_rewind - reset the interator + * kmsg_dump_rewind - reset the iterator * @dumper: registered kmsg dumper * * Reset the dumper's iterator so that kmsg_dump_get_line() and diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index d9a659a686f3..50aeae770434 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -6,10 +6,12 @@ #include <linux/preempt.h> #include <linux/spinlock.h> #include <linux/debug_locks.h> +#include <linux/kdb.h> #include <linux/smp.h> #include <linux/cpumask.h> #include <linux/irq_work.h> #include <linux/printk.h> +#include <linux/kprobes.h> #include "internal.h" @@ -293,14 +295,14 @@ static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) return printk_safe_log_store(s, fmt, args); } -void notrace printk_nmi_enter(void) +void noinstr printk_nmi_enter(void) { - this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); + this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET); } -void notrace printk_nmi_exit(void) +void noinstr printk_nmi_exit(void) { - this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); + this_cpu_sub(printk_context, PRINTK_NMI_CONTEXT_OFFSET); } /* @@ -359,6 +361,12 @@ void __printk_safe_exit(void) __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { +#ifdef CONFIG_KGDB_KDB + /* Allow to pass printk() to kdb but avoid a recursion. */ + if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) + return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); +#endif + /* * Try to use the main logbuf even in NMI. But avoid calling console * drivers that might have their own locks. diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 1cc940fef17c..0ebe15a84985 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -70,13 +70,37 @@ config TREE_SRCU help This option selects the full-fledged version of SRCU. +config TASKS_RCU_GENERIC + def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU + select SRCU + help + This option enables generic infrastructure code supporting + task-based RCU implementations. Not for manual selection. + config TASKS_RCU def_bool PREEMPTION - select SRCU help This option enables a task-based RCU implementation that uses only voluntary context switch (not preemption!), idle, and - user-mode execution as quiescent states. + user-mode execution as quiescent states. Not for manual selection. + +config TASKS_RUDE_RCU + def_bool 0 + help + This option enables a task-based RCU implementation that uses + only context switch (including preemption) and user-mode + execution as quiescent states. It forces IPIs and context + switches on all online CPUs, including idle ones, so use + with caution. + +config TASKS_TRACE_RCU + def_bool 0 + help + This option enables a task-based RCU implementation that uses + explicit rcu_read_lock_trace() read-side markers, and allows + these readers to appear in the idle loop as well as on the CPU + hotplug code paths. It can force IPIs on online CPUs, including + idle ones, so use with caution. config RCU_STALL_COMMON def_bool TREE_RCU @@ -210,4 +234,22 @@ config RCU_NOCB_CPU Say Y here if you want to help to debug reduced OS jitter. Say N here if you are unsure. +config TASKS_TRACE_RCU_READ_MB + bool "Tasks Trace RCU readers use memory barriers in user and idle" + depends on RCU_EXPERT + default PREEMPT_RT || NR_CPUS < 8 + help + Use this option to further reduce the number of IPIs sent + to CPUs executing in userspace or idle during tasks trace + RCU grace periods. Given that a reasonable setting of + the rcupdate.rcu_task_ipi_delay kernel boot parameter + eliminates such IPIs for many workloads, proper setting + of this Kconfig option is important mostly for aggressive + real-time installations and for battery-powered devices, + hence the default chosen above. + + Say Y here if you hate IPIs. + Say N here if you hate read-side memory barriers. + Take the default if you are unsure. + endmenu # "RCU Subsystem" diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 4aa02eee8f6c..3cf6132a4bb9 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -29,6 +29,8 @@ config RCU_PERF_TEST select TORTURE_TEST select SRCU select TASKS_RCU + select TASKS_RUDE_RCU + select TASKS_TRACE_RCU default n help This option provides a kernel module that runs performance @@ -46,6 +48,8 @@ config RCU_TORTURE_TEST select TORTURE_TEST select SRCU select TASKS_RCU + select TASKS_RUDE_RCU + select TASKS_TRACE_RCU default n help This option provides a kernel module that runs torture tests @@ -57,6 +61,25 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. +config RCU_REF_SCALE_TEST + tristate "Scalability tests for read-side synchronization (RCU and others)" + depends on DEBUG_KERNEL + select TORTURE_TEST + select SRCU + select TASKS_RCU + select TASKS_RUDE_RCU + select TASKS_TRACE_RCU + default n + help + This option provides a kernel module that runs performance tests + useful comparing RCU with various read-side synchronization mechanisms. + The kernel module may be built after the fact on the running kernel to be + tested, if desired. + + Say Y here if you want these performance tests built into the kernel. + Say M if you want to build it as a module instead. + Say N if you are unsure. + config RCU_CPU_STALL_TIMEOUT int "RCU CPU stall timeout in seconds" depends on RCU_STALL_COMMON diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index f91f2c2cf138..95f5117ef8da 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o +obj-$(CONFIG_RCU_REF_SCALE_TEST) += refscale.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 00ddc92c5774..cf66a3ccd757 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -431,6 +431,7 @@ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ void rcu_expedite_gp(void); void rcu_unexpedite_gp(void); void rcupdate_announce_bootup_oddness(void); +void show_rcu_tasks_gp_kthreads(void); void rcu_request_urgent_qs_task(struct task_struct *t); #endif /* #else #ifdef CONFIG_TINY_RCU */ @@ -441,6 +442,8 @@ void rcu_request_urgent_qs_task(struct task_struct *t); enum rcutorture_type { RCU_FLAVOR, RCU_TASKS_FLAVOR, + RCU_TASKS_RUDE_FLAVOR, + RCU_TASKS_TRACING_FLAVOR, RCU_TRIVIAL_FLAVOR, SRCU_FLAVOR, INVALID_RCU_FLAVOR @@ -454,6 +457,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename, unsigned long secs, unsigned long c_old, unsigned long c); +void rcu_gp_set_torture_wait(int duration); #else static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, unsigned long *gp_seq) @@ -471,6 +475,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename, #define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ do { } while (0) #endif +static inline void rcu_gp_set_torture_wait(int duration) { } #endif #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) @@ -498,6 +503,7 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, #endif #ifdef CONFIG_TINY_RCU +static inline bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) { return false; } static inline unsigned long rcu_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } static inline unsigned long @@ -507,6 +513,7 @@ static inline void show_rcu_gp_kthreads(void) { } static inline int rcu_get_gp_kthreads_prio(void) { return 0; } static inline void rcu_fwd_progress_check(unsigned long j) { } #else /* #ifdef CONFIG_TINY_RCU */ +bool rcu_dynticks_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index a4a8d097d84d..21448d3374e2 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -69,6 +69,11 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); * value specified by nr_cpus for a read-only test. * * Various other use cases may of course be specified. + * + * Note that this test's readers are intended only as a test load for + * the writers. The reader performance statistics will be overly + * pessimistic due to the per-critical-section interrupt disabling, + * test-end checks, and the pair of calls through pointers. */ #ifdef MODULE @@ -88,6 +93,7 @@ torture_param(bool, shutdown, RCUPERF_SHUTDOWN, torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?"); +torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate."); static char *perf_type = "rcu"; module_param(perf_type, charp, 0444); @@ -308,8 +314,10 @@ static void rcu_perf_wait_shutdown(void) } /* - * RCU perf reader kthread. Repeatedly does empty RCU read-side - * critical section, minimizing update-side interference. + * RCU perf reader kthread. Repeatedly does empty RCU read-side critical + * section, minimizing update-side interference. However, the point of + * this test is not to evaluate reader performance, but instead to serve + * as a test load for update-side performance testing. */ static int rcu_perf_reader(void *arg) @@ -353,7 +361,6 @@ rcu_perf_writer(void *arg) int i_max; long me = (long)arg; struct rcu_head *rhp = NULL; - struct sched_param sp; bool started = false, done = false, alldone = false; u64 t; u64 *wdp; @@ -362,8 +369,7 @@ rcu_perf_writer(void *arg) VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); WARN_ON(!wdpp); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); - sp.sched_priority = 1; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + sched_set_fifo_low(current); if (holdoff) schedule_timeout_uninterruptible(holdoff * HZ); @@ -419,9 +425,7 @@ retry: started = true; if (!done && i >= MIN_MEAS) { done = true; - sp.sched_priority = 0; - sched_setscheduler_nocheck(current, - SCHED_NORMAL, &sp); + sched_set_normal(current, 0); pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n", perf_type, PERF_FLAG, me, MIN_MEAS); if (atomic_inc_return(&n_rcu_perf_writer_finished) >= @@ -575,11 +579,8 @@ static int compute_real(int n) static int rcu_perf_shutdown(void *arg) { - do { - wait_event(shutdown_wq, - atomic_read(&n_rcu_perf_writer_finished) >= - nrealwriters); - } while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters); + wait_event(shutdown_wq, + atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters); smp_mb(); /* Wake before output. */ rcu_perf_cleanup(); kernel_power_off(); @@ -635,7 +636,7 @@ kfree_perf_thread(void *arg) } for (i = 0; i < kfree_alloc_num; i++) { - alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL); + alloc_ptr = kmalloc(kfree_mult * sizeof(struct kfree_obj), GFP_KERNEL); if (!alloc_ptr) return -ENOMEM; @@ -692,11 +693,8 @@ kfree_perf_cleanup(void) static int kfree_perf_shutdown(void *arg) { - do { - wait_event(shutdown_wq, - atomic_read(&n_kfree_perf_thread_ended) >= - kfree_nrealthreads); - } while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads); + wait_event(shutdown_wq, + atomic_read(&n_kfree_perf_thread_ended) >= kfree_nrealthreads); smp_mb(); /* Wake before output. */ @@ -722,6 +720,8 @@ kfree_perf_init(void) schedule_timeout_uninterruptible(1); } + pr_alert("kfree object size=%zu\n", kfree_mult * sizeof(struct kfree_obj)); + kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]), GFP_KERNEL); if (kfree_reader_tasks == NULL) { diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 5453bd557f43..f453bf8d2f1e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -7,7 +7,7 @@ * Authors: Paul E. McKenney <paulmck@linux.ibm.com> * Josh Triplett <josh@joshtriplett.org> * - * See also: Documentation/RCU/torture.txt + * See also: Documentation/RCU/torture.rst */ #define pr_fmt(fmt) fmt @@ -20,7 +20,7 @@ #include <linux/err.h> #include <linux/spinlock.h> #include <linux/smp.h> -#include <linux/rcupdate.h> +#include <linux/rcupdate_wait.h> #include <linux/interrupt.h> #include <linux/sched/signal.h> #include <uapi/linux/sched/types.h> @@ -45,12 +45,25 @@ #include <linux/sched/sysctl.h> #include <linux/oom.h> #include <linux/tick.h> +#include <linux/rcupdate_trace.h> #include "rcu.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif /* Bits for ->extendables field, extendables param, and related definitions. */ #define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ @@ -96,12 +109,19 @@ torture_param(int, object_debug, 0, torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable"); +torture_param(int, read_exit_delay, 13, + "Delay between read-then-exit episodes (s)"); +torture_param(int, read_exit_burst, 16, + "# of read-then-exit bursts per episode, zero to disable"); torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s)."); torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); +torture_param(int, stall_cpu_block, 0, "Sleep while stalling."); +torture_param(int, stall_gp_kthread, 0, + "Grace-period kthread stall duration (s)."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of seconds to run/halt test"); @@ -130,6 +150,7 @@ static struct task_struct *stall_task; static struct task_struct *fwd_prog_task; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; +static struct task_struct *read_exit_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -161,6 +182,7 @@ static long n_rcu_torture_boosts; static atomic_long_t n_rcu_torture_timers; static long n_barrier_attempts; static long n_barrier_successes; /* did rcu_barrier test succeed? */ +static unsigned long n_read_exits; static struct list_head rcu_torture_removed; static unsigned long shutdown_jiffies; @@ -665,6 +687,11 @@ static void rcu_tasks_torture_deferred_free(struct rcu_torture *p) call_rcu_tasks(&p->rtort_rcu, rcu_torture_cb); } +static void synchronize_rcu_mult_test(void) +{ + synchronize_rcu_mult(call_rcu_tasks, call_rcu); +} + static struct rcu_torture_ops tasks_ops = { .ttype = RCU_TASKS_FLAVOR, .init = rcu_sync_torture_init, @@ -674,7 +701,7 @@ static struct rcu_torture_ops tasks_ops = { .get_gp_seq = rcu_no_completed, .deferred_free = rcu_tasks_torture_deferred_free, .sync = synchronize_rcu_tasks, - .exp_sync = synchronize_rcu_tasks, + .exp_sync = synchronize_rcu_mult_test, .call = call_rcu_tasks, .cb_barrier = rcu_barrier_tasks, .fqs = NULL, @@ -725,6 +752,72 @@ static struct rcu_torture_ops trivial_ops = { .name = "trivial" }; +/* + * Definitions for rude RCU-tasks torture testing. + */ + +static void rcu_tasks_rude_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_tasks_rude(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops tasks_rude_ops = { + .ttype = RCU_TASKS_RUDE_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock_trivial, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock_trivial, + .get_gp_seq = rcu_no_completed, + .deferred_free = rcu_tasks_rude_torture_deferred_free, + .sync = synchronize_rcu_tasks_rude, + .exp_sync = synchronize_rcu_tasks_rude, + .call = call_rcu_tasks_rude, + .cb_barrier = rcu_barrier_tasks_rude, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "tasks-rude" +}; + +/* + * Definitions for tracing RCU-tasks torture testing. + */ + +static int tasks_tracing_torture_read_lock(void) +{ + rcu_read_lock_trace(); + return 0; +} + +static void tasks_tracing_torture_read_unlock(int idx) +{ + rcu_read_unlock_trace(); +} + +static void rcu_tasks_tracing_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_tasks_trace(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops tasks_tracing_ops = { + .ttype = RCU_TASKS_TRACING_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = tasks_tracing_torture_read_lock, + .read_delay = srcu_read_delay, /* just reuse srcu's version. */ + .readunlock = tasks_tracing_torture_read_unlock, + .get_gp_seq = rcu_no_completed, + .deferred_free = rcu_tasks_tracing_torture_deferred_free, + .sync = synchronize_rcu_tasks_trace, + .exp_sync = synchronize_rcu_tasks_trace, + .call = call_rcu_tasks_trace, + .cb_barrier = rcu_barrier_tasks_trace, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .slow_gps = 1, + .name = "tasks-tracing" +}; + static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -734,7 +827,7 @@ static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) static bool __maybe_unused torturing_tasks(void) { - return cur_ops == &tasks_ops; + return cur_ops == &tasks_ops || cur_ops == &tasks_rude_ops; } /* @@ -802,16 +895,11 @@ static int rcu_torture_boost(void *arg) unsigned long endtime; unsigned long oldstarttime; struct rcu_boost_inflight rbi = { .inflight = 0 }; - struct sched_param sp; VERBOSE_TOROUT_STRING("rcu_torture_boost started"); /* Set real-time priority. */ - sp.sched_priority = 1; - if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { - VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!"); - n_rcu_torture_boost_rterror++; - } + sched_set_fifo_low(current); init_rcu_head_on_stack(&rbi.rcu); /* Each pass through the following loop does one boost-test cycle. */ @@ -833,7 +921,7 @@ static int rcu_torture_boost(void *arg) /* Wait for the next test interval. */ oldstarttime = boost_starttime; - while (ULONG_CMP_LT(jiffies, oldstarttime)) { + while (time_before(jiffies, oldstarttime)) { schedule_timeout_interruptible(oldstarttime - jiffies); stutter_wait("rcu_torture_boost"); if (torture_must_stop()) @@ -843,7 +931,7 @@ static int rcu_torture_boost(void *arg) /* Do one boost-test interval. */ endtime = oldstarttime + test_boost_duration * HZ; call_rcu_time = jiffies; - while (ULONG_CMP_LT(jiffies, endtime)) { + while (time_before(jiffies, endtime)) { /* If we don't have a callback in flight, post one. */ if (!smp_load_acquire(&rbi.inflight)) { /* RCU core before ->inflight = 1. */ @@ -914,7 +1002,7 @@ rcu_torture_fqs(void *arg) VERBOSE_TOROUT_STRING("rcu_torture_fqs task started"); do { fqs_resume_time = jiffies + fqs_stutter * HZ; - while (ULONG_CMP_LT(jiffies, fqs_resume_time) && + while (time_before(jiffies, fqs_resume_time) && !kthread_should_stop()) { schedule_timeout_interruptible(1); } @@ -1079,6 +1167,7 @@ rcu_torture_writer(void *arg) WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); } } while (!torture_must_stop()); + rcu_torture_current = NULL; // Let stats task know that we are done. /* Reset expediting back to unexpedited. */ if (expediting > 0) expediting = -expediting; @@ -1147,6 +1236,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { + unsigned long flags; int idxnew = -1; int idxold = *readstate; int statesnew = ~*readstate & newstate; @@ -1181,8 +1271,15 @@ static void rcutorture_one_extend(int *readstate, int newstate, rcu_read_unlock_bh(); if (statesold & RCUTORTURE_RDR_SCHED) rcu_read_unlock_sched(); - if (statesold & RCUTORTURE_RDR_RCU) + if (statesold & RCUTORTURE_RDR_RCU) { + bool lockit = !statesnew && !(torture_random(trsp) & 0xffff); + + if (lockit) + raw_spin_lock_irqsave(¤t->pi_lock, flags); cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT); + if (lockit) + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + } /* Delay if neither beginning nor end and there was a change. */ if ((statesnew || statesold) && *readstate && newstate) @@ -1275,6 +1372,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) struct rt_read_seg *rtrsp1; unsigned long long ts; + WARN_ON_ONCE(!rcu_is_watching()); newstate = rcutorture_extend_mask(readstate, trsp); rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++); started = cur_ops->get_gp_seq(); @@ -1283,6 +1381,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || srcu_read_lock_held(srcu_ctlp) || + rcu_read_lock_trace_held() || torturing_tasks()); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ @@ -1443,10 +1542,11 @@ rcu_torture_stats_print(void) n_rcu_torture_boosts, atomic_long_read(&n_rcu_torture_timers)); torture_onoff_stats(); - pr_cont("barrier: %ld/%ld:%ld\n", - n_barrier_successes, - n_barrier_attempts, - n_rcu_torture_barrier_error); + pr_cont("barrier: %ld/%ld:%ld ", + data_race(n_barrier_successes), + data_race(n_barrier_attempts), + data_race(n_rcu_torture_barrier_error)); + pr_cont("read-exits: %ld\n", data_race(n_read_exits)); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) || @@ -1536,16 +1636,20 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "test_boost=%d/%d test_boost_interval=%d " "test_boost_duration=%d shutdown_secs=%d " "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " + "stall_cpu_block=%d " "n_barrier_cbs=%d " - "onoff_interval=%d onoff_holdoff=%d\n", + "onoff_interval=%d onoff_holdoff=%d " + "read_exit_delay=%d read_exit_burst=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, test_boost_interval, test_boost_duration, shutdown_secs, stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, + stall_cpu_block, n_barrier_cbs, - onoff_interval, onoff_holdoff); + onoff_interval, onoff_holdoff, + read_exit_delay, read_exit_burst); } static int rcutorture_booster_cleanup(unsigned int cpu) @@ -1599,6 +1703,7 @@ static int rcutorture_booster_init(unsigned int cpu) */ static int rcu_torture_stall(void *args) { + int idx; unsigned long stop_at; VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); @@ -1607,26 +1712,37 @@ static int rcu_torture_stall(void *args) schedule_timeout_interruptible(stall_cpu_holdoff * HZ); VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff"); } - if (!kthread_should_stop()) { + if (!kthread_should_stop() && stall_gp_kthread > 0) { + VERBOSE_TOROUT_STRING("rcu_torture_stall begin GP stall"); + rcu_gp_set_torture_wait(stall_gp_kthread * HZ); + for (idx = 0; idx < stall_gp_kthread + 2; idx++) { + if (kthread_should_stop()) + break; + schedule_timeout_uninterruptible(HZ); + } + } + if (!kthread_should_stop() && stall_cpu > 0) { + VERBOSE_TOROUT_STRING("rcu_torture_stall begin CPU stall"); stop_at = ktime_get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ - rcu_read_lock(); + idx = cur_ops->readlock(); if (stall_cpu_irqsoff) local_irq_disable(); - else + else if (!stall_cpu_block) preempt_disable(); pr_alert("rcu_torture_stall start on CPU %d.\n", - smp_processor_id()); + raw_smp_processor_id()); while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at)) - continue; /* Induce RCU CPU stall warning. */ + if (stall_cpu_block) + schedule_timeout_uninterruptible(HZ); if (stall_cpu_irqsoff) local_irq_enable(); - else + else if (!stall_cpu_block) preempt_enable(); - rcu_read_unlock(); - pr_alert("rcu_torture_stall end.\n"); + cur_ops->readunlock(idx); } + pr_alert("rcu_torture_stall end.\n"); torture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) schedule_timeout_interruptible(10 * HZ); @@ -1636,7 +1752,7 @@ static int rcu_torture_stall(void *args) /* Spawn CPU-stall kthread, if stall_cpu specified. */ static int __init rcu_torture_stall_init(void) { - if (stall_cpu <= 0) + if (stall_cpu <= 0 && stall_gp_kthread <= 0) return 0; return torture_create_kthread(rcu_torture_stall, NULL, stall_task); } @@ -1692,8 +1808,8 @@ struct rcu_fwd { unsigned long rcu_launder_gp_seq_start; }; -struct rcu_fwd *rcu_fwds; -bool rcu_fwd_emergency_stop; +static struct rcu_fwd *rcu_fwds; +static bool rcu_fwd_emergency_stop; static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) { @@ -2065,7 +2181,7 @@ static void rcu_torture_barrier1cb(void *rcu_void) static int rcu_torture_barrier_cbs(void *arg) { long myid = (long)arg; - bool lastphase = 0; + bool lastphase = false; bool newphase; struct rcu_head rcu; @@ -2228,6 +2344,99 @@ static bool rcu_torture_can_boost(void) return true; } +static bool read_exit_child_stop; +static bool read_exit_child_stopped; +static wait_queue_head_t read_exit_wq; + +// Child kthread which just does an rcutorture reader and exits. +static int rcu_torture_read_exit_child(void *trsp_in) +{ + struct torture_random_state *trsp = trsp_in; + + set_user_nice(current, MAX_NICE); + // Minimize time between reading and exiting. + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + (void)rcu_torture_one_read(trsp); + return 0; +} + +// Parent kthread which creates and destroys read-exit child kthreads. +static int rcu_torture_read_exit(void *unused) +{ + int count = 0; + bool errexit = false; + int i; + struct task_struct *tsp; + DEFINE_TORTURE_RANDOM(trs); + + // Allocate and initialize. + set_user_nice(current, MAX_NICE); + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of test"); + + // Each pass through this loop does one read-exit episode. + do { + if (++count > read_exit_burst) { + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode"); + rcu_barrier(); // Wait for task_struct free, avoid OOM. + for (i = 0; i < read_exit_delay; i++) { + schedule_timeout_uninterruptible(HZ); + if (READ_ONCE(read_exit_child_stop)) + break; + } + if (!READ_ONCE(read_exit_child_stop)) + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode"); + count = 0; + } + if (READ_ONCE(read_exit_child_stop)) + break; + // Spawn child. + tsp = kthread_run(rcu_torture_read_exit_child, + &trs, "%s", + "rcu_torture_read_exit_child"); + if (IS_ERR(tsp)) { + VERBOSE_TOROUT_ERRSTRING("out of memory"); + errexit = true; + tsp = NULL; + break; + } + cond_resched(); + kthread_stop(tsp); + n_read_exits ++; + stutter_wait("rcu_torture_read_exit"); + } while (!errexit && !READ_ONCE(read_exit_child_stop)); + + // Clean up and exit. + smp_store_release(&read_exit_child_stopped, true); // After reaping. + smp_mb(); // Store before wakeup. + wake_up(&read_exit_wq); + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); + torture_kthread_stopping("rcu_torture_read_exit"); + return 0; +} + +static int rcu_torture_read_exit_init(void) +{ + if (read_exit_burst <= 0) + return -EINVAL; + init_waitqueue_head(&read_exit_wq); + read_exit_child_stop = false; + read_exit_child_stopped = false; + return torture_create_kthread(rcu_torture_read_exit, NULL, + read_exit_task); +} + +static void rcu_torture_read_exit_cleanup(void) +{ + if (!read_exit_task) + return; + WRITE_ONCE(read_exit_child_stop, true); + smp_mb(); // Above write before wait. + wait_event(read_exit_wq, smp_load_acquire(&read_exit_child_stopped)); + torture_stop_kthread(rcutorture_read_exit, read_exit_task); +} + static enum cpuhp_state rcutor_hp; static void @@ -2249,6 +2458,7 @@ rcu_torture_cleanup(void) } show_rcu_gp_kthreads(); + rcu_torture_read_exit_cleanup(); rcu_torture_barrier_cleanup(); torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); torture_stop_kthread(rcu_torture_stall, stall_task); @@ -2260,7 +2470,6 @@ rcu_torture_cleanup(void) reader_tasks[i]); kfree(reader_tasks); } - rcu_torture_current = NULL; if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) { @@ -2400,7 +2609,8 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &busted_srcud_ops, &tasks_ops, &trivial_ops, + &busted_srcud_ops, &tasks_ops, &tasks_rude_ops, + &tasks_tracing_ops, &trivial_ops, }; if (!torture_init_begin(torture_type, verbose)) @@ -2571,6 +2781,9 @@ rcu_torture_init(void) firsterr = rcu_torture_barrier_init(); if (firsterr) goto unwind; + firsterr = rcu_torture_read_exit_init(); + if (firsterr) + goto unwind; if (object_debug) rcu_test_debug_objects(); torture_init_end(); diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c new file mode 100644 index 000000000000..d9291f883b54 --- /dev/null +++ b/kernel/rcu/refscale.c @@ -0,0 +1,717 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Scalability test comparing RCU vs other mechanisms +// for acquiring references on objects. +// +// Copyright (C) Google, 2020. +// +// Author: Joel Fernandes <joel@joelfernandes.org> + +#define pr_fmt(fmt) fmt + +#include <linux/atomic.h> +#include <linux/bitops.h> +#include <linux/completion.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/kthread.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/notifier.h> +#include <linux/percpu.h> +#include <linux/rcupdate.h> +#include <linux/rcupdate_trace.h> +#include <linux/reboot.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/stat.h> +#include <linux/srcu.h> +#include <linux/slab.h> +#include <linux/torture.h> +#include <linux/types.h> + +#include "rcu.h" + +#define SCALE_FLAG "-ref-scale: " + +#define SCALEOUT(s, x...) \ + pr_alert("%s" SCALE_FLAG s, scale_type, ## x) + +#define VERBOSE_SCALEOUT(s, x...) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG s, scale_type, ## x); } while (0) + +#define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x); } while (0) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>"); + +static char *scale_type = "rcu"; +module_param(scale_type, charp, 0444); +MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); + +torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); + +// Wait until there are multiple CPUs before starting test. +torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, + "Holdoff time before test start (s)"); +// Number of loops per experiment, all readers execute operations concurrently. +torture_param(long, loops, 10000, "Number of loops per experiment."); +// Number of readers, with -1 defaulting to about 75% of the CPUs. +torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); +// Number of runs. +torture_param(int, nruns, 30, "Number of experiments to run."); +// Reader delay in nanoseconds, 0 for no delay. +torture_param(int, readdelay, 0, "Read-side delay in nanoseconds."); + +#ifdef MODULE +# define REFSCALE_SHUTDOWN 0 +#else +# define REFSCALE_SHUTDOWN 1 +#endif + +torture_param(bool, shutdown, REFSCALE_SHUTDOWN, + "Shutdown at end of scalability tests."); + +struct reader_task { + struct task_struct *task; + int start_reader; + wait_queue_head_t wq; + u64 last_duration_ns; +}; + +static struct task_struct *shutdown_task; +static wait_queue_head_t shutdown_wq; + +static struct task_struct *main_task; +static wait_queue_head_t main_wq; +static int shutdown_start; + +static struct reader_task *reader_tasks; + +// Number of readers that are part of the current experiment. +static atomic_t nreaders_exp; + +// Use to wait for all threads to start. +static atomic_t n_init; +static atomic_t n_started; +static atomic_t n_warmedup; +static atomic_t n_cooleddown; + +// Track which experiment is currently running. +static int exp_idx; + +// Operations vector for selecting different types of tests. +struct ref_scale_ops { + void (*init)(void); + void (*cleanup)(void); + void (*readsection)(const int nloops); + void (*delaysection)(const int nloops, const int udl, const int ndl); + const char *name; +}; + +static struct ref_scale_ops *cur_ops; + +static void un_delay(const int udl, const int ndl) +{ + if (udl) + udelay(udl); + if (ndl) + ndelay(ndl); +} + +static void ref_rcu_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock(); + rcu_read_unlock(); + } +} + +static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock(); + un_delay(udl, ndl); + rcu_read_unlock(); + } +} + +static void rcu_sync_scale_init(void) +{ +} + +static struct ref_scale_ops rcu_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_rcu_read_section, + .delaysection = ref_rcu_delay_section, + .name = "rcu" +}; + +// Definitions for SRCU ref scale testing. +DEFINE_STATIC_SRCU(srcu_refctl_scale); +static struct srcu_struct *srcu_ctlp = &srcu_refctl_scale; + +static void srcu_ref_scale_read_section(const int nloops) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock(srcu_ctlp); + srcu_read_unlock(srcu_ctlp, idx); + } +} + +static void srcu_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock(srcu_ctlp, idx); + } +} + +static struct ref_scale_ops srcu_ops = { + .init = rcu_sync_scale_init, + .readsection = srcu_ref_scale_read_section, + .delaysection = srcu_ref_scale_delay_section, + .name = "srcu" +}; + +// Definitions for RCU Tasks ref scale testing: Empty read markers. +// These definitions also work for RCU Rude readers. +static void rcu_tasks_ref_scale_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) + continue; +} + +static void rcu_tasks_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) + un_delay(udl, ndl); +} + +static struct ref_scale_ops rcu_tasks_ops = { + .init = rcu_sync_scale_init, + .readsection = rcu_tasks_ref_scale_read_section, + .delaysection = rcu_tasks_ref_scale_delay_section, + .name = "rcu-tasks" +}; + +// Definitions for RCU Tasks Trace ref scale testing. +static void rcu_trace_ref_scale_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock_trace(); + rcu_read_unlock_trace(); + } +} + +static void rcu_trace_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock_trace(); + un_delay(udl, ndl); + rcu_read_unlock_trace(); + } +} + +static struct ref_scale_ops rcu_trace_ops = { + .init = rcu_sync_scale_init, + .readsection = rcu_trace_ref_scale_read_section, + .delaysection = rcu_trace_ref_scale_delay_section, + .name = "rcu-trace" +}; + +// Definitions for reference count +static atomic_t refcnt; + +static void ref_refcnt_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + atomic_inc(&refcnt); + atomic_dec(&refcnt); + } +} + +static void ref_refcnt_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + atomic_inc(&refcnt); + un_delay(udl, ndl); + atomic_dec(&refcnt); + } +} + +static struct ref_scale_ops refcnt_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_refcnt_section, + .delaysection = ref_refcnt_delay_section, + .name = "refcnt" +}; + +// Definitions for rwlock +static rwlock_t test_rwlock; + +static void ref_rwlock_init(void) +{ + rwlock_init(&test_rwlock); +} + +static void ref_rwlock_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + read_lock(&test_rwlock); + read_unlock(&test_rwlock); + } +} + +static void ref_rwlock_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + read_lock(&test_rwlock); + un_delay(udl, ndl); + read_unlock(&test_rwlock); + } +} + +static struct ref_scale_ops rwlock_ops = { + .init = ref_rwlock_init, + .readsection = ref_rwlock_section, + .delaysection = ref_rwlock_delay_section, + .name = "rwlock" +}; + +// Definitions for rwsem +static struct rw_semaphore test_rwsem; + +static void ref_rwsem_init(void) +{ + init_rwsem(&test_rwsem); +} + +static void ref_rwsem_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + down_read(&test_rwsem); + up_read(&test_rwsem); + } +} + +static void ref_rwsem_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + down_read(&test_rwsem); + un_delay(udl, ndl); + up_read(&test_rwsem); + } +} + +static struct ref_scale_ops rwsem_ops = { + .init = ref_rwsem_init, + .readsection = ref_rwsem_section, + .delaysection = ref_rwsem_delay_section, + .name = "rwsem" +}; + +static void rcu_scale_one_reader(void) +{ + if (readdelay <= 0) + cur_ops->readsection(loops); + else + cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000); +} + +// Reader kthread. Repeatedly does empty RCU read-side +// critical section, minimizing update-side interference. +static int +ref_scale_reader(void *arg) +{ + unsigned long flags; + long me = (long)arg; + struct reader_task *rt = &(reader_tasks[me]); + u64 start; + s64 duration; + + VERBOSE_SCALEOUT("ref_scale_reader %ld: task started", me); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + atomic_inc(&n_init); + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); +repeat: + VERBOSE_SCALEOUT("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); + + // Wait for signal that this reader can start. + wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) || + torture_must_stop()); + + if (torture_must_stop()) + goto end; + + // Make sure that the CPU is affinitized appropriately during testing. + WARN_ON_ONCE(smp_processor_id() != me); + + WRITE_ONCE(rt->start_reader, 0); + if (!atomic_dec_return(&n_started)) + while (atomic_read_acquire(&n_started)) + cpu_relax(); + + VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d started", me, exp_idx); + + + // To reduce noise, do an initial cache-warming invocation, check + // in, and then keep warming until everyone has checked in. + rcu_scale_one_reader(); + if (!atomic_dec_return(&n_warmedup)) + while (atomic_read_acquire(&n_warmedup)) + rcu_scale_one_reader(); + // Also keep interrupts disabled. This also has the effect + // of preventing entries into slow path for rcu_read_unlock(). + local_irq_save(flags); + start = ktime_get_mono_fast_ns(); + + rcu_scale_one_reader(); + + duration = ktime_get_mono_fast_ns() - start; + local_irq_restore(flags); + + rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration; + // To reduce runtime-skew noise, do maintain-load invocations until + // everyone is done. + if (!atomic_dec_return(&n_cooleddown)) + while (atomic_read_acquire(&n_cooleddown)) + rcu_scale_one_reader(); + + if (atomic_dec_and_test(&nreaders_exp)) + wake_up(&main_wq); + + VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)", + me, exp_idx, atomic_read(&nreaders_exp)); + + if (!torture_must_stop()) + goto repeat; +end: + torture_kthread_stopping("ref_scale_reader"); + return 0; +} + +static void reset_readers(void) +{ + int i; + struct reader_task *rt; + + for (i = 0; i < nreaders; i++) { + rt = &(reader_tasks[i]); + + rt->last_duration_ns = 0; + } +} + +// Print the results of each reader and return the sum of all their durations. +static u64 process_durations(int n) +{ + int i; + struct reader_task *rt; + char buf1[64]; + char *buf; + u64 sum = 0; + + buf = kmalloc(128 + nreaders * 32, GFP_KERNEL); + if (!buf) + return 0; + buf[0] = 0; + sprintf(buf, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)", + exp_idx); + + for (i = 0; i < n && !torture_must_stop(); i++) { + rt = &(reader_tasks[i]); + sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns); + + if (i % 5 == 0) + strcat(buf, "\n"); + strcat(buf, buf1); + + sum += rt->last_duration_ns; + } + strcat(buf, "\n"); + + SCALEOUT("%s\n", buf); + + kfree(buf); + return sum; +} + +// The main_func is the main orchestrator, it performs a bunch of +// experiments. For every experiment, it orders all the readers +// involved to start and waits for them to finish the experiment. It +// then reads their timestamps and starts the next experiment. Each +// experiment progresses from 1 concurrent reader to N of them at which +// point all the timestamps are printed. +static int main_func(void *arg) +{ + bool errexit = false; + int exp, r; + char buf1[64]; + char *buf; + u64 *result_avg; + + set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + + VERBOSE_SCALEOUT("main_func task started"); + result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL); + buf = kzalloc(64 + nruns * 32, GFP_KERNEL); + if (!result_avg || !buf) { + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); + errexit = true; + } + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); + + // Wait for all threads to start. + atomic_inc(&n_init); + while (atomic_read(&n_init) < nreaders + 1) + schedule_timeout_uninterruptible(1); + + // Start exp readers up per experiment + for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { + if (errexit) + break; + if (torture_must_stop()) + goto end; + + reset_readers(); + atomic_set(&nreaders_exp, nreaders); + atomic_set(&n_started, nreaders); + atomic_set(&n_warmedup, nreaders); + atomic_set(&n_cooleddown, nreaders); + + exp_idx = exp; + + for (r = 0; r < nreaders; r++) { + smp_store_release(&reader_tasks[r].start_reader, 1); + wake_up(&reader_tasks[r].wq); + } + + VERBOSE_SCALEOUT("main_func: experiment started, waiting for %d readers", + nreaders); + + wait_event(main_wq, + !atomic_read(&nreaders_exp) || torture_must_stop()); + + VERBOSE_SCALEOUT("main_func: experiment ended"); + + if (torture_must_stop()) + goto end; + + result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops); + } + + // Print the average of all experiments + SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); + + buf[0] = 0; + strcat(buf, "\n"); + strcat(buf, "Runs\tTime(ns)\n"); + + for (exp = 0; exp < nruns; exp++) { + u64 avg; + u32 rem; + + if (errexit) + break; + avg = div_u64_rem(result_avg[exp], 1000, &rem); + sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem); + strcat(buf, buf1); + } + + if (!errexit) + SCALEOUT("%s", buf); + + // This will shutdown everything including us. + if (shutdown) { + shutdown_start = 1; + wake_up(&shutdown_wq); + } + + // Wait for torture to stop us + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); + +end: + torture_kthread_stopping("main_func"); + kfree(result_avg); + kfree(buf); + return 0; +} + +static void +ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag) +{ + pr_alert("%s" SCALE_FLAG + "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, + verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay); +} + +static void +ref_scale_cleanup(void) +{ + int i; + + if (torture_cleanup_begin()) + return; + + if (!cur_ops) { + torture_cleanup_end(); + return; + } + + if (reader_tasks) { + for (i = 0; i < nreaders; i++) + torture_stop_kthread("ref_scale_reader", + reader_tasks[i].task); + } + kfree(reader_tasks); + + torture_stop_kthread("main_task", main_task); + kfree(main_task); + + // Do scale-type-specific cleanup operations. + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +// Shutdown kthread. Just waits to be awakened, then shuts down system. +static int +ref_scale_shutdown(void *arg) +{ + wait_event(shutdown_wq, shutdown_start); + + smp_mb(); // Wake before output. + ref_scale_cleanup(); + kernel_power_off(); + + return -EINVAL; +} + +static int __init +ref_scale_init(void) +{ + long i; + int firsterr = 0; + static struct ref_scale_ops *scale_ops[] = { + &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, + &refcnt_ops, &rwlock_ops, &rwsem_ops, + }; + + if (!torture_init_begin(scale_type, verbose)) + return -EBUSY; + + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) { + cur_ops = scale_ops[i]; + if (strcmp(scale_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(scale_ops)) { + pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type); + pr_alert("rcu-scale types:"); + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) + pr_cont(" %s", scale_ops[i]->name); + pr_cont("\n"); + WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST)); + firsterr = -EINVAL; + cur_ops = NULL; + goto unwind; + } + if (cur_ops->init) + cur_ops->init(); + + ref_scale_print_module_parms(cur_ops, "Start of test"); + + // Shutdown task + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(ref_scale_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + + // Reader tasks (default to ~75% of online CPUs). + if (nreaders < 0) + nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); + reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), + GFP_KERNEL); + if (!reader_tasks) { + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + + VERBOSE_SCALEOUT("Starting %d reader threads\n", nreaders); + + for (i = 0; i < nreaders; i++) { + firsterr = torture_create_kthread(ref_scale_reader, (void *)i, + reader_tasks[i].task); + if (firsterr) + goto unwind; + + init_waitqueue_head(&(reader_tasks[i].wq)); + } + + // Main Task + init_waitqueue_head(&main_wq); + firsterr = torture_create_kthread(main_func, NULL, main_task); + if (firsterr) + goto unwind; + + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + ref_scale_cleanup(); + return firsterr; +} + +module_init(ref_scale_init); +module_exit(ref_scale_cleanup); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0c71505f0e19..c100acf332ed 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -29,6 +29,19 @@ #include "rcu.h" #include "rcu_segcblist.h" +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif + /* Holdoff in nanoseconds for auto-expediting. */ #define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000) static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF; @@ -753,7 +766,7 @@ static void srcu_flip(struct srcu_struct *ssp) * it, if this function was preempted for enough time for the counters * to wrap, it really doesn't matter whether or not we expedite the grace * period. The extra overhead of a needlessly expedited grace period is - * negligible when amoritized over that time period, and the extra latency + * negligible when amortized over that time period, and the extra latency * of a needlessly non-expedited grace period is similarly negligible. */ static bool srcu_might_be_idle(struct srcu_struct *ssp) @@ -764,14 +777,15 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) unsigned long t; unsigned long tlast; + check_init_srcu_struct(ssp); /* If the local srcu_data structure has callbacks, not idle. */ - local_irq_save(flags); - sdp = this_cpu_ptr(ssp->sda); + sdp = raw_cpu_ptr(ssp->sda); + spin_lock_irqsave_rcu_node(sdp, flags); if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) { - local_irq_restore(flags); + spin_unlock_irqrestore_rcu_node(sdp, flags); return false; /* Callbacks already present, so not idle. */ } - local_irq_restore(flags); + spin_unlock_irqrestore_rcu_node(sdp, flags); /* * No local callbacks, so probabalistically probe global state. @@ -851,9 +865,8 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, } rhp->func = func; idx = srcu_read_lock(ssp); - local_irq_save(flags); - sdp = this_cpu_ptr(ssp->sda); - spin_lock_rcu_node(sdp); + sdp = raw_cpu_ptr(ssp->sda); + spin_lock_irqsave_rcu_node(sdp, flags); rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_gp_seq)); @@ -1268,8 +1281,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) struct srcu_data *sdp; sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = sdp->srcu_unlock_count[!idx]; - u1 = sdp->srcu_unlock_count[idx]; + u0 = data_race(sdp->srcu_unlock_count[!idx]); + u1 = data_race(sdp->srcu_unlock_count[idx]); /* * Make sure that a lock is always counted if the corresponding @@ -1277,8 +1290,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) */ smp_rmb(); - l0 = sdp->srcu_lock_count[!idx]; - l1 = sdp->srcu_lock_count[idx]; + l0 = data_race(sdp->srcu_lock_count[!idx]); + l1 = data_race(sdp->srcu_lock_count[idx]); c0 = l0 - u0; c1 = l1 - u1; diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h new file mode 100644 index 000000000000..835e2df8590a --- /dev/null +++ b/kernel/rcu/tasks.h @@ -0,0 +1,1206 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Task-based RCU implementations. + * + * Copyright (C) 2020 Paul E. McKenney + */ + +#ifdef CONFIG_TASKS_RCU_GENERIC + +//////////////////////////////////////////////////////////////////////// +// +// Generic data structures. + +struct rcu_tasks; +typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp); +typedef void (*pregp_func_t)(void); +typedef void (*pertask_func_t)(struct task_struct *t, struct list_head *hop); +typedef void (*postscan_func_t)(struct list_head *hop); +typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp); +typedef void (*postgp_func_t)(struct rcu_tasks *rtp); + +/** + * Definition for a Tasks-RCU-like mechanism. + * @cbs_head: Head of callback list. + * @cbs_tail: Tail pointer for callback list. + * @cbs_wq: Wait queue allowning new callback to get kthread's attention. + * @cbs_lock: Lock protecting callback list. + * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. + * @gp_func: This flavor's grace-period-wait function. + * @gp_state: Grace period's most recent state transition (debugging). + * @gp_jiffies: Time of last @gp_state transition. + * @gp_start: Most recent grace-period start in jiffies. + * @n_gps: Number of grace periods completed since boot. + * @n_ipis: Number of IPIs sent to encourage grace periods to end. + * @n_ipis_fails: Number of IPI-send failures. + * @pregp_func: This flavor's pre-grace-period function (optional). + * @pertask_func: This flavor's per-task scan function (optional). + * @postscan_func: This flavor's post-task scan function (optional). + * @holdout_func: This flavor's holdout-list scan function (optional). + * @postgp_func: This flavor's post-grace-period function (optional). + * @call_func: This flavor's call_rcu()-equivalent function. + * @name: This flavor's textual name. + * @kname: This flavor's kthread name. + */ +struct rcu_tasks { + struct rcu_head *cbs_head; + struct rcu_head **cbs_tail; + struct wait_queue_head cbs_wq; + raw_spinlock_t cbs_lock; + int gp_state; + unsigned long gp_jiffies; + unsigned long gp_start; + unsigned long n_gps; + unsigned long n_ipis; + unsigned long n_ipis_fails; + struct task_struct *kthread_ptr; + rcu_tasks_gp_func_t gp_func; + pregp_func_t pregp_func; + pertask_func_t pertask_func; + postscan_func_t postscan_func; + holdouts_func_t holdouts_func; + postgp_func_t postgp_func; + call_rcu_func_t call_func; + char *name; + char *kname; +}; + +#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \ +static struct rcu_tasks rt_name = \ +{ \ + .cbs_tail = &rt_name.cbs_head, \ + .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \ + .cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_lock), \ + .gp_func = gp, \ + .call_func = call, \ + .name = n, \ + .kname = #rt_name, \ +} + +/* Track exiting tasks in order to allow them to be waited for. */ +DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); + +/* Avoid IPIing CPUs early in the grace period. */ +#define RCU_TASK_IPI_DELAY (HZ / 2) +static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY; +module_param(rcu_task_ipi_delay, int, 0644); + +/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ +#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) +static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; +module_param(rcu_task_stall_timeout, int, 0644); + +/* RCU tasks grace-period state for debugging. */ +#define RTGS_INIT 0 +#define RTGS_WAIT_WAIT_CBS 1 +#define RTGS_WAIT_GP 2 +#define RTGS_PRE_WAIT_GP 3 +#define RTGS_SCAN_TASKLIST 4 +#define RTGS_POST_SCAN_TASKLIST 5 +#define RTGS_WAIT_SCAN_HOLDOUTS 6 +#define RTGS_SCAN_HOLDOUTS 7 +#define RTGS_POST_GP 8 +#define RTGS_WAIT_READERS 9 +#define RTGS_INVOKE_CBS 10 +#define RTGS_WAIT_CBS 11 +#ifndef CONFIG_TINY_RCU +static const char * const rcu_tasks_gp_state_names[] = { + "RTGS_INIT", + "RTGS_WAIT_WAIT_CBS", + "RTGS_WAIT_GP", + "RTGS_PRE_WAIT_GP", + "RTGS_SCAN_TASKLIST", + "RTGS_POST_SCAN_TASKLIST", + "RTGS_WAIT_SCAN_HOLDOUTS", + "RTGS_SCAN_HOLDOUTS", + "RTGS_POST_GP", + "RTGS_WAIT_READERS", + "RTGS_INVOKE_CBS", + "RTGS_WAIT_CBS", +}; +#endif /* #ifndef CONFIG_TINY_RCU */ + +//////////////////////////////////////////////////////////////////////// +// +// Generic code. + +/* Record grace-period phase and time. */ +static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate) +{ + rtp->gp_state = newstate; + rtp->gp_jiffies = jiffies; +} + +#ifndef CONFIG_TINY_RCU +/* Return state name. */ +static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) +{ + int i = data_race(rtp->gp_state); // Let KCSAN detect update races + int j = READ_ONCE(i); // Prevent the compiler from reading twice + + if (j >= ARRAY_SIZE(rcu_tasks_gp_state_names)) + return "???"; + return rcu_tasks_gp_state_names[j]; +} +#endif /* #ifndef CONFIG_TINY_RCU */ + +// Enqueue a callback for the specified flavor of Tasks RCU. +static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, + struct rcu_tasks *rtp) +{ + unsigned long flags; + bool needwake; + + rhp->next = NULL; + rhp->func = func; + raw_spin_lock_irqsave(&rtp->cbs_lock, flags); + needwake = !rtp->cbs_head; + WRITE_ONCE(*rtp->cbs_tail, rhp); + rtp->cbs_tail = &rhp->next; + raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags); + /* We can't create the thread unless interrupts are enabled. */ + if (needwake && READ_ONCE(rtp->kthread_ptr)) + wake_up(&rtp->cbs_wq); +} + +// Wait for a grace period for the specified flavor of Tasks RCU. +static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) +{ + /* Complain if the scheduler has not started. */ + RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, + "synchronize_rcu_tasks called too soon"); + + /* Wait for the grace period. */ + wait_rcu_gp(rtp->call_func); +} + +/* RCU-tasks kthread that detects grace periods and invokes callbacks. */ +static int __noreturn rcu_tasks_kthread(void *arg) +{ + unsigned long flags; + struct rcu_head *list; + struct rcu_head *next; + struct rcu_tasks *rtp = arg; + + /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ + housekeeping_affine(current, HK_FLAG_RCU); + WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start! + + /* + * Each pass through the following loop makes one check for + * newly arrived callbacks, and, if there are some, waits for + * one RCU-tasks grace period and then invokes the callbacks. + * This loop is terminated by the system going down. ;-) + */ + for (;;) { + + /* Pick up any new callbacks. */ + raw_spin_lock_irqsave(&rtp->cbs_lock, flags); + smp_mb__after_spinlock(); // Order updates vs. GP. + list = rtp->cbs_head; + rtp->cbs_head = NULL; + rtp->cbs_tail = &rtp->cbs_head; + raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags); + + /* If there were none, wait a bit and start over. */ + if (!list) { + wait_event_interruptible(rtp->cbs_wq, + READ_ONCE(rtp->cbs_head)); + if (!rtp->cbs_head) { + WARN_ON(signal_pending(current)); + set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS); + schedule_timeout_idle(HZ/10); + } + continue; + } + + // Wait for one grace period. + set_tasks_gp_state(rtp, RTGS_WAIT_GP); + rtp->gp_start = jiffies; + rtp->gp_func(rtp); + rtp->n_gps++; + + /* Invoke the callbacks. */ + set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); + while (list) { + next = list->next; + local_bh_disable(); + list->func(list); + local_bh_enable(); + list = next; + cond_resched(); + } + /* Paranoid sleep to keep this from entering a tight loop */ + schedule_timeout_idle(HZ/10); + + set_tasks_gp_state(rtp, RTGS_WAIT_CBS); + } +} + +/* Spawn RCU-tasks grace-period kthread, e.g., at core_initcall() time. */ +static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) +{ + struct task_struct *t; + + t = kthread_run(rcu_tasks_kthread, rtp, "%s_kthread", rtp->kname); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start %s grace-period kthread, OOM is now expected behavior\n", __func__, rtp->name)) + return; + smp_mb(); /* Ensure others see full kthread. */ +} + +#ifndef CONFIG_TINY_RCU + +/* + * Print any non-default Tasks RCU settings. + */ +static void __init rcu_tasks_bootup_oddness(void) +{ +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) + if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) + pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); +#endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_RCU + pr_info("\tTrampoline variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_RUDE_RCU + pr_info("\tRude variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + pr_info("\tTracing variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ +} + +#endif /* #ifndef CONFIG_TINY_RCU */ + +#ifndef CONFIG_TINY_RCU +/* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ +static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) +{ + pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n", + rtp->kname, + tasks_gp_state_getname(rtp), data_race(rtp->gp_state), + jiffies - data_race(rtp->gp_jiffies), + data_race(rtp->n_gps), + data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis), + ".k"[!!data_race(rtp->kthread_ptr)], + ".C"[!!data_race(rtp->cbs_head)], + s); +} +#endif /* #ifndef CONFIG_TINY_RCU */ + +static void exit_tasks_rcu_finish_trace(struct task_struct *t); + +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) + +//////////////////////////////////////////////////////////////////////// +// +// Shared code between task-list-scanning variants of Tasks RCU. + +/* Wait for one RCU-tasks grace period. */ +static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) +{ + struct task_struct *g, *t; + unsigned long lastreport; + LIST_HEAD(holdouts); + int fract; + + set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP); + rtp->pregp_func(); + + /* + * There were callbacks, so we need to wait for an RCU-tasks + * grace period. Start off by scanning the task list for tasks + * that are not already voluntarily blocked. Mark these tasks + * and make a list of them in holdouts. + */ + set_tasks_gp_state(rtp, RTGS_SCAN_TASKLIST); + rcu_read_lock(); + for_each_process_thread(g, t) + rtp->pertask_func(t, &holdouts); + rcu_read_unlock(); + + set_tasks_gp_state(rtp, RTGS_POST_SCAN_TASKLIST); + rtp->postscan_func(&holdouts); + + /* + * Each pass through the following loop scans the list of holdout + * tasks, removing any that are no longer holdouts. When the list + * is empty, we are done. + */ + lastreport = jiffies; + + /* Start off with HZ/10 wait and slowly back off to 1 HZ wait. */ + fract = 10; + + for (;;) { + bool firstreport; + bool needreport; + int rtst; + + if (list_empty(&holdouts)) + break; + + /* Slowly back off waiting for holdouts */ + set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); + schedule_timeout_idle(HZ/fract); + + if (fract > 1) + fract--; + + rtst = READ_ONCE(rcu_task_stall_timeout); + needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); + if (needreport) + lastreport = jiffies; + firstreport = true; + WARN_ON(signal_pending(current)); + set_tasks_gp_state(rtp, RTGS_SCAN_HOLDOUTS); + rtp->holdouts_func(&holdouts, needreport, &firstreport); + } + + set_tasks_gp_state(rtp, RTGS_POST_GP); + rtp->postgp_func(rtp); +} + +#endif /* #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) */ + +#ifdef CONFIG_TASKS_RCU + +//////////////////////////////////////////////////////////////////////// +// +// Simple variant of RCU whose quiescent states are voluntary context +// switch, cond_resched_rcu_qs(), user-space execution, and idle. +// As such, grace periods can take one good long time. There are no +// read-side primitives similar to rcu_read_lock() and rcu_read_unlock() +// because this implementation is intended to get the system into a safe +// state for some of the manipulations involved in tracing and the like. +// Finally, this implementation does not support high call_rcu_tasks() +// rates from multiple CPUs. If this is required, per-CPU callback lists +// will be needed. + +/* Pre-grace-period preparation. */ +static void rcu_tasks_pregp_step(void) +{ + /* + * Wait for all pre-existing t->on_rq and t->nvcsw transitions + * to complete. Invoking synchronize_rcu() suffices because all + * these transitions occur with interrupts disabled. Without this + * synchronize_rcu(), a read-side critical section that started + * before the grace period might be incorrectly seen as having + * started after the grace period. + * + * This synchronize_rcu() also dispenses with the need for a + * memory barrier on the first store to t->rcu_tasks_holdout, + * as it forces the store to happen after the beginning of the + * grace period. + */ + synchronize_rcu(); +} + +/* Per-task initial processing. */ +static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) +{ + if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) { + get_task_struct(t); + t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); + WRITE_ONCE(t->rcu_tasks_holdout, true); + list_add(&t->rcu_tasks_holdout_list, hop); + } +} + +/* Processing between scanning taskslist and draining the holdout list. */ +static void rcu_tasks_postscan(struct list_head *hop) +{ + /* + * Wait for tasks that are in the process of exiting. This + * does only part of the job, ensuring that all tasks that were + * previously exiting reach the point where they have disabled + * preemption, allowing the later synchronize_rcu() to finish + * the job. + */ + synchronize_srcu(&tasks_rcu_exit_srcu); +} + +/* See if tasks are still holding out, complain if so. */ +static void check_holdout_task(struct task_struct *t, + bool needreport, bool *firstreport) +{ + int cpu; + + if (!READ_ONCE(t->rcu_tasks_holdout) || + t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || + !READ_ONCE(t->on_rq) || + (IS_ENABLED(CONFIG_NO_HZ_FULL) && + !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { + WRITE_ONCE(t->rcu_tasks_holdout, false); + list_del_init(&t->rcu_tasks_holdout_list); + put_task_struct(t); + return; + } + rcu_request_urgent_qs_task(t); + if (!needreport) + return; + if (*firstreport) { + pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); + *firstreport = false; + } + cpu = task_cpu(t); + pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", + t, ".I"[is_idle_task(t)], + "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], + t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, + t->rcu_tasks_idle_cpu, cpu); + sched_show_task(t); +} + +/* Scan the holdout lists for tasks no longer holding out. */ +static void check_all_holdout_tasks(struct list_head *hop, + bool needreport, bool *firstreport) +{ + struct task_struct *t, *t1; + + list_for_each_entry_safe(t, t1, hop, rcu_tasks_holdout_list) { + check_holdout_task(t, needreport, firstreport); + cond_resched(); + } +} + +/* Finish off the Tasks-RCU grace period. */ +static void rcu_tasks_postgp(struct rcu_tasks *rtp) +{ + /* + * Because ->on_rq and ->nvcsw are not guaranteed to have a full + * memory barriers prior to them in the schedule() path, memory + * reordering on other CPUs could cause their RCU-tasks read-side + * critical sections to extend past the end of the grace period. + * However, because these ->nvcsw updates are carried out with + * interrupts disabled, we can use synchronize_rcu() to force the + * needed ordering on all such CPUs. + * + * This synchronize_rcu() also confines all ->rcu_tasks_holdout + * accesses to be within the grace period, avoiding the need for + * memory barriers for ->rcu_tasks_holdout accesses. + * + * In addition, this synchronize_rcu() waits for exiting tasks + * to complete their final preempt_disable() region of execution, + * cleaning up after the synchronize_srcu() above. + */ + synchronize_rcu(); +} + +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks"); + +/** + * call_rcu_tasks() - Queue an RCU for invocation task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks() assumes + * that the read-side critical sections end at a voluntary context + * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, + * or transition to usermode execution. As such, there are no read-side + * primitives analogous to rcu_read_lock() and rcu_read_unlock() because + * this primitive is intended to determine that all tasks have passed + * through a safe state, not so much for data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks); + +/** + * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. + * + * Control will return to the caller some time after a full rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls + * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function + * preambles and profiling hooks. The synchronize_rcu_tasks() function + * is not (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks(void) +{ + synchronize_rcu_tasks_generic(&rcu_tasks); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); + +/** + * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks); + +static int __init rcu_spawn_tasks_kthread(void) +{ + rcu_tasks.pregp_func = rcu_tasks_pregp_step; + rcu_tasks.pertask_func = rcu_tasks_pertask; + rcu_tasks.postscan_func = rcu_tasks_postscan; + rcu_tasks.holdouts_func = check_all_holdout_tasks; + rcu_tasks.postgp_func = rcu_tasks_postgp; + rcu_spawn_tasks_kthread_generic(&rcu_tasks); + return 0; +} +core_initcall(rcu_spawn_tasks_kthread); + +#ifndef CONFIG_TINY_RCU +static void show_rcu_tasks_classic_gp_kthread(void) +{ + show_rcu_tasks_generic_gp_kthread(&rcu_tasks, ""); +} +#endif /* #ifndef CONFIG_TINY_RCU */ + +/* Do the srcu_read_lock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) +{ + preempt_disable(); + current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); + preempt_enable(); +} + +/* Do the srcu_read_unlock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) +{ + struct task_struct *t = current; + + preempt_disable(); + __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx); + preempt_enable(); + exit_tasks_rcu_finish_trace(t); +} + +#else /* #ifdef CONFIG_TASKS_RCU */ +static void show_rcu_tasks_classic_gp_kthread(void) { } +void exit_tasks_rcu_start(void) { } +void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } +#endif /* #else #ifdef CONFIG_TASKS_RCU */ + +#ifdef CONFIG_TASKS_RUDE_RCU + +//////////////////////////////////////////////////////////////////////// +// +// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of +// passing an empty function to schedule_on_each_cpu(). This approach +// provides an asynchronous call_rcu_tasks_rude() API and batching +// of concurrent calls to the synchronous synchronize_rcu_rude() API. +// This sends IPIs far and wide and induces otherwise unnecessary context +// switches on all online CPUs, whether idle or not. + +// Empty function to allow workqueues to force a context switch. +static void rcu_tasks_be_rude(struct work_struct *work) +{ +} + +// Wait for one rude RCU-tasks grace period. +static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) +{ + rtp->n_ipis += cpumask_weight(cpu_online_mask); + schedule_on_each_cpu(rcu_tasks_be_rude); +} + +void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude, + "RCU Tasks Rude"); + +/** + * call_rcu_tasks_rude() - Queue a callback rude task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks_rude() + * assumes that the read-side critical sections end at context switch, + * cond_resched_rcu_qs(), or transition to usermode execution. As such, + * there are no read-side primitives analogous to rcu_read_lock() and + * rcu_read_unlock() because this primitive is intended to determine + * that all tasks have passed through a safe state, not so much for + * data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks_rude); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks_rude); + +/** + * synchronize_rcu_tasks_rude - wait for a rude rcu-tasks grace period + * + * Control will return to the caller some time after a rude rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, + * anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function preambles + * and profiling hooks. The synchronize_rcu_tasks_rude() function is not + * (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks_rude(void) +{ + synchronize_rcu_tasks_generic(&rcu_tasks_rude); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude); + +/** + * rcu_barrier_tasks_rude - Wait for in-flight call_rcu_tasks_rude() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks_rude(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks_rude(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); + +static int __init rcu_spawn_tasks_rude_kthread(void) +{ + rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); + return 0; +} +core_initcall(rcu_spawn_tasks_rude_kthread); + +#ifndef CONFIG_TINY_RCU +static void show_rcu_tasks_rude_gp_kthread(void) +{ + show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, ""); +} +#endif /* #ifndef CONFIG_TINY_RCU */ + +#else /* #ifdef CONFIG_TASKS_RUDE_RCU */ +static void show_rcu_tasks_rude_gp_kthread(void) {} +#endif /* #else #ifdef CONFIG_TASKS_RUDE_RCU */ + +//////////////////////////////////////////////////////////////////////// +// +// Tracing variant of Tasks RCU. This variant is designed to be used +// to protect tracing hooks, including those of BPF. This variant +// therefore: +// +// 1. Has explicit read-side markers to allow finite grace periods +// in the face of in-kernel loops for PREEMPT=n builds. +// +// 2. Protects code in the idle loop, exception entry/exit, and +// CPU-hotplug code paths, similar to the capabilities of SRCU. +// +// 3. Avoids expensive read-side instruction, having overhead similar +// to that of Preemptible RCU. +// +// There are of course downsides. The grace-period code can send IPIs to +// CPUs, even when those CPUs are in the idle loop or in nohz_full userspace. +// It is necessary to scan the full tasklist, much as for Tasks RCU. There +// is a single callback queue guarded by a single lock, again, much as for +// Tasks RCU. If needed, these downsides can be at least partially remedied. +// +// Perhaps most important, this variant of RCU does not affect the vanilla +// flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace +// readers can operate from idle, offline, and exception entry/exit in no +// way allows rcu_preempt and rcu_sched readers to also do so. + +// The lockdep state must be outside of #ifdef to be useful. +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key rcu_lock_trace_key; +struct lockdep_map rcu_trace_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_trace", &rcu_lock_trace_key); +EXPORT_SYMBOL_GPL(rcu_trace_lock_map); +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +#ifdef CONFIG_TASKS_TRACE_RCU + +static atomic_t trc_n_readers_need_end; // Number of waited-for readers. +static DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. + +// Record outstanding IPIs to each CPU. No point in sending two... +static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); + +// The number of detections of task quiescent state relying on +// heavyweight readers executing explicit memory barriers. +unsigned long n_heavy_reader_attempts; +unsigned long n_heavy_reader_updates; +unsigned long n_heavy_reader_ofl_updates; + +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, + "RCU Tasks Trace"); + +/* + * This irq_work handler allows rcu_read_unlock_trace() to be invoked + * while the scheduler locks are held. + */ +static void rcu_read_unlock_iw(struct irq_work *iwp) +{ + wake_up(&trc_wait); +} +static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw); + +/* If we are the last reader, wake up the grace-period kthread. */ +void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) +{ + int nq = t->trc_reader_special.b.need_qs; + + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && + t->trc_reader_special.b.need_mb) + smp_mb(); // Pairs with update-side barriers. + // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers. + if (nq) + WRITE_ONCE(t->trc_reader_special.b.need_qs, false); + WRITE_ONCE(t->trc_reader_nesting, nesting); + if (nq && atomic_dec_and_test(&trc_n_readers_need_end)) + irq_work_queue(&rcu_tasks_trace_iw); +} +EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); + +/* Add a task to the holdout list, if it is not already on the list. */ +static void trc_add_holdout(struct task_struct *t, struct list_head *bhp) +{ + if (list_empty(&t->trc_holdout_list)) { + get_task_struct(t); + list_add(&t->trc_holdout_list, bhp); + } +} + +/* Remove a task from the holdout list, if it is in fact present. */ +static void trc_del_holdout(struct task_struct *t) +{ + if (!list_empty(&t->trc_holdout_list)) { + list_del_init(&t->trc_holdout_list); + put_task_struct(t); + } +} + +/* IPI handler to check task state. */ +static void trc_read_check_handler(void *t_in) +{ + struct task_struct *t = current; + struct task_struct *texp = t_in; + + // If the task is no longer running on this CPU, leave. + if (unlikely(texp != t)) { + if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) + wake_up(&trc_wait); + goto reset_ipi; // Already on holdout list, so will check later. + } + + // If the task is not in a read-side critical section, and + // if this is the last reader, awaken the grace-period kthread. + if (likely(!t->trc_reader_nesting)) { + if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) + wake_up(&trc_wait); + // Mark as checked after decrement to avoid false + // positives on the above WARN_ON_ONCE(). + WRITE_ONCE(t->trc_reader_checked, true); + goto reset_ipi; + } + WRITE_ONCE(t->trc_reader_checked, true); + + // Get here if the task is in a read-side critical section. Set + // its state so that it will awaken the grace-period kthread upon + // exit from that critical section. + WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WRITE_ONCE(t->trc_reader_special.b.need_qs, true); + +reset_ipi: + // Allow future IPIs to be sent on CPU and for task. + // Also order this IPI handler against any later manipulations of + // the intended task. + smp_store_release(&per_cpu(trc_ipi_to_cpu, smp_processor_id()), false); // ^^^ + smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^ +} + +/* Callback function for scheduler to check locked-down task. */ +static bool trc_inspect_reader(struct task_struct *t, void *arg) +{ + int cpu = task_cpu(t); + bool in_qs = false; + bool ofl = cpu_is_offline(cpu); + + if (task_curr(t)) { + WARN_ON_ONCE(ofl && !is_idle_task(t)); + + // If no chance of heavyweight readers, do it the hard way. + if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + return false; + + // If heavyweight readers are enabled on the remote task, + // we can inspect its state despite its currently running. + // However, we cannot safely change its state. + n_heavy_reader_attempts++; + if (!ofl && // Check for "running" idle tasks on offline CPUs. + !rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) + return false; // No quiescent state, do it the hard way. + n_heavy_reader_updates++; + if (ofl) + n_heavy_reader_ofl_updates++; + in_qs = true; + } else { + in_qs = likely(!t->trc_reader_nesting); + } + + // Mark as checked. Because this is called from the grace-period + // kthread, also remove the task from the holdout list. + t->trc_reader_checked = true; + trc_del_holdout(t); + + if (in_qs) + return true; // Already in quiescent state, done!!! + + // The task is in a read-side critical section, so set up its + // state so that it will awaken the grace-period kthread upon exit + // from that critical section. + atomic_inc(&trc_n_readers_need_end); // One more to wait on. + WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WRITE_ONCE(t->trc_reader_special.b.need_qs, true); + return true; +} + +/* Attempt to extract the state for the specified task. */ +static void trc_wait_for_one_reader(struct task_struct *t, + struct list_head *bhp) +{ + int cpu; + + // If a previous IPI is still in flight, let it complete. + if (smp_load_acquire(&t->trc_ipi_to_cpu) != -1) // Order IPI + return; + + // The current task had better be in a quiescent state. + if (t == current) { + t->trc_reader_checked = true; + trc_del_holdout(t); + WARN_ON_ONCE(t->trc_reader_nesting); + return; + } + + // Attempt to nail down the task for inspection. + get_task_struct(t); + if (try_invoke_on_locked_down_task(t, trc_inspect_reader, NULL)) { + put_task_struct(t); + return; + } + put_task_struct(t); + + // If currently running, send an IPI, either way, add to list. + trc_add_holdout(t, bhp); + if (task_curr(t) && time_after(jiffies, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) { + // The task is currently running, so try IPIing it. + cpu = task_cpu(t); + + // If there is already an IPI outstanding, let it happen. + if (per_cpu(trc_ipi_to_cpu, cpu) || t->trc_ipi_to_cpu >= 0) + return; + + atomic_inc(&trc_n_readers_need_end); + per_cpu(trc_ipi_to_cpu, cpu) = true; + t->trc_ipi_to_cpu = cpu; + rcu_tasks_trace.n_ipis++; + if (smp_call_function_single(cpu, + trc_read_check_handler, t, 0)) { + // Just in case there is some other reason for + // failure than the target CPU being offline. + rcu_tasks_trace.n_ipis_fails++; + per_cpu(trc_ipi_to_cpu, cpu) = false; + t->trc_ipi_to_cpu = cpu; + if (atomic_dec_and_test(&trc_n_readers_need_end)) { + WARN_ON_ONCE(1); + wake_up(&trc_wait); + } + } + } +} + +/* Initialize for a new RCU-tasks-trace grace period. */ +static void rcu_tasks_trace_pregp_step(void) +{ + int cpu; + + // Allow for fast-acting IPIs. + atomic_set(&trc_n_readers_need_end, 1); + + // There shouldn't be any old IPIs, but... + for_each_possible_cpu(cpu) + WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); + + // Disable CPU hotplug across the tasklist scan. + // This also waits for all readers in CPU-hotplug code paths. + cpus_read_lock(); +} + +/* Do first-round processing for the specified task. */ +static void rcu_tasks_trace_pertask(struct task_struct *t, + struct list_head *hop) +{ + WRITE_ONCE(t->trc_reader_special.b.need_qs, false); + WRITE_ONCE(t->trc_reader_checked, false); + t->trc_ipi_to_cpu = -1; + trc_wait_for_one_reader(t, hop); +} + +/* + * Do intermediate processing between task and holdout scans and + * pick up the idle tasks. + */ +static void rcu_tasks_trace_postscan(struct list_head *hop) +{ + int cpu; + + for_each_possible_cpu(cpu) + rcu_tasks_trace_pertask(idle_task(cpu), hop); + + // Re-enable CPU hotplug now that the tasklist scan has completed. + cpus_read_unlock(); + + // Wait for late-stage exiting tasks to finish exiting. + // These might have passed the call to exit_tasks_rcu_finish(). + synchronize_rcu(); + // Any tasks that exit after this point will set ->trc_reader_checked. +} + +/* Show the state of a task stalling the current RCU tasks trace GP. */ +static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) +{ + int cpu; + + if (*firstreport) { + pr_err("INFO: rcu_tasks_trace detected stalls on tasks:\n"); + *firstreport = false; + } + // FIXME: This should attempt to use try_invoke_on_nonrunning_task(). + cpu = task_cpu(t); + pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n", + t->pid, + ".I"[READ_ONCE(t->trc_ipi_to_cpu) > 0], + ".i"[is_idle_task(t)], + ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)], + t->trc_reader_nesting, + " N"[!!t->trc_reader_special.b.need_qs], + cpu); + sched_show_task(t); +} + +/* List stalled IPIs for RCU tasks trace. */ +static void show_stalled_ipi_trace(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + if (per_cpu(trc_ipi_to_cpu, cpu)) + pr_alert("\tIPI outstanding to CPU %d\n", cpu); +} + +/* Do one scan of the holdout list. */ +static void check_all_holdout_tasks_trace(struct list_head *hop, + bool needreport, bool *firstreport) +{ + struct task_struct *g, *t; + + // Disable CPU hotplug across the holdout list scan. + cpus_read_lock(); + + list_for_each_entry_safe(t, g, hop, trc_holdout_list) { + // If safe and needed, try to check the current task. + if (READ_ONCE(t->trc_ipi_to_cpu) == -1 && + !READ_ONCE(t->trc_reader_checked)) + trc_wait_for_one_reader(t, hop); + + // If check succeeded, remove this task from the list. + if (READ_ONCE(t->trc_reader_checked)) + trc_del_holdout(t); + else if (needreport) + show_stalled_task_trace(t, firstreport); + } + + // Re-enable CPU hotplug now that the holdout list scan has completed. + cpus_read_unlock(); + + if (needreport) { + if (firstreport) + pr_err("INFO: rcu_tasks_trace detected stalls? (Late IPI?)\n"); + show_stalled_ipi_trace(); + } +} + +/* Wait for grace period to complete and provide ordering. */ +static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) +{ + bool firstreport; + struct task_struct *g, *t; + LIST_HEAD(holdouts); + long ret; + + // Remove the safety count. + smp_mb__before_atomic(); // Order vs. earlier atomics + atomic_dec(&trc_n_readers_need_end); + smp_mb__after_atomic(); // Order vs. later atomics + + // Wait for readers. + set_tasks_gp_state(rtp, RTGS_WAIT_READERS); + for (;;) { + ret = wait_event_idle_exclusive_timeout( + trc_wait, + atomic_read(&trc_n_readers_need_end) == 0, + READ_ONCE(rcu_task_stall_timeout)); + if (ret) + break; // Count reached zero. + // Stall warning time, so make a list of the offenders. + for_each_process_thread(g, t) + if (READ_ONCE(t->trc_reader_special.b.need_qs)) + trc_add_holdout(t, &holdouts); + firstreport = true; + list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) + if (READ_ONCE(t->trc_reader_special.b.need_qs)) { + show_stalled_task_trace(t, &firstreport); + trc_del_holdout(t); + } + if (firstreport) + pr_err("INFO: rcu_tasks_trace detected stalls? (Counter/taskslist mismatch?)\n"); + show_stalled_ipi_trace(); + pr_err("\t%d holdouts\n", atomic_read(&trc_n_readers_need_end)); + } + smp_mb(); // Caller's code must be ordered after wakeup. + // Pairs with pretty much every ordering primitive. +} + +/* Report any needed quiescent state for this exiting task. */ +static void exit_tasks_rcu_finish_trace(struct task_struct *t) +{ + WRITE_ONCE(t->trc_reader_checked, true); + WARN_ON_ONCE(t->trc_reader_nesting); + WRITE_ONCE(t->trc_reader_nesting, 0); + if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs))) + rcu_read_unlock_trace_special(t, 0); +} + +/** + * call_rcu_tasks_trace() - Queue a callback trace task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks_trace() + * assumes that the read-side critical sections end at context switch, + * cond_resched_rcu_qs(), or transition to usermode execution. As such, + * there are no read-side primitives analogous to rcu_read_lock() and + * rcu_read_unlock() because this primitive is intended to determine + * that all tasks have passed through a safe state, not so much for + * data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks_trace); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks_trace); + +/** + * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period + * + * Control will return to the caller some time after a trace rcu-tasks + * grace period has elapsed, in other words after all currently executing + * rcu-tasks read-side critical sections have elapsed. These read-side + * critical sections are delimited by calls to rcu_read_lock_trace() + * and rcu_read_unlock_trace(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function preambles + * and profiling hooks. The synchronize_rcu_tasks_trace() function is not + * (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks_trace(void) +{ + RCU_LOCKDEP_WARN(lock_is_held(&rcu_trace_lock_map), "Illegal synchronize_rcu_tasks_trace() in RCU Tasks Trace read-side critical section"); + synchronize_rcu_tasks_generic(&rcu_tasks_trace); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace); + +/** + * rcu_barrier_tasks_trace - Wait for in-flight call_rcu_tasks_trace() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks_trace(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks_trace(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); + +static int __init rcu_spawn_tasks_trace_kthread(void) +{ + rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step; + rcu_tasks_trace.pertask_func = rcu_tasks_trace_pertask; + rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan; + rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace; + rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp; + rcu_spawn_tasks_kthread_generic(&rcu_tasks_trace); + return 0; +} +core_initcall(rcu_spawn_tasks_trace_kthread); + +#ifndef CONFIG_TINY_RCU +static void show_rcu_tasks_trace_gp_kthread(void) +{ + char buf[64]; + + sprintf(buf, "N%d h:%lu/%lu/%lu", atomic_read(&trc_n_readers_need_end), + data_race(n_heavy_reader_ofl_updates), + data_race(n_heavy_reader_updates), + data_race(n_heavy_reader_attempts)); + show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); +} +#endif /* #ifndef CONFIG_TINY_RCU */ + +#else /* #ifdef CONFIG_TASKS_TRACE_RCU */ +static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } +static inline void show_rcu_tasks_trace_gp_kthread(void) {} +#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ + +#ifndef CONFIG_TINY_RCU +void show_rcu_tasks_gp_kthreads(void) +{ + show_rcu_tasks_classic_gp_kthread(); + show_rcu_tasks_rude_gp_kthread(); + show_rcu_tasks_trace_gp_kthread(); +} +#endif /* #ifndef CONFIG_TINY_RCU */ + +#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +static inline void rcu_tasks_bootup_oddness(void) {} +void show_rcu_tasks_gp_kthreads(void) {} +#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index dd572ce7c747..aa897c3f2e92 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -23,6 +23,7 @@ #include <linux/cpu.h> #include <linux/prefetch.h> #include <linux/slab.h> +#include <linux/mm.h> #include "rcu.h" @@ -84,9 +85,9 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head) unsigned long offset = (unsigned long)head->func; rcu_lock_acquire(&rcu_callback_map); - if (__is_kfree_rcu_offset(offset)) { - trace_rcu_invoke_kfree_callback("", head, offset); - kfree((void *)head - offset); + if (__is_kvfree_rcu_offset(offset)) { + trace_rcu_invoke_kvfree_callback("", head, offset); + kvfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); return true; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d9a49cd6065a..ac7198ed3197 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -57,6 +57,8 @@ #include <linux/slab.h> #include <linux/sched/isolation.h> #include <linux/sched/clock.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> #include "../time/tick-internal.h" #include "tree.h" @@ -67,6 +69,19 @@ #endif #define MODULE_PARAM_PREFIX "rcutree." +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif + /* Data structures. */ /* @@ -75,9 +90,6 @@ */ #define RCU_DYNTICK_CTRL_MASK 0x1 #define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) -#ifndef rcu_eqs_special_exit -#define rcu_eqs_special_exit() do { } while (0) -#endif static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nesting = 1, @@ -100,7 +112,7 @@ static struct rcu_state rcu_state = { static bool dump_tree; module_param(dump_tree, bool, 0444); /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */ -static bool use_softirq = 1; +static bool use_softirq = true; module_param(use_softirq, bool, 0444); /* Control rcu_node-tree auto-balancing at boot time. */ static bool rcu_fanout_exact; @@ -165,6 +177,15 @@ module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +/* + * This rcu parameter is runtime-read-only. It reflects + * a minimum allowed number of objects which can be cached + * per-CPU. Object size is equal to one page. This value + * can be changed at boot time. + */ +static int rcu_min_cached_objs = 2; +module_param(rcu_min_cached_objs, int, 0444); + /* Retrieve RCU kthreads priority for rcutorture */ int rcu_get_gp_kthreads_prio(void) { @@ -225,9 +246,11 @@ void rcu_softirq_qs(void) /* * Record entry into an extended quiescent state. This is only to be - * called when not already in an extended quiescent state. + * called when not already in an extended quiescent state, that is, + * RCU is watching prior to the call to this function and is no longer + * watching upon return. */ -static void rcu_dynticks_eqs_enter(void) +static noinstr void rcu_dynticks_eqs_enter(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; @@ -237,8 +260,9 @@ static void rcu_dynticks_eqs_enter(void) * critical sections, and we also must force ordering with the * next idle sojourn. */ - seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); - /* Better be in an extended quiescent state! */ + rcu_dynticks_task_trace_enter(); // Before ->dynticks update! + seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + // RCU is no longer watching. Better be in extended quiescent state! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICK_CTRL_CTR)); /* Better not have special action (TLB flush) pending! */ @@ -248,9 +272,10 @@ static void rcu_dynticks_eqs_enter(void) /* * Record exit from an extended quiescent state. This is only to be - * called from an extended quiescent state. + * called from an extended quiescent state, that is, RCU is not watching + * prior to the call to this function and is watching upon return. */ -static void rcu_dynticks_eqs_exit(void) +static noinstr void rcu_dynticks_eqs_exit(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; @@ -260,14 +285,14 @@ static void rcu_dynticks_eqs_exit(void) * and we also must force ordering with the next RCU read-side * critical section. */ - seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + // RCU is now watching. Better not be in an extended quiescent state! + rcu_dynticks_task_trace_exit(); // After ->dynticks update! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICK_CTRL_CTR)); if (seq & RCU_DYNTICK_CTRL_MASK) { - atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); + arch_atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); smp_mb__after_atomic(); /* _exit after clearing mask. */ - /* Prefer duplicate flushes to losing a flush. */ - rcu_eqs_special_exit(); } } @@ -295,11 +320,11 @@ static void rcu_dynticks_eqs_online(void) * * No ordering, as we are sampling CPU-local information. */ -static bool rcu_dynticks_curr_cpu_in_eqs(void) +static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - return !(atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR); + return !(arch_atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR); } /* @@ -333,6 +358,28 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) } /* + * Return true if the referenced integer is zero while the specified + * CPU remains within a single extended quiescent state. + */ +bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + int snap; + + // If not quiescent, force back to earlier extended quiescent state. + snap = atomic_read(&rdp->dynticks) & ~(RCU_DYNTICK_CTRL_MASK | + RCU_DYNTICK_CTRL_CTR); + + smp_rmb(); // Order ->dynticks and *vp reads. + if (READ_ONCE(*vp)) + return false; // Non-zero, so report failure; + smp_rmb(); // Order *vp read and ->dynticks re-read. + + // If still in the same extended quiescent state, we are good! + return snap == (atomic_read(&rdp->dynticks) & ~RCU_DYNTICK_CTRL_MASK); +} + +/* * Set the special (bottom) bit of the specified CPU so that it * will take special action (such as flushing its TLB) on the * next exit from an extended quiescent state. Returns true if @@ -382,16 +429,23 @@ void rcu_momentary_dyntick_idle(void) EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); /** - * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle + * rcu_is_cpu_rrupt_from_idle - see if 'interrupted' from idle * * If the current CPU is idle and running at a first-level (not nested) - * interrupt from idle, return true. The caller must have at least - * disabled preemption. + * interrupt, or directly, from idle, return true. + * + * The caller must have at least disabled IRQs. */ static int rcu_is_cpu_rrupt_from_idle(void) { - /* Called only from within the scheduling-clock interrupt */ - lockdep_assert_in_irq(); + long nesting; + + /* + * Usually called from the tick; but also used from smp_function_call() + * for expedited grace periods. This latter can result in running from + * the idle task, instead of an actual IPI. + */ + lockdep_assert_irqs_disabled(); /* Check for counter underflows */ RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0, @@ -400,9 +454,15 @@ static int rcu_is_cpu_rrupt_from_idle(void) "RCU dynticks_nmi_nesting counter underflow/zero!"); /* Are we at first interrupt nesting level? */ - if (__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 1) + nesting = __this_cpu_read(rcu_data.dynticks_nmi_nesting); + if (nesting > 1) return false; + /* + * If we're not in an interrupt, we must be in the idle task! + */ + WARN_ON_ONCE(!nesting && !is_idle_task(current)); + /* Does CPU appear to be idle from an RCU standpoint? */ return __this_cpu_read(rcu_data.dynticks_nesting) == 0; } @@ -562,7 +622,7 @@ EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); * the possibility of usermode upcalls having messed up our count * of interrupt nesting level during the prior busy period. */ -static void rcu_eqs_enter(bool user) +static noinstr void rcu_eqs_enter(bool user) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -571,19 +631,28 @@ static void rcu_eqs_enter(bool user) WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && rdp->dynticks_nesting == 0); if (rdp->dynticks_nesting != 1) { + // RCU will still be watching, so just do accounting and leave. rdp->dynticks_nesting--; return; } lockdep_assert_irqs_disabled(); + instrumentation_begin(); trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rdp = this_cpu_ptr(&rcu_data); do_nocb_deferred_wakeup(rdp); rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); + + // instrumentation for the noinstr rcu_dynticks_eqs_enter() + instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); + + instrumentation_end(); WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ + // RCU is watching here ... rcu_dynticks_eqs_enter(); + // ... but is no longer watching here. rcu_dynticks_task_enter(); } @@ -616,26 +685,29 @@ void rcu_idle_enter(void) * If you add or remove a call to rcu_user_enter(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_user_enter(void) +noinstr void rcu_user_enter(void) { lockdep_assert_irqs_disabled(); rcu_eqs_enter(true); } #endif /* CONFIG_NO_HZ_FULL */ -/* +/** + * rcu_nmi_exit - inform RCU of exit from NMI context + * * If we are returning from the outermost NMI handler that interrupted an * RCU-idle period, update rdp->dynticks and rdp->dynticks_nmi_nesting * to let the RCU grace-period handling know that the CPU is back to * being RCU-idle. * - * If you add or remove a call to rcu_nmi_exit_common(), be sure to test + * If you add or remove a call to rcu_nmi_exit(), be sure to test * with CONFIG_RCU_EQS_DEBUG=y. */ -static __always_inline void rcu_nmi_exit_common(bool irq) +noinstr void rcu_nmi_exit(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + instrumentation_begin(); /* * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. * (We are exiting an NMI handler, so RCU better be paying attention @@ -653,6 +725,7 @@ static __always_inline void rcu_nmi_exit_common(bool irq) atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ rdp->dynticks_nmi_nesting - 2); + instrumentation_end(); return; } @@ -660,27 +733,22 @@ static __always_inline void rcu_nmi_exit_common(bool irq) trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ - if (irq) + if (!in_nmi()) rcu_prepare_for_idle(); + // instrumentation for the noinstr rcu_dynticks_eqs_enter() + instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); + instrumentation_end(); + + // RCU is watching here ... rcu_dynticks_eqs_enter(); + // ... but is no longer watching here. - if (irq) + if (!in_nmi()) rcu_dynticks_task_enter(); } /** - * rcu_nmi_exit - inform RCU of exit from NMI context - * - * If you add or remove a call to rcu_nmi_exit(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. - */ -void rcu_nmi_exit(void) -{ - rcu_nmi_exit_common(false); -} - -/** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle * * Exit from an interrupt handler, which might possibly result in entering @@ -699,11 +767,51 @@ void rcu_nmi_exit(void) * If you add or remove a call to rcu_irq_exit(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_irq_exit(void) +void noinstr rcu_irq_exit(void) +{ + lockdep_assert_irqs_disabled(); + rcu_nmi_exit(); +} + +/** + * rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq + * towards in kernel preemption + * + * Same as rcu_irq_exit() but has a sanity check that scheduling is safe + * from RCU point of view. Invoked from return from interrupt before kernel + * preemption. + */ +void rcu_irq_exit_preempt(void) +{ + lockdep_assert_irqs_disabled(); + rcu_nmi_exit(); + + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0, + "RCU dynticks_nesting counter underflow/zero!"); + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) != + DYNTICK_IRQ_NONIDLE, + "Bad RCU dynticks_nmi_nesting counter\n"); + RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + "RCU in extended quiescent state!"); +} + +#ifdef CONFIG_PROVE_RCU +/** + * rcu_irq_exit_check_preempt - Validate that scheduling is possible + */ +void rcu_irq_exit_check_preempt(void) { lockdep_assert_irqs_disabled(); - rcu_nmi_exit_common(true); + + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0, + "RCU dynticks_nesting counter underflow/zero!"); + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) != + DYNTICK_IRQ_NONIDLE, + "Bad RCU dynticks_nmi_nesting counter\n"); + RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + "RCU in extended quiescent state!"); } +#endif /* #ifdef CONFIG_PROVE_RCU */ /* * Wrapper for rcu_irq_exit() where interrupts are enabled. @@ -728,7 +836,7 @@ void rcu_irq_exit_irqson(void) * allow for the possibility of usermode upcalls messing up our count of * interrupt nesting level during the busy period that is just now starting. */ -static void rcu_eqs_exit(bool user) +static void noinstr rcu_eqs_exit(bool user) { struct rcu_data *rdp; long oldval; @@ -738,17 +846,26 @@ static void rcu_eqs_exit(bool user) oldval = rdp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); if (oldval) { + // RCU was already watching, so just do accounting and leave. rdp->dynticks_nesting++; return; } rcu_dynticks_task_exit(); + // RCU is not watching here ... rcu_dynticks_eqs_exit(); + // ... but is watching here. + instrumentation_begin(); + + // instrumentation for the noinstr rcu_dynticks_eqs_exit() + instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); + rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(rdp->dynticks_nesting, 1); WARN_ON_ONCE(rdp->dynticks_nmi_nesting); WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); + instrumentation_end(); } /** @@ -779,15 +896,75 @@ void rcu_idle_exit(void) * If you add or remove a call to rcu_user_exit(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_user_exit(void) +void noinstr rcu_user_exit(void) { rcu_eqs_exit(1); } + +/** + * __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it. + * + * The scheduler tick is not normally enabled when CPUs enter the kernel + * from nohz_full userspace execution. After all, nohz_full userspace + * execution is an RCU quiescent state and the time executing in the kernel + * is quite short. Except of course when it isn't. And it is not hard to + * cause a large system to spend tens of seconds or even minutes looping + * in the kernel, which can cause a number of problems, include RCU CPU + * stall warnings. + * + * Therefore, if a nohz_full CPU fails to report a quiescent state + * in a timely manner, the RCU grace-period kthread sets that CPU's + * ->rcu_urgent_qs flag with the expectation that the next interrupt or + * exception will invoke this function, which will turn on the scheduler + * tick, which will enable RCU to detect that CPU's quiescent states, + * for example, due to cond_resched() calls in CONFIG_PREEMPT=n kernels. + * The tick will be disabled once a quiescent state is reported for + * this CPU. + * + * Of course, in carefully tuned systems, there might never be an + * interrupt or exception. In that case, the RCU grace-period kthread + * will eventually cause one to happen. However, in less carefully + * controlled environments, this function allows RCU to get what it + * needs without creating otherwise useless interruptions. + */ +void __rcu_irq_enter_check_tick(void) +{ + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + + // Enabling the tick is unsafe in NMI handlers. + if (WARN_ON_ONCE(in_nmi())) + return; + + RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + "Illegal rcu_irq_enter_check_tick() from extended quiescent state"); + + if (!tick_nohz_full_cpu(rdp->cpu) || + !READ_ONCE(rdp->rcu_urgent_qs) || + READ_ONCE(rdp->rcu_forced_tick)) { + // RCU doesn't need nohz_full help from this CPU, or it is + // already getting that help. + return; + } + + // We get here only when not in an extended quiescent state and + // from interrupts (as opposed to NMIs). Therefore, (1) RCU is + // already watching and (2) The fact that we are in an interrupt + // handler and that the rcu_node lock is an irq-disabled lock + // prevents self-deadlock. So we can safely recheck under the lock. + // Note that the nohz_full state currently cannot change. + raw_spin_lock_rcu_node(rdp->mynode); + if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + // A nohz_full CPU is in the kernel and RCU needs a + // quiescent state. Turn on the tick! + WRITE_ONCE(rdp->rcu_forced_tick, true); + tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + } + raw_spin_unlock_rcu_node(rdp->mynode); +} #endif /* CONFIG_NO_HZ_FULL */ /** - * rcu_nmi_enter_common - inform RCU of entry to NMI context - * @irq: Is this call from rcu_irq_enter? + * rcu_nmi_enter - inform RCU of entry to NMI context * * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know @@ -795,10 +972,10 @@ void rcu_user_exit(void) * long as the nesting level does not overflow an int. (You will probably * run out of stack space first.) * - * If you add or remove a call to rcu_nmi_enter_common(), be sure to test + * If you add or remove a call to rcu_nmi_enter(), be sure to test * with CONFIG_RCU_EQS_DEBUG=y. */ -static __always_inline void rcu_nmi_enter_common(bool irq) +noinstr void rcu_nmi_enter(void) { long incby = 2; struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -816,45 +993,44 @@ static __always_inline void rcu_nmi_enter_common(bool irq) */ if (rcu_dynticks_curr_cpu_in_eqs()) { - if (irq) + if (!in_nmi()) rcu_dynticks_task_exit(); + // RCU is not watching here ... rcu_dynticks_eqs_exit(); + // ... but is watching here. - if (irq) + if (!in_nmi()) { + instrumentation_begin(); rcu_cleanup_after_idle(); + instrumentation_end(); + } + + instrumentation_begin(); + // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() + instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks)); + // instrumentation for the noinstr rcu_dynticks_eqs_exit() + instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); incby = 1; - } else if (irq && tick_nohz_full_cpu(rdp->cpu) && - rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && - READ_ONCE(rdp->rcu_urgent_qs) && - !READ_ONCE(rdp->rcu_forced_tick)) { - raw_spin_lock_rcu_node(rdp->mynode); - // Recheck under lock. - if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { - WRITE_ONCE(rdp->rcu_forced_tick, true); - tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); - } - raw_spin_unlock_rcu_node(rdp->mynode); + } else if (!in_nmi()) { + instrumentation_begin(); + rcu_irq_enter_check_tick(); + instrumentation_end(); + } else { + instrumentation_begin(); } + trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks)); + instrumentation_end(); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */ rdp->dynticks_nmi_nesting + incby); barrier(); } /** - * rcu_nmi_enter - inform RCU of entry to NMI context - */ -void rcu_nmi_enter(void) -{ - rcu_nmi_enter_common(false); -} -NOKPROBE_SYMBOL(rcu_nmi_enter); - -/** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle * * Enter an interrupt handler, which might possibly result in exiting @@ -876,10 +1052,10 @@ NOKPROBE_SYMBOL(rcu_nmi_enter); * If you add or remove a call to rcu_irq_enter(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_irq_enter(void) +noinstr void rcu_irq_enter(void) { lockdep_assert_irqs_disabled(); - rcu_nmi_enter_common(true); + rcu_nmi_enter(); } /* @@ -913,6 +1089,11 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) } } +noinstr bool __rcu_is_watching(void) +{ + return !rcu_dynticks_curr_cpu_in_eqs(); +} + /** * rcu_is_watching - see if RCU thinks that the current CPU is not idle * @@ -921,7 +1102,7 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) * if the current CPU is not in its idle loop or is in an interrupt or * NMI handler, return true. */ -bool notrace rcu_is_watching(void) +bool rcu_is_watching(void) { bool ret; @@ -973,12 +1154,12 @@ bool rcu_lockdep_current_cpu_online(void) if (in_nmi() || !rcu_scheduler_fully_active) return true; - preempt_disable(); + preempt_disable_notrace(); rdp = this_cpu_ptr(&rcu_data); rnp = rdp->mynode; if (rdp->grpmask & rcu_rnp_online_cpus(rnp)) ret = true; - preempt_enable(); + preempt_enable_notrace(); return ret; } EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); @@ -1217,7 +1398,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; } - trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq")); + trace_rcu_grace_period(rcu_state.name, data_race(rcu_state.gp_seq), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ @@ -1470,7 +1651,32 @@ static void rcu_gp_slow(int delay) if (delay > 0 && !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) - schedule_timeout_uninterruptible(delay); + schedule_timeout_idle(delay); +} + +static unsigned long sleep_duration; + +/* Allow rcutorture to stall the grace-period kthread. */ +void rcu_gp_set_torture_wait(int duration) +{ + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST) && duration > 0) + WRITE_ONCE(sleep_duration, duration); +} +EXPORT_SYMBOL_GPL(rcu_gp_set_torture_wait); + +/* Actually implement the aforementioned wait. */ +static void rcu_gp_torture_wait(void) +{ + unsigned long duration; + + if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST)) + return; + duration = xchg(&sleep_duration, 0UL); + if (duration > 0) { + pr_alert("%s: Waiting %lu jiffies\n", __func__, duration); + schedule_timeout_idle(duration); + pr_alert("%s: Wait complete\n", __func__); + } } /* @@ -1506,6 +1712,7 @@ static bool rcu_gp_init(void) record_gp_stall_check_time(); /* Record GP times before starting GP, hence rcu_seq_start(). */ rcu_seq_start(&rcu_state.gp_seq); + ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); raw_spin_unlock_irq_rcu_node(rnp); @@ -1611,12 +1818,16 @@ static bool rcu_gp_fqs_check_wake(int *gfp) { struct rcu_node *rnp = rcu_get_root(); - /* Someone like call_rcu() requested a force-quiescent-state scan. */ + // If under overload conditions, force an immediate FQS scan. + if (*gfp & RCU_GP_FLAG_OVLD) + return true; + + // Someone like call_rcu() requested a force-quiescent-state scan. *gfp = READ_ONCE(rcu_state.gp_flags); if (*gfp & RCU_GP_FLAG_FQS) return true; - /* The current grace period has completed. */ + // The current grace period has completed. if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) return true; @@ -1654,13 +1865,15 @@ static void rcu_gp_fqs(bool first_time) static void rcu_gp_fqs_loop(void) { bool first_gp_fqs; - int gf; + int gf = 0; unsigned long j; int ret; struct rcu_node *rnp = rcu_get_root(); first_gp_fqs = true; j = READ_ONCE(jiffies_till_first_fqs); + if (rcu_state.cbovld) + gf = RCU_GP_FLAG_OVLD; ret = 0; for (;;) { if (!ret) { @@ -1673,6 +1886,7 @@ static void rcu_gp_fqs_loop(void) rcu_state.gp_state = RCU_GP_WAIT_FQS; ret = swait_event_idle_timeout_exclusive( rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j); + rcu_gp_torture_wait(); rcu_state.gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ @@ -1680,12 +1894,16 @@ static void rcu_gp_fqs_loop(void) !rcu_preempt_blocked_readers_cgp(rnp)) break; /* If time for quiescent-state forcing, do it. */ - if (ULONG_CMP_GE(jiffies, rcu_state.jiffies_force_qs) || + if (!time_after(rcu_state.jiffies_force_qs, jiffies) || (gf & RCU_GP_FLAG_FQS)) { trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsstart")); rcu_gp_fqs(first_gp_fqs); - first_gp_fqs = false; + gf = 0; + if (first_gp_fqs) { + first_gp_fqs = false; + gf = rcu_state.cbovld ? RCU_GP_FLAG_OVLD : 0; + } trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsend")); cond_resched_tasks_rcu_qs(); @@ -1705,6 +1923,7 @@ static void rcu_gp_fqs_loop(void) j = 1; else j = rcu_state.jiffies_force_qs - j; + gf = 0; } } } @@ -1781,6 +2000,7 @@ static void rcu_gp_cleanup(void) /* Declare grace period done, trace first to use old GP number. */ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end")); rcu_seq_end(&rcu_state.gp_seq); + ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); rcu_state.gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(&rcu_data); @@ -1821,6 +2041,7 @@ static int __noreturn rcu_gp_kthread(void *unused) swait_event_idle_exclusive(rcu_state.gp_wq, READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_INIT); + rcu_gp_torture_wait(); rcu_state.gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. */ if (rcu_gp_init()) @@ -2235,6 +2456,7 @@ static void rcu_do_batch(struct rcu_data *rdp) local_irq_save(flags); rcu_nocb_lock(rdp); count = -rcl.len; + rdp->n_cbs_invoked += count; trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); @@ -2518,7 +2740,7 @@ static void rcu_cpu_kthread(unsigned int cpu) } *statusp = RCU_KTHREAD_YIELDING; trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); - schedule_timeout_interruptible(2); + schedule_timeout_idle(2); trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); *statusp = RCU_KTHREAD_WAITING; } @@ -2686,8 +2908,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) return; // Enqueued onto ->nocb_bypass, so just leave. // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock. rcu_segcblist_enqueue(&rdp->cblist, head); - if (__is_kfree_rcu_offset((unsigned long)func)) - trace_rcu_kfree_callback(rcu_state.name, head, + if (__is_kvfree_rcu_offset((unsigned long)func)) + trace_rcu_kvfree_callback(rcu_state.name, head, (unsigned long)func, rcu_segcblist_n_cbs(&rdp->cblist)); else @@ -2749,53 +2971,53 @@ EXPORT_SYMBOL_GPL(call_rcu); /* Maximum number of jiffies to wait before draining a batch. */ #define KFREE_DRAIN_JIFFIES (HZ / 50) #define KFREE_N_BATCHES 2 - -/* - * This macro defines how many entries the "records" array - * will contain. It is based on the fact that the size of - * kfree_rcu_bulk_data structure becomes exactly one page. - */ -#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 3) +#define FREE_N_CHANNELS 2 /** - * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers + * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers * @nr_records: Number of active pointers in the array - * @records: Array of the kfree_rcu() pointers * @next: Next bulk object in the block chain - * @head_free_debug: For debug, when CONFIG_DEBUG_OBJECTS_RCU_HEAD is set + * @records: Array of the kvfree_rcu() pointers */ -struct kfree_rcu_bulk_data { +struct kvfree_rcu_bulk_data { unsigned long nr_records; - void *records[KFREE_BULK_MAX_ENTR]; - struct kfree_rcu_bulk_data *next; - struct rcu_head *head_free_debug; + struct kvfree_rcu_bulk_data *next; + void *records[]; }; +/* + * This macro defines how many entries the "records" array + * will contain. It is based on the fact that the size of + * kvfree_rcu_bulk_data structure becomes exactly one page. + */ +#define KVFREE_BULK_MAX_ENTR \ + ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *)) + /** * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period * @head_free: List of kfree_rcu() objects waiting for a grace period - * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period + * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period * @krcp: Pointer to @kfree_rcu_cpu structure */ struct kfree_rcu_cpu_work { struct rcu_work rcu_work; struct rcu_head *head_free; - struct kfree_rcu_bulk_data *bhead_free; + struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS]; struct kfree_rcu_cpu *krcp; }; /** * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period - * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period - * @bcached: Keeps at most one object for later reuse when build chain blocks + * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES * @monitor_todo: Tracks whether a @monitor_work delayed work is pending - * @initialized: The @lock and @rcu_work fields have been initialized + * @initialized: The @rcu_work fields have been initialized + * @count: Number of objects for which GP not started * * This is a per-CPU structure. The reason that it is not included in * the rcu_data structure is to permit this code to be extracted from @@ -2804,26 +3026,84 @@ struct kfree_rcu_cpu_work { */ struct kfree_rcu_cpu { struct rcu_head *head; - struct kfree_rcu_bulk_data *bhead; - struct kfree_rcu_bulk_data *bcached; + struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS]; struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; - spinlock_t lock; + raw_spinlock_t lock; struct delayed_work monitor_work; bool monitor_todo; bool initialized; + int count; + + /* + * A simple cache list that contains objects for + * reuse purpose. In order to save some per-cpu + * space the list is singular. Even though it is + * lockless an access has to be protected by the + * per-cpu lock. + */ + struct llist_head bkvcache; + int nr_bkv_objs; }; -static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); +static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock), +}; static __always_inline void -debug_rcu_head_unqueue_bulk(struct rcu_head *head) +debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead) { #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - for (; head; head = head->next) - debug_rcu_head_unqueue(head); + int i; + + for (i = 0; i < bhead->nr_records; i++) + debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i])); #endif } +static inline struct kfree_rcu_cpu * +krc_this_cpu_lock(unsigned long *flags) +{ + struct kfree_rcu_cpu *krcp; + + local_irq_save(*flags); // For safely calling this_cpu_ptr(). + krcp = this_cpu_ptr(&krc); + raw_spin_lock(&krcp->lock); + + return krcp; +} + +static inline void +krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) +{ + raw_spin_unlock(&krcp->lock); + local_irq_restore(flags); +} + +static inline struct kvfree_rcu_bulk_data * +get_cached_bnode(struct kfree_rcu_cpu *krcp) +{ + if (!krcp->nr_bkv_objs) + return NULL; + + krcp->nr_bkv_objs--; + return (struct kvfree_rcu_bulk_data *) + llist_del_first(&krcp->bkvcache); +} + +static inline bool +put_cached_bnode(struct kfree_rcu_cpu *krcp, + struct kvfree_rcu_bulk_data *bnode) +{ + // Check the limit. + if (krcp->nr_bkv_objs >= rcu_min_cached_objs) + return false; + + llist_add((struct llist_node *) bnode, &krcp->bkvcache); + krcp->nr_bkv_objs++; + return true; + +} + /* * This function is invoked in workqueue context after a grace period. * It frees all the objects queued on ->bhead_free or ->head_free. @@ -2831,38 +3111,63 @@ debug_rcu_head_unqueue_bulk(struct rcu_head *head) static void kfree_rcu_work(struct work_struct *work) { unsigned long flags; + struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext; struct rcu_head *head, *next; - struct kfree_rcu_bulk_data *bhead, *bnext; struct kfree_rcu_cpu *krcp; struct kfree_rcu_cpu_work *krwp; + int i, j; krwp = container_of(to_rcu_work(work), struct kfree_rcu_cpu_work, rcu_work); krcp = krwp->krcp; - spin_lock_irqsave(&krcp->lock, flags); - head = krwp->head_free; - krwp->head_free = NULL; - bhead = krwp->bhead_free; - krwp->bhead_free = NULL; - spin_unlock_irqrestore(&krcp->lock, flags); - - /* "bhead" is now private, so traverse locklessly. */ - for (; bhead; bhead = bnext) { - bnext = bhead->next; - debug_rcu_head_unqueue_bulk(bhead->head_free_debug); + raw_spin_lock_irqsave(&krcp->lock, flags); + // Channels 1 and 2. + for (i = 0; i < FREE_N_CHANNELS; i++) { + bkvhead[i] = krwp->bkvhead_free[i]; + krwp->bkvhead_free[i] = NULL; + } - rcu_lock_acquire(&rcu_callback_map); - trace_rcu_invoke_kfree_bulk_callback(rcu_state.name, - bhead->nr_records, bhead->records); + // Channel 3. + head = krwp->head_free; + krwp->head_free = NULL; + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + // Handle two first channels. + for (i = 0; i < FREE_N_CHANNELS; i++) { + for (; bkvhead[i]; bkvhead[i] = bnext) { + bnext = bkvhead[i]->next; + debug_rcu_bhead_unqueue(bkvhead[i]); + + rcu_lock_acquire(&rcu_callback_map); + if (i == 0) { // kmalloc() / kfree(). + trace_rcu_invoke_kfree_bulk_callback( + rcu_state.name, bkvhead[i]->nr_records, + bkvhead[i]->records); + + kfree_bulk(bkvhead[i]->nr_records, + bkvhead[i]->records); + } else { // vmalloc() / vfree(). + for (j = 0; j < bkvhead[i]->nr_records; j++) { + trace_rcu_invoke_kvfree_callback( + rcu_state.name, + bkvhead[i]->records[j], 0); + + vfree(bkvhead[i]->records[j]); + } + } + rcu_lock_release(&rcu_callback_map); - kfree_bulk(bhead->nr_records, bhead->records); - rcu_lock_release(&rcu_callback_map); + krcp = krc_this_cpu_lock(&flags); + if (put_cached_bnode(krcp, bkvhead[i])) + bkvhead[i] = NULL; + krc_this_cpu_unlock(krcp, flags); - if (cmpxchg(&krcp->bcached, NULL, bhead)) - free_page((unsigned long) bhead); + if (bkvhead[i]) + free_page((unsigned long) bkvhead[i]); - cond_resched_tasks_rcu_qs(); + cond_resched_tasks_rcu_qs(); + } } /* @@ -2872,14 +3177,15 @@ static void kfree_rcu_work(struct work_struct *work) */ for (; head; head = next) { unsigned long offset = (unsigned long)head->func; + void *ptr = (void *)head - offset; next = head->next; - debug_rcu_head_unqueue(head); + debug_rcu_head_unqueue((struct rcu_head *)ptr); rcu_lock_acquire(&rcu_callback_map); - trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); + trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset); - if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) - kfree((void *)head - offset); + if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) + kvfree(ptr); rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); @@ -2895,8 +3201,8 @@ static void kfree_rcu_work(struct work_struct *work) static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) { struct kfree_rcu_cpu_work *krwp; - bool queued = false; - int i; + bool repeat = false; + int i, j; lockdep_assert_held(&krcp->lock); @@ -2904,38 +3210,48 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) krwp = &(krcp->krw_arr[i]); /* - * Try to detach bhead or head and attach it over any + * Try to detach bkvhead or head and attach it over any * available corresponding free channel. It can be that * a previous RCU batch is in progress, it means that * immediately to queue another one is not possible so * return false to tell caller to retry. */ - if ((krcp->bhead && !krwp->bhead_free) || + if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) || + (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) || (krcp->head && !krwp->head_free)) { - /* Channel 1. */ - if (!krwp->bhead_free) { - krwp->bhead_free = krcp->bhead; - krcp->bhead = NULL; + // Channel 1 corresponds to SLAB ptrs. + // Channel 2 corresponds to vmalloc ptrs. + for (j = 0; j < FREE_N_CHANNELS; j++) { + if (!krwp->bkvhead_free[j]) { + krwp->bkvhead_free[j] = krcp->bkvhead[j]; + krcp->bkvhead[j] = NULL; + } } - /* Channel 2. */ + // Channel 3 corresponds to emergency path. if (!krwp->head_free) { krwp->head_free = krcp->head; krcp->head = NULL; } + WRITE_ONCE(krcp->count, 0); + /* - * One work is per one batch, so there are two "free channels", - * "bhead_free" and "head_free" the batch can handle. It can be - * that the work is in the pending state when two channels have - * been detached following each other, one by one. + * One work is per one batch, so there are three + * "free channels", the batch can handle. It can + * be that the work is in the pending state when + * channels have been detached following by each + * other. */ queue_rcu_work(system_wq, &krwp->rcu_work); - queued = true; } + + // Repeat if any "free" corresponding channel is still busy. + if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head) + repeat = true; } - return queued; + return !repeat; } static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, @@ -2945,14 +3261,14 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, krcp->monitor_todo = false; if (queue_kfree_rcu_work(krcp)) { // Success! Our job is done here. - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); return; } // Previous RCU batch still in progress, try again later. krcp->monitor_todo = true; schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } /* @@ -2965,32 +3281,50 @@ static void kfree_rcu_monitor(struct work_struct *work) struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu, monitor_work.work); - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (krcp->monitor_todo) kfree_rcu_drain_unlock(krcp, flags); else - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } static inline bool -kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, - struct rcu_head *head, rcu_callback_t func) +kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) { - struct kfree_rcu_bulk_data *bnode; + struct kvfree_rcu_bulk_data *bnode; + int idx; if (unlikely(!krcp->initialized)) return false; lockdep_assert_held(&krcp->lock); + idx = !!is_vmalloc_addr(ptr); /* Check if a new block is required. */ - if (!krcp->bhead || - krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) { - bnode = xchg(&krcp->bcached, NULL); + if (!krcp->bkvhead[idx] || + krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) { + bnode = get_cached_bnode(krcp); if (!bnode) { - WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE); + /* + * To keep this path working on raw non-preemptible + * sections, prevent the optional entry into the + * allocator as it uses sleeping locks. In fact, even + * if the caller of kfree_rcu() is preemptible, this + * path still is not, as krcp->lock is a raw spinlock. + * With additional page pre-allocation in the works, + * hitting this return is going to be much less likely. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return false; - bnode = (struct kfree_rcu_bulk_data *) + /* + * NOTE: For one argument of kvfree_rcu() we can + * drop the lock and get the page in sleepable + * context. That would allow to maintain an array + * for the CONFIG_PREEMPT_RT as well if no cached + * pages are available. + */ + bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_NOWAIT | __GFP_NOWARN); } @@ -3000,53 +3334,62 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, /* Initialize the new block. */ bnode->nr_records = 0; - bnode->next = krcp->bhead; - bnode->head_free_debug = NULL; + bnode->next = krcp->bkvhead[idx]; /* Attach it to the head. */ - krcp->bhead = bnode; + krcp->bkvhead[idx] = bnode; } -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - head->func = func; - head->next = krcp->bhead->head_free_debug; - krcp->bhead->head_free_debug = head; -#endif - /* Finally insert. */ - krcp->bhead->records[krcp->bhead->nr_records++] = - (void *) head - (unsigned long) func; + krcp->bkvhead[idx]->records + [krcp->bkvhead[idx]->nr_records++] = ptr; return true; } /* - * Queue a request for lazy invocation of kfree_bulk()/kfree() after a grace - * period. Please note there are two paths are maintained, one is the main one - * that uses kfree_bulk() interface and second one is emergency one, that is - * used only when the main path can not be maintained temporary, due to memory - * pressure. + * Queue a request for lazy invocation of appropriate free routine after a + * grace period. Please note there are three paths are maintained, two are the + * main ones that use array of pointers interface and third one is emergency + * one, that is used only when the main path can not be maintained temporary, + * due to memory pressure. * - * Each kfree_call_rcu() request is added to a batch. The batch will be drained + * Each kvfree_call_rcu() request is added to a batch. The batch will be drained * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will * be free'd in workqueue context. This allows us to: batch requests together to - * reduce the number of grace periods during heavy kfree_rcu() load. + * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load. */ -void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; struct kfree_rcu_cpu *krcp; + bool success; + void *ptr; - local_irq_save(flags); // For safely calling this_cpu_ptr(). - krcp = this_cpu_ptr(&krc); - if (krcp->initialized) - spin_lock(&krcp->lock); + if (head) { + ptr = (void *) head - (unsigned long) func; + } else { + /* + * Please note there is a limitation for the head-less + * variant, that is why there is a clear rule for such + * objects: it can be used from might_sleep() context + * only. For other places please embed an rcu_head to + * your data. + */ + might_sleep(); + ptr = (unsigned long *) func; + } + + krcp = krc_this_cpu_lock(&flags); // Queue the object but don't yet schedule the batch. - if (debug_rcu_head_queue(head)) { + if (debug_rcu_head_queue(ptr)) { // Probable double kfree_rcu(), just leak. WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", __func__, head); + + // Mark as success and leave. + success = true; goto unlock_return; } @@ -3054,12 +3397,20 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) * Under high memory pressure GFP_NOWAIT can fail, * in that case the emergency path is maintained. */ - if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) { + success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr); + if (!success) { + if (head == NULL) + // Inline if kvfree_rcu(one_arg) call. + goto unlock_return; + head->func = func; head->next = krcp->head; krcp->head = head; + success = true; } + WRITE_ONCE(krcp->count, krcp->count + 1); + // Set timer to drain after KFREE_DRAIN_JIFFIES. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && !krcp->monitor_todo) { @@ -3068,11 +3419,70 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) } unlock_return: - if (krcp->initialized) - spin_unlock(&krcp->lock); - local_irq_restore(flags); + krc_this_cpu_unlock(krcp, flags); + + /* + * Inline kvfree() after synchronize_rcu(). We can do + * it from might_sleep() context only, so the current + * CPU can pass the QS state. + */ + if (!success) { + debug_rcu_head_unqueue((struct rcu_head *) ptr); + synchronize_rcu(); + kvfree(ptr); + } +} +EXPORT_SYMBOL_GPL(kvfree_call_rcu); + +static unsigned long +kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu; + unsigned long count = 0; + + /* Snapshot count of all CPUs */ + for_each_online_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + count += READ_ONCE(krcp->count); + } + + return count; } -EXPORT_SYMBOL_GPL(kfree_call_rcu); + +static unsigned long +kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu, freed = 0; + unsigned long flags; + + for_each_online_cpu(cpu) { + int count; + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + count = krcp->count; + raw_spin_lock_irqsave(&krcp->lock, flags); + if (krcp->monitor_todo) + kfree_rcu_drain_unlock(krcp, flags); + else + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + sc->nr_to_scan -= count; + freed += count; + + if (sc->nr_to_scan <= 0) + break; + } + + return freed == 0 ? SHRINK_STOP : freed; +} + +static struct shrinker kfree_rcu_shrinker = { + .count_objects = kfree_rcu_shrink_count, + .scan_objects = kfree_rcu_shrink_scan, + .batch = 0, + .seeks = DEFAULT_SEEKS, +}; void __init kfree_rcu_scheduler_running(void) { @@ -3082,15 +3492,15 @@ void __init kfree_rcu_scheduler_running(void) for_each_online_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (!krcp->head || krcp->monitor_todo) { - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); continue; } krcp->monitor_todo = true; schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES); - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } } @@ -3578,10 +3988,9 @@ void rcu_cpu_starting(unsigned int cpu) { unsigned long flags; unsigned long mask; - int nbits; - unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp; + bool newcpu; if (per_cpu(rcu_cpu_started, cpu)) return; @@ -3593,12 +4002,11 @@ void rcu_cpu_starting(unsigned int cpu) mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); - oldmask = rnp->expmaskinitnext; + newcpu = !(rnp->expmaskinitnext & mask); rnp->expmaskinitnext |= mask; - oldmask ^= rnp->expmaskinitnext; - nbits = bitmap_weight(&oldmask, BITS_PER_LONG); /* Allow lockless access for expedited grace periods. */ - smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */ + smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); /* ^^^ */ + ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus); rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags); @@ -3984,16 +4392,28 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + struct kvfree_rcu_bulk_data *bnode; - spin_lock_init(&krcp->lock); for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; } + for (i = 0; i < rcu_min_cached_objs; i++) { + bnode = (struct kvfree_rcu_bulk_data *) + __get_free_page(GFP_NOWAIT | __GFP_NOWARN); + + if (bnode) + put_cached_bnode(krcp, bnode); + else + pr_err("Failed to preallocate for %d CPU!\n", cpu); + } + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); krcp->initialized = true; } + if (register_shrinker(&kfree_rcu_shrinker)) + pr_err("Failed to register kfree_rcu() shrinker!\n"); } void __init rcu_init(void) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9dc2ec021da5..c96ae351688b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -41,7 +41,7 @@ struct rcu_node { raw_spinlock_t __private lock; /* Root rcu_node's lock protects */ /* some rcu_state fields as well as */ /* following. */ - unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */ + unsigned long gp_seq; /* Track rsp->gp_seq. */ unsigned long gp_seq_needed; /* Track furthest future GP request. */ unsigned long completedqs; /* All QSes done for this node. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ @@ -73,9 +73,9 @@ struct rcu_node { unsigned long ffmask; /* Fully functional CPUs. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ - int grplo; /* lowest-numbered CPU or group here. */ - int grphi; /* highest-numbered CPU or group here. */ - u8 grpnum; /* CPU/group number for next level up. */ + int grplo; /* lowest-numbered CPU here. */ + int grphi; /* highest-numbered CPU here. */ + u8 grpnum; /* group number for next level up. */ u8 level; /* root is at level 0. */ bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */ /* exit RCU read-side critical sections */ @@ -149,7 +149,7 @@ union rcu_noqs { /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ - unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */ + unsigned long gp_seq; /* Track rsp->gp_seq counter. */ unsigned long gp_seq_needed; /* Track furthest future GP request. */ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ bool core_needs_qs; /* Core waits for quiesc state. */ @@ -171,6 +171,7 @@ struct rcu_data { /* different grace periods. */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ + unsigned long n_cbs_invoked; /* # callbacks invoked since boot. */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ @@ -301,6 +302,8 @@ struct rcu_state { u8 boost ____cacheline_internodealigned_in_smp; /* Subject to priority boost. */ unsigned long gp_seq; /* Grace-period sequence #. */ + unsigned long gp_max; /* Maximum GP duration in */ + /* jiffies. */ struct task_struct *gp_kthread; /* Task for grace periods. */ struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ @@ -346,8 +349,6 @@ struct rcu_state { /* a reluctant CPU. */ unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */ /* GP start. */ - unsigned long gp_max; /* Maximum GP duration in */ - /* jiffies. */ const char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ @@ -359,6 +360,7 @@ struct rcu_state { /* Values for rcu_state structure's gp_flags field. */ #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ +#define RCU_GP_FLAG_OVLD 0x4 /* Experiencing callback overload. */ /* Values for rcu_state structure's gp_state field. */ #define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */ @@ -454,6 +456,8 @@ static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(void); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); +static void rcu_dynticks_task_trace_enter(void); +static void rcu_dynticks_task_trace_exit(void); /* Forward declarations for tree_stall.h */ static void record_gp_stall_check_time(void); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1a617b9dffb0..1888c0eb1216 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -150,7 +150,7 @@ static void __maybe_unused sync_exp_reset_tree(void) static bool sync_rcu_exp_done(struct rcu_node *rnp) { raw_lockdep_assert_held_rcu_node(rnp); - return rnp->exp_tasks == NULL && + return READ_ONCE(rnp->exp_tasks) == NULL && READ_ONCE(rnp->expmask) == 0; } @@ -373,7 +373,7 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) * until such time as the ->expmask bits are cleared. */ if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; + WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ @@ -403,7 +403,7 @@ retry_ipi: /* Online, so delay for a bit and try again. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("selectofl")); - schedule_timeout_uninterruptible(1); + schedule_timeout_idle(1); goto retry_ipi; } /* CPU really is offline, so we must report its QS. */ @@ -542,8 +542,8 @@ static void synchronize_rcu_expedited_wait(void) } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rcu_state.expedited_sequence, - READ_ONCE(rnp_root->expmask), - ".T"[!!rnp_root->exp_tasks]); + data_race(rnp_root->expmask), + ".T"[!!data_race(rnp_root->exp_tasks)]); if (ndetected) { pr_err("blocking rcu_node structures:"); rcu_for_each_node_breadth_first(rnp) { @@ -553,8 +553,8 @@ static void synchronize_rcu_expedited_wait(void) continue; pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, - READ_ONCE(rnp->expmask), - ".T"[!!rnp->exp_tasks]); + data_race(rnp->expmask), + ".T"[!!data_race(rnp->exp_tasks)]); } pr_cont("\n"); } @@ -639,6 +639,7 @@ static void wait_rcu_exp_gp(struct work_struct *wp) */ static void rcu_exp_handler(void *unused) { + int depth = rcu_preempt_depth(); unsigned long flags; struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; @@ -649,7 +650,7 @@ static void rcu_exp_handler(void *unused) * critical section. If also enabled or idle, immediately * report the quiescent state, otherwise defer. */ - if (!rcu_preempt_depth()) { + if (!depth) { if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || rcu_dynticks_curr_cpu_in_eqs()) { rcu_report_exp_rdp(rdp); @@ -673,7 +674,7 @@ static void rcu_exp_handler(void *unused) * can have caused this quiescent state to already have been * reported, so we really do need to check ->expmask. */ - if (rcu_preempt_depth() > 0) { + if (depth > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { rdp->exp_deferred_qs = true; @@ -683,30 +684,8 @@ static void rcu_exp_handler(void *unused) return; } - /* - * The final and least likely case is where the interrupted - * code was just about to or just finished exiting the RCU-preempt - * read-side critical section, and no, we can't tell which. - * So either way, set ->deferred_qs to flag later code that - * a quiescent state is required. - * - * If the CPU is fully enabled (or if some buggy RCU-preempt - * read-side critical section is being used from idle), just - * invoke rcu_preempt_deferred_qs() to immediately report the - * quiescent state. We cannot use rcu_read_unlock_special() - * because we are in an interrupt handler, which will cause that - * function to take an early exit without doing anything. - * - * Otherwise, force a context switch after the CPU enables everything. - */ - rdp->exp_deferred_qs = true; - if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || - WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) { - rcu_preempt_deferred_qs(t); - } else { - set_tsk_need_resched(t); - set_preempt_need_resched(); - } + // Finally, negative nesting depth should not happen. + WARN_ON_ONCE(1); } /* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */ @@ -721,17 +700,20 @@ static void sync_sched_exp_online_cleanup(int cpu) */ static int rcu_print_task_exp_stall(struct rcu_node *rnp) { - struct task_struct *t; + unsigned long flags; int ndetected = 0; + struct task_struct *t; - if (!rnp->exp_tasks) + if (!READ_ONCE(rnp->exp_tasks)) return 0; + raw_spin_lock_irqsave_rcu_node(rnp, flags); t = list_entry(rnp->exp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { pr_cont(" P%d", t->pid); ndetected++; } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return ndetected; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 097635c41135..982fc5be5269 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -226,7 +226,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq); } if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) - rnp->exp_tasks = &t->rcu_node_entry; + WRITE_ONCE(rnp->exp_tasks, &t->rcu_node_entry); WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != !(rnp->qsmask & rdp->grpmask)); WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) != @@ -331,6 +331,7 @@ void rcu_note_context_switch(bool preempt) rcu_qs(); if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); + rcu_tasks_qs(current, preempt); trace_rcu_utilization(TPS("End context switch")); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -345,9 +346,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) return READ_ONCE(rnp->gp_tasks) != NULL; } -/* Bias and limit values for ->rcu_read_lock_nesting. */ -#define RCU_NEST_BIAS INT_MAX -#define RCU_NEST_NMAX (-INT_MAX / 2) +/* limit value for ->rcu_read_lock_nesting. */ #define RCU_NEST_PMAX (INT_MAX / 2) static void rcu_preempt_read_enter(void) @@ -355,9 +354,9 @@ static void rcu_preempt_read_enter(void) current->rcu_read_lock_nesting++; } -static void rcu_preempt_read_exit(void) +static int rcu_preempt_read_exit(void) { - current->rcu_read_lock_nesting--; + return --current->rcu_read_lock_nesting; } static void rcu_preempt_depth_set(int val) @@ -390,21 +389,15 @@ void __rcu_read_unlock(void) { struct task_struct *t = current; - if (rcu_preempt_depth() != 1) { - rcu_preempt_read_exit(); - } else { + if (rcu_preempt_read_exit() == 0) { barrier(); /* critical section before exit code. */ - rcu_preempt_depth_set(-RCU_NEST_BIAS); - barrier(); /* assign before ->rcu_read_unlock_special load */ if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); - barrier(); /* ->rcu_read_unlock_special load before assign */ - rcu_preempt_depth_set(0); } if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { int rrln = rcu_preempt_depth(); - WARN_ON_ONCE(rrln < 0 && rrln > RCU_NEST_NMAX); + WARN_ON_ONCE(rrln < 0 || rrln > RCU_NEST_PMAX); } } EXPORT_SYMBOL_GPL(__rcu_read_unlock); @@ -500,12 +493,12 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) if (&t->rcu_node_entry == rnp->gp_tasks) WRITE_ONCE(rnp->gp_tasks, np); if (&t->rcu_node_entry == rnp->exp_tasks) - rnp->exp_tasks = np; + WRITE_ONCE(rnp->exp_tasks, np); if (IS_ENABLED(CONFIG_RCU_BOOST)) { /* Snapshot ->boost_mtx ownership w/rnp->lock held. */ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; if (&t->rcu_node_entry == rnp->boost_tasks) - rnp->boost_tasks = np; + WRITE_ONCE(rnp->boost_tasks, np); } /* @@ -556,7 +549,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return (__this_cpu_read(rcu_data.exp_deferred_qs) || READ_ONCE(t->rcu_read_unlock_special.s)) && - rcu_preempt_depth() <= 0; + rcu_preempt_depth() == 0; } /* @@ -569,16 +562,11 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) static void rcu_preempt_deferred_qs(struct task_struct *t) { unsigned long flags; - bool couldrecurse = rcu_preempt_depth() >= 0; if (!rcu_preempt_need_deferred_qs(t)) return; - if (couldrecurse) - rcu_preempt_depth_set(rcu_preempt_depth() - RCU_NEST_BIAS); local_irq_save(flags); rcu_preempt_deferred_qs_irqrestore(t, flags); - if (couldrecurse) - rcu_preempt_depth_set(rcu_preempt_depth() + RCU_NEST_BIAS); } /* @@ -615,19 +603,18 @@ static void rcu_read_unlock_special(struct task_struct *t) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) || - (rdp->grpmask & READ_ONCE(rnp->expmask)) || - tick_nohz_full_cpu(rdp->cpu); + exp = (t->rcu_blocked_node && + READ_ONCE(t->rcu_blocked_node->exp_tasks)) || + (rdp->grpmask & READ_ONCE(rnp->expmask)); // Need to defer quiescent state until everything is enabled. - if (irqs_were_disabled && use_softirq && - (in_interrupt() || - (exp && !t->rcu_read_unlock_special.b.deferred_qs))) { - // Using softirq, safe to awaken, and we get - // no help from enabling irqs, unlike bh/preempt. + if (use_softirq && (in_irq() || (exp && !irqs_were_disabled))) { + // Using softirq, safe to awaken, and either the + // wakeup is free or there is an expedited GP. raise_softirq_irqoff(RCU_SOFTIRQ); } else { // Enabling BH or preempt does reschedule, so... - // Also if no expediting or NO_HZ_FULL, slow is OK. + // Also if no expediting, slow is OK. + // Plus nohz_full CPUs eventually get tick enabled. set_tsk_need_resched(current); set_preempt_need_resched(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && @@ -640,7 +627,6 @@ static void rcu_read_unlock_special(struct task_struct *t) irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } } - t->rcu_read_unlock_special.b.deferred_qs = true; local_irq_restore(flags); return; } @@ -699,7 +685,7 @@ static void rcu_flavor_sched_clock_irq(int user) } else if (rcu_preempt_need_deferred_qs(t)) { rcu_preempt_deferred_qs(t); /* Report deferred QS. */ return; - } else if (!rcu_preempt_depth()) { + } else if (!WARN_ON_ONCE(rcu_preempt_depth())) { rcu_qs(); /* Report immediate QS. */ return; } @@ -760,8 +746,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", - __func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks, - rnp->exp_tasks); + __func__, READ_ONCE(rnp->gp_tasks), data_race(rnp->boost_tasks), + READ_ONCE(rnp->exp_tasks)); pr_info("%s: ->blkd_tasks", __func__); i = 0; list_for_each(lhp, &rnp->blkd_tasks) { @@ -854,8 +840,7 @@ void rcu_note_context_switch(bool preempt) this_cpu_write(rcu_data.rcu_urgent_qs, false); if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) rcu_momentary_dyntick_idle(); - if (!preempt) - rcu_tasks_qs(current); + rcu_tasks_qs(current, preempt); out: trace_rcu_utilization(TPS("End context switch")); } @@ -1036,7 +1021,8 @@ static int rcu_boost_kthread(void *arg) for (;;) { WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING); trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); - rcu_wait(rnp->boost_tasks || rnp->exp_tasks); + rcu_wait(READ_ONCE(rnp->boost_tasks) || + READ_ONCE(rnp->exp_tasks)); trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING); more2boost = rcu_boost(rnp); @@ -1047,7 +1033,7 @@ static int rcu_boost_kthread(void *arg) if (spincnt > 10) { WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING); trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); - schedule_timeout_interruptible(2); + schedule_timeout_idle(2); trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); spincnt = 0; } @@ -1079,9 +1065,9 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) (rnp->gp_tasks != NULL && rnp->boost_tasks == NULL && rnp->qsmask == 0 && - (ULONG_CMP_GE(jiffies, rnp->boost_time) || rcu_state.cbovld))) { + (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld))) { if (rnp->exp_tasks == NULL) - rnp->boost_tasks = rnp->gp_tasks; + WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_wake_cond(rnp->boost_kthread_task, READ_ONCE(rnp->boost_kthread_status)); @@ -2019,7 +2005,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) /* Polling, so trace if first poll in the series. */ if (gotcbs) trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll")); - schedule_timeout_interruptible(1); + schedule_timeout_idle(1); } else if (!needwait_gp) { /* Wait for callbacks to appear. */ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); @@ -2536,7 +2522,7 @@ static bool rcu_nohz_full_cpu(void) #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_cpu(smp_processor_id()) && (!rcu_gp_in_progress() || - ULONG_CMP_LT(jiffies, READ_ONCE(rcu_state.gp_start) + HZ))) + time_before(jiffies, READ_ONCE(rcu_state.gp_start) + HZ))) return true; #endif /* #ifdef CONFIG_NO_HZ_FULL */ return false; @@ -2553,7 +2539,7 @@ static void rcu_bind_gp_kthread(void) } /* Record the current task on dyntick-idle entry. */ -static void rcu_dynticks_task_enter(void) +static void noinstr rcu_dynticks_task_enter(void) { #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id()); @@ -2561,9 +2547,27 @@ static void rcu_dynticks_task_enter(void) } /* Record no current task on dyntick-idle exit. */ -static void rcu_dynticks_task_exit(void) +static void noinstr rcu_dynticks_task_exit(void) { #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) WRITE_ONCE(current->rcu_tasks_idle_cpu, -1); #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ } + +/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */ +static void rcu_dynticks_task_trace_enter(void) +{ +#ifdef CONFIG_TASKS_RCU_TRACE + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + current->trc_reader_special.b.need_mb = true; +#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */ +} + +/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */ +static void rcu_dynticks_task_trace_exit(void) +{ +#ifdef CONFIG_TASKS_RCU_TRACE + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + current->trc_reader_special.b.need_mb = false; +#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */ +} diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 119ed6afd20f..b5d3b4794db4 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -15,10 +15,12 @@ int sysctl_panic_on_rcu_stall __read_mostly; #ifdef CONFIG_PROVE_RCU -#define RCU_STALL_DELAY_DELTA (5 * HZ) +#define RCU_STALL_DELAY_DELTA (5 * HZ) #else -#define RCU_STALL_DELAY_DELTA 0 +#define RCU_STALL_DELAY_DELTA 0 #endif +#define RCU_STALL_MIGHT_DIV 8 +#define RCU_STALL_MIGHT_MIN (2 * HZ) /* Limit-check stall timeouts specified at boottime and runtime. */ int rcu_jiffies_till_stall_check(void) @@ -40,6 +42,36 @@ int rcu_jiffies_till_stall_check(void) } EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); +/** + * rcu_gp_might_be_stalled - Is it likely that the grace period is stalled? + * + * Returns @true if the current grace period is sufficiently old that + * it is reasonable to assume that it might be stalled. This can be + * useful when deciding whether to allocate memory to enable RCU-mediated + * freeing on the one hand or just invoking synchronize_rcu() on the other. + * The latter is preferable when the grace period is stalled. + * + * Note that sampling of the .gp_start and .gp_seq fields must be done + * carefully to avoid false positives at the beginnings and ends of + * grace periods. + */ +bool rcu_gp_might_be_stalled(void) +{ + unsigned long d = rcu_jiffies_till_stall_check() / RCU_STALL_MIGHT_DIV; + unsigned long j = jiffies; + + if (d < RCU_STALL_MIGHT_MIN) + d = RCU_STALL_MIGHT_MIN; + smp_mb(); // jiffies before .gp_seq to avoid false positives. + if (!rcu_gp_in_progress()) + return false; + // Long delays at this point avoids false positive, but a delay + // of ULONG_MAX/4 jiffies voids your no-false-positive warranty. + smp_mb(); // .gp_seq before second .gp_start + // And ditto here. + return !time_before(j, READ_ONCE(rcu_state.gp_start) + d); +} + /* Don't do RCU CPU stall warnings during long sysrq printouts. */ void rcu_sysrq_start(void) { @@ -104,8 +136,8 @@ static void record_gp_stall_check_time(void) WRITE_ONCE(rcu_state.gp_start, j); j1 = rcu_jiffies_till_stall_check(); - /* Record ->gp_start before ->jiffies_stall. */ - smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ + smp_mb(); // ->gp_start before ->jiffies_stall and caller's ->gp_seq. + WRITE_ONCE(rcu_state.jiffies_stall, j + j1); rcu_state.jiffies_resched = j + j1 / 2; rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); } @@ -192,14 +224,38 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } +// Communicate task state back to the RCU CPU stall warning request. +struct rcu_stall_chk_rdr { + int nesting; + union rcu_special rs; + bool on_blkd_list; +}; + +/* + * Report out the state of a not-running task that is stalling the + * current RCU grace period. + */ +static bool check_slow_task(struct task_struct *t, void *arg) +{ + struct rcu_stall_chk_rdr *rscrp = arg; + + if (task_curr(t)) + return false; // It is running, so decline to inspect it. + rscrp->nesting = t->rcu_read_lock_nesting; + rscrp->rs = t->rcu_read_unlock_special; + rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry); + return true; +} + /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each. */ static int rcu_print_task_stall(struct rcu_node *rnp) { - struct task_struct *t; int ndetected = 0; + struct rcu_stall_chk_rdr rscr; + struct task_struct *t; if (!rcu_preempt_blocked_readers_cgp(rnp)) return 0; @@ -208,7 +264,15 @@ static int rcu_print_task_stall(struct rcu_node *rnp) t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - pr_cont(" P%d", t->pid); + if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr)) + pr_cont(" P%d", t->pid); + else + pr_cont(" P%d/%d:%c%c%c%c", + t->pid, rscr.nesting, + ".b"[rscr.rs.b.blocked], + ".q"[rscr.rs.b.need_qs], + ".e"[rscr.rs.b.exp_hint], + ".l"[rscr.on_blkd_list]); ndetected++; } pr_cont("\n"); @@ -299,6 +363,16 @@ static const char *gp_state_getname(short gs) return gp_state_names[gs]; } +/* Is the RCU grace-period kthread being starved of CPU time? */ +static bool rcu_is_gp_kthread_starving(unsigned long *jp) +{ + unsigned long j = jiffies - READ_ONCE(rcu_state.gp_activity); + + if (jp) + *jp = j; + return j > 2 * HZ; +} + /* * Print out diagnostic information for the specified stalled CPU. * @@ -313,6 +387,7 @@ static const char *gp_state_getname(short gs) static void print_cpu_stall_info(int cpu) { unsigned long delta; + bool falsepositive; char fast_no_hz[72]; struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); char *ticks_title; @@ -333,7 +408,9 @@ static void print_cpu_stall_info(int cpu) } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", + falsepositive = rcu_is_gp_kthread_starving(NULL) && + rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)); + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s%s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], @@ -345,8 +422,9 @@ static void print_cpu_stall_info(int cpu) rcu_dynticks_snap(rdp) & 0xfff, rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), - READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, - fast_no_hz); + data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, + fast_no_hz, + falsepositive ? " (false positive?)" : ""); } /* Complain about starvation of grace-period kthread. */ @@ -355,15 +433,15 @@ static void rcu_check_gp_kthread_starvation(void) struct task_struct *gpk = rcu_state.gp_kthread; unsigned long j; - j = jiffies - READ_ONCE(rcu_state.gp_activity); - if (j > 2 * HZ) { + if (rcu_is_gp_kthread_starving(&j)) { pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", rcu_state.name, j, (long)rcu_seq_current(&rcu_state.gp_seq), - READ_ONCE(rcu_state.gp_flags), + data_race(rcu_state.gp_flags), gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); if (gpk) { + pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name); pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(gpk); wake_up_process(gpk); @@ -371,7 +449,7 @@ static void rcu_check_gp_kthread_starvation(void) } } -static void print_other_cpu_stall(unsigned long gp_seq) +static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) { int cpu; unsigned long flags; @@ -388,7 +466,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) /* * OK, time to rat on our buddy... - * See Documentation/RCU/stallwarn.txt for info on how to debug + * See Documentation/RCU/stallwarn.rst for info on how to debug * RCU CPU stall warnings. */ pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name); @@ -408,7 +486,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", - smp_processor_id(), (long)(jiffies - rcu_state.gp_start), + smp_processor_id(), (long)(jiffies - gps), (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); if (ndetected) { rcu_dump_cpu_stacks(); @@ -421,13 +499,11 @@ static void print_other_cpu_stall(unsigned long gp_seq) pr_err("INFO: Stall ended before state dump start\n"); } else { j = jiffies; - gpa = READ_ONCE(rcu_state.gp_activity); + gpa = data_race(rcu_state.gp_activity); pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", rcu_state.name, j - gpa, j, gpa, - READ_ONCE(jiffies_till_next_fqs), + data_race(jiffies_till_next_fqs), rcu_get_root()->qsmask); - /* In this case, the current CPU might be at fault. */ - sched_show_task(current); } } /* Rewrite if needed in case of slow consoles. */ @@ -442,7 +518,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) rcu_force_quiescent_state(); /* Kick them all. */ } -static void print_cpu_stall(void) +static void print_cpu_stall(unsigned long gps) { int cpu; unsigned long flags; @@ -457,7 +533,7 @@ static void print_cpu_stall(void) /* * OK, time to rat on ourselves... - * See Documentation/RCU/stallwarn.txt for info on how to debug + * See Documentation/RCU/stallwarn.rst for info on how to debug * RCU CPU stall warnings. */ pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); @@ -467,7 +543,7 @@ static void print_cpu_stall(void) for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n", - jiffies - rcu_state.gp_start, + jiffies - gps, (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); rcu_check_gp_kthread_starvation(); @@ -546,7 +622,7 @@ static void check_cpu_stall(struct rcu_data *rdp) cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { /* We haven't checked in, so go dump stack. */ - print_cpu_stall(); + print_cpu_stall(gps); if (rcu_cpu_stall_ftrace_dump) rcu_ftrace_dump(DUMP_ALL); @@ -555,7 +631,7 @@ static void check_cpu_stall(struct rcu_data *rdp) cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(gs2); + print_other_cpu_stall(gs2, gps); if (rcu_cpu_stall_ftrace_dump) rcu_ftrace_dump(DUMP_ALL); } @@ -571,6 +647,7 @@ static void check_cpu_stall(struct rcu_data *rdp) */ void show_rcu_gp_kthreads(void) { + unsigned long cbs = 0; int cpu; unsigned long j; unsigned long ja; @@ -581,23 +658,23 @@ void show_rcu_gp_kthreads(void) struct task_struct *t = READ_ONCE(rcu_state.gp_kthread); j = jiffies; - ja = j - READ_ONCE(rcu_state.gp_activity); - jr = j - READ_ONCE(rcu_state.gp_req_activity); - jw = j - READ_ONCE(rcu_state.gp_wake_time); + ja = j - data_race(rcu_state.gp_activity); + jr = j - data_race(rcu_state.gp_req_activity); + jw = j - data_race(rcu_state.gp_wake_time); pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", rcu_state.name, gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, t ? t->state : 0x1ffffL, - ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), - (long)READ_ONCE(rcu_state.gp_seq), - (long)READ_ONCE(rcu_get_root()->gp_seq_needed), - READ_ONCE(rcu_state.gp_flags)); + ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq), + (long)data_race(rcu_state.gp_seq), + (long)data_race(rcu_get_root()->gp_seq_needed), + data_race(rcu_state.gp_flags)); rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed))) continue; pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", - rnp->grplo, rnp->grphi, (long)READ_ONCE(rnp->gp_seq), - (long)READ_ONCE(rnp->gp_seq_needed)); + rnp->grplo, rnp->grphi, (long)data_race(rnp->gp_seq), + (long)data_race(rnp->gp_seq_needed)); if (!rcu_is_leaf_node(rnp)) continue; for_each_leaf_node_possible_cpu(rnp, cpu) { @@ -607,15 +684,17 @@ void show_rcu_gp_kthreads(void) READ_ONCE(rdp->gp_seq_needed))) continue; pr_info("\tcpu %d ->gp_seq_needed %ld\n", - cpu, (long)READ_ONCE(rdp->gp_seq_needed)); + cpu, (long)data_race(rdp->gp_seq_needed)); } } for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); + cbs += data_race(rdp->n_cbs_invoked); if (rcu_segcblist_is_offloaded(&rdp->cblist)) show_rcu_nocb_state(rdp); } - /* sched_show_task(rcu_state.gp_kthread); */ + pr_info("RCU callbacks invoked since boot: %lu\n", cbs); + show_rcu_tasks_gp_kthreads(); } EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); @@ -729,7 +808,7 @@ static void sysrq_show_rcu(int key) show_rcu_gp_kthreads(); } -static struct sysrq_key_op sysrq_rcudump_op = { +static const struct sysrq_key_op sysrq_rcudump_op = { .handler = sysrq_show_rcu, .help_msg = "show-rcu(y)", .action_msg = "Show RCU tree", diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 28a8bdc5072f..2de49b5d8dd2 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -41,6 +41,8 @@ #include <linux/sched/isolation.h> #include <linux/kprobes.h> #include <linux/slab.h> +#include <linux/irq_work.h> +#include <linux/rcupdate_trace.h> #define CREATE_TRACE_POINTS @@ -51,6 +53,19 @@ #endif #define MODULE_PARAM_PREFIX "rcupdate." +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif + #ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); @@ -63,12 +78,12 @@ module_param(rcu_normal_after_boot, int, 0); * rcu_read_lock_held_common() - might we be in RCU-sched read-side critical section? * @ret: Best guess answer if lockdep cannot be relied on * - * Returns true if lockdep must be ignored, in which case *ret contains + * Returns true if lockdep must be ignored, in which case ``*ret`` contains * the best guess described below. Otherwise returns false, in which - * case *ret tells the caller nothing and the caller should instead + * case ``*ret`` tells the caller nothing and the caller should instead * consult lockdep. * - * If CONFIG_DEBUG_LOCK_ALLOC is selected, set *ret to nonzero iff in an + * If CONFIG_DEBUG_LOCK_ALLOC is selected, set ``*ret`` to nonzero iff in an * RCU-sched read-side critical section. In absence of * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side * critical section unless it can prove otherwise. Note that disabling @@ -82,7 +97,7 @@ module_param(rcu_normal_after_boot, int, 0); * * Note that if the CPU is in the idle loop from an RCU point of view (ie: * that we are in the section between rcu_idle_enter() and rcu_idle_exit()) - * then rcu_read_lock_held() sets *ret to false even if the CPU did an + * then rcu_read_lock_held() sets ``*ret`` to false even if the CPU did an * rcu_read_lock(). The reason for this is that RCU ignores CPUs that are * in such a section, considering these as in extended quiescent state, * so such a CPU is effectively never in an RCU read-side critical section @@ -98,15 +113,15 @@ module_param(rcu_normal_after_boot, int, 0); static bool rcu_read_lock_held_common(bool *ret) { if (!debug_lockdep_rcu_enabled()) { - *ret = 1; + *ret = true; return true; } if (!rcu_is_watching()) { - *ret = 0; + *ret = false; return true; } if (!rcu_lockdep_current_cpu_online()) { - *ret = 0; + *ret = false; return true; } return false; @@ -193,7 +208,7 @@ void rcu_end_inkernel_boot(void) rcu_unexpedite_gp(); if (rcu_normal_after_boot) WRITE_ONCE(rcu_normal, 1); - rcu_boot_ended = 1; + rcu_boot_ended = true; } /* @@ -265,18 +280,18 @@ struct lockdep_map rcu_sched_lock_map = { }; EXPORT_SYMBOL_GPL(rcu_sched_lock_map); +// Tell lockdep when RCU callbacks are being invoked. static struct lock_class_key rcu_callback_key; struct lockdep_map rcu_callback_map = STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key); EXPORT_SYMBOL_GPL(rcu_callback_map); -int notrace debug_lockdep_rcu_enabled(void) +noinstr int notrace debug_lockdep_rcu_enabled(void) { return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks && current->lockdep_recursion == 0; } EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); -NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled); /** * rcu_read_lock_held() - might we be in RCU read-side critical section? @@ -377,13 +392,14 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, might_sleep(); continue; } - init_rcu_head_on_stack(&rs_array[i].head); - init_completion(&rs_array[i].completion); for (j = 0; j < i; j++) if (crcu_array[j] == crcu_array[i]) break; - if (j == i) + if (j == i) { + init_rcu_head_on_stack(&rs_array[i].head); + init_completion(&rs_array[i].completion); (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); + } } /* Wait for all callbacks to be invoked. */ @@ -394,9 +410,10 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, for (j = 0; j < i; j++) if (crcu_array[j] == crcu_array[i]) break; - if (j == i) + if (j == i) { wait_for_completion(&rs_array[i].completion); - destroy_rcu_head_on_stack(&rs_array[i].head); + destroy_rcu_head_on_stack(&rs_array[i].head); + } } } EXPORT_SYMBOL_GPL(__wait_rcu_gp); @@ -501,370 +518,6 @@ int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls. EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot); module_param(rcu_cpu_stall_suppress_at_boot, int, 0444); -#ifdef CONFIG_TASKS_RCU - -/* - * Simple variant of RCU whose quiescent states are voluntary context - * switch, cond_resched_rcu_qs(), user-space execution, and idle. - * As such, grace periods can take one good long time. There are no - * read-side primitives similar to rcu_read_lock() and rcu_read_unlock() - * because this implementation is intended to get the system into a safe - * state for some of the manipulations involved in tracing and the like. - * Finally, this implementation does not support high call_rcu_tasks() - * rates from multiple CPUs. If this is required, per-CPU callback lists - * will be needed. - */ - -/* Global list of callbacks and associated lock. */ -static struct rcu_head *rcu_tasks_cbs_head; -static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; -static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq); -static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); - -/* Track exiting tasks in order to allow them to be waited for. */ -DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); - -/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ -#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) -static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; -module_param(rcu_task_stall_timeout, int, 0644); - -static struct task_struct *rcu_tasks_kthread_ptr; - -/** - * call_rcu_tasks() - Queue an RCU for invocation task-based grace period - * @rhp: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_tasks() assumes - * that the read-side critical sections end at a voluntary context - * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, - * or transition to usermode execution. As such, there are no read-side - * primitives analogous to rcu_read_lock() and rcu_read_unlock() because - * this primitive is intended to determine that all tasks have passed - * through a safe state, not so much for data-strcuture synchronization. - * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ -void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) -{ - unsigned long flags; - bool needwake; - - rhp->next = NULL; - rhp->func = func; - raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); - needwake = !rcu_tasks_cbs_head; - WRITE_ONCE(*rcu_tasks_cbs_tail, rhp); - rcu_tasks_cbs_tail = &rhp->next; - raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); - /* We can't create the thread unless interrupts are enabled. */ - if (needwake && READ_ONCE(rcu_tasks_kthread_ptr)) - wake_up(&rcu_tasks_cbs_wq); -} -EXPORT_SYMBOL_GPL(call_rcu_tasks); - -/** - * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. - * - * Control will return to the caller some time after a full rcu-tasks - * grace period has elapsed, in other words after all currently - * executing rcu-tasks read-side critical sections have elapsed. These - * read-side critical sections are delimited by calls to schedule(), - * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls - * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). - * - * This is a very specialized primitive, intended only for a few uses in - * tracing and other situations requiring manipulation of function - * preambles and profiling hooks. The synchronize_rcu_tasks() function - * is not (yet) intended for heavy use from multiple CPUs. - * - * Note that this guarantee implies further memory-ordering guarantees. - * On systems with more than one CPU, when synchronize_rcu_tasks() returns, - * each CPU is guaranteed to have executed a full memory barrier since the - * end of its last RCU-tasks read-side critical section whose beginning - * preceded the call to synchronize_rcu_tasks(). In addition, each CPU - * having an RCU-tasks read-side critical section that extends beyond - * the return from synchronize_rcu_tasks() is guaranteed to have executed - * a full memory barrier after the beginning of synchronize_rcu_tasks() - * and before the beginning of that RCU-tasks read-side critical section. - * Note that these guarantees include CPUs that are offline, idle, or - * executing in user mode, as well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU - * (but again only if the system has more than one CPU). - */ -void synchronize_rcu_tasks(void) -{ - /* Complain if the scheduler has not started. */ - RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, - "synchronize_rcu_tasks called too soon"); - - /* Wait for the grace period. */ - wait_rcu_gp(call_rcu_tasks); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); - -/** - * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. - * - * Although the current implementation is guaranteed to wait, it is not - * obligated to, for example, if there are no pending callbacks. - */ -void rcu_barrier_tasks(void) -{ - /* There is only one callback queue, so this is easy. ;-) */ - synchronize_rcu_tasks(); -} -EXPORT_SYMBOL_GPL(rcu_barrier_tasks); - -/* See if tasks are still holding out, complain if so. */ -static void check_holdout_task(struct task_struct *t, - bool needreport, bool *firstreport) -{ - int cpu; - - if (!READ_ONCE(t->rcu_tasks_holdout) || - t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || - !READ_ONCE(t->on_rq) || - (IS_ENABLED(CONFIG_NO_HZ_FULL) && - !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { - WRITE_ONCE(t->rcu_tasks_holdout, false); - list_del_init(&t->rcu_tasks_holdout_list); - put_task_struct(t); - return; - } - rcu_request_urgent_qs_task(t); - if (!needreport) - return; - if (*firstreport) { - pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); - *firstreport = false; - } - cpu = task_cpu(t); - pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", - t, ".I"[is_idle_task(t)], - "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], - t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, - t->rcu_tasks_idle_cpu, cpu); - sched_show_task(t); -} - -/* RCU-tasks kthread that detects grace periods and invokes callbacks. */ -static int __noreturn rcu_tasks_kthread(void *arg) -{ - unsigned long flags; - struct task_struct *g, *t; - unsigned long lastreport; - struct rcu_head *list; - struct rcu_head *next; - LIST_HEAD(rcu_tasks_holdouts); - int fract; - - /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ - housekeeping_affine(current, HK_FLAG_RCU); - - /* - * Each pass through the following loop makes one check for - * newly arrived callbacks, and, if there are some, waits for - * one RCU-tasks grace period and then invokes the callbacks. - * This loop is terminated by the system going down. ;-) - */ - for (;;) { - - /* Pick up any new callbacks. */ - raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); - list = rcu_tasks_cbs_head; - rcu_tasks_cbs_head = NULL; - rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; - raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); - - /* If there were none, wait a bit and start over. */ - if (!list) { - wait_event_interruptible(rcu_tasks_cbs_wq, - READ_ONCE(rcu_tasks_cbs_head)); - if (!rcu_tasks_cbs_head) { - WARN_ON(signal_pending(current)); - schedule_timeout_interruptible(HZ/10); - } - continue; - } - - /* - * Wait for all pre-existing t->on_rq and t->nvcsw - * transitions to complete. Invoking synchronize_rcu() - * suffices because all these transitions occur with - * interrupts disabled. Without this synchronize_rcu(), - * a read-side critical section that started before the - * grace period might be incorrectly seen as having started - * after the grace period. - * - * This synchronize_rcu() also dispenses with the - * need for a memory barrier on the first store to - * ->rcu_tasks_holdout, as it forces the store to happen - * after the beginning of the grace period. - */ - synchronize_rcu(); - - /* - * There were callbacks, so we need to wait for an - * RCU-tasks grace period. Start off by scanning - * the task list for tasks that are not already - * voluntarily blocked. Mark these tasks and make - * a list of them in rcu_tasks_holdouts. - */ - rcu_read_lock(); - for_each_process_thread(g, t) { - if (t != current && READ_ONCE(t->on_rq) && - !is_idle_task(t)) { - get_task_struct(t); - t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); - WRITE_ONCE(t->rcu_tasks_holdout, true); - list_add(&t->rcu_tasks_holdout_list, - &rcu_tasks_holdouts); - } - } - rcu_read_unlock(); - - /* - * Wait for tasks that are in the process of exiting. - * This does only part of the job, ensuring that all - * tasks that were previously exiting reach the point - * where they have disabled preemption, allowing the - * later synchronize_rcu() to finish the job. - */ - synchronize_srcu(&tasks_rcu_exit_srcu); - - /* - * Each pass through the following loop scans the list - * of holdout tasks, removing any that are no longer - * holdouts. When the list is empty, we are done. - */ - lastreport = jiffies; - - /* Start off with HZ/10 wait and slowly back off to 1 HZ wait*/ - fract = 10; - - for (;;) { - bool firstreport; - bool needreport; - int rtst; - struct task_struct *t1; - - if (list_empty(&rcu_tasks_holdouts)) - break; - - /* Slowly back off waiting for holdouts */ - schedule_timeout_interruptible(HZ/fract); - - if (fract > 1) - fract--; - - rtst = READ_ONCE(rcu_task_stall_timeout); - needreport = rtst > 0 && - time_after(jiffies, lastreport + rtst); - if (needreport) - lastreport = jiffies; - firstreport = true; - WARN_ON(signal_pending(current)); - list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, - rcu_tasks_holdout_list) { - check_holdout_task(t, needreport, &firstreport); - cond_resched(); - } - } - - /* - * Because ->on_rq and ->nvcsw are not guaranteed - * to have a full memory barriers prior to them in the - * schedule() path, memory reordering on other CPUs could - * cause their RCU-tasks read-side critical sections to - * extend past the end of the grace period. However, - * because these ->nvcsw updates are carried out with - * interrupts disabled, we can use synchronize_rcu() - * to force the needed ordering on all such CPUs. - * - * This synchronize_rcu() also confines all - * ->rcu_tasks_holdout accesses to be within the grace - * period, avoiding the need for memory barriers for - * ->rcu_tasks_holdout accesses. - * - * In addition, this synchronize_rcu() waits for exiting - * tasks to complete their final preempt_disable() region - * of execution, cleaning up after the synchronize_srcu() - * above. - */ - synchronize_rcu(); - - /* Invoke the callbacks. */ - while (list) { - next = list->next; - local_bh_disable(); - list->func(list); - local_bh_enable(); - list = next; - cond_resched(); - } - /* Paranoid sleep to keep this from entering a tight loop */ - schedule_timeout_uninterruptible(HZ/10); - } -} - -/* Spawn rcu_tasks_kthread() at core_initcall() time. */ -static int __init rcu_spawn_tasks_kthread(void) -{ - struct task_struct *t; - - t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); - if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__)) - return 0; - smp_mb(); /* Ensure others see full kthread. */ - WRITE_ONCE(rcu_tasks_kthread_ptr, t); - return 0; -} -core_initcall(rcu_spawn_tasks_kthread); - -/* Do the srcu_read_lock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) -{ - preempt_disable(); - current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); - preempt_enable(); -} - -/* Do the srcu_read_unlock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) -{ - preempt_disable(); - __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); - preempt_enable(); -} - -#endif /* #ifdef CONFIG_TASKS_RCU */ - -#ifndef CONFIG_TINY_RCU - -/* - * Print any non-default Tasks RCU settings. - */ -static void __init rcu_tasks_bootup_oddness(void) -{ -#ifdef CONFIG_TASKS_RCU - if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) - pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); - else - pr_info("\tTasks RCU enabled.\n"); -#endif /* #ifdef CONFIG_TASKS_RCU */ -} - -#endif /* #ifndef CONFIG_TINY_RCU */ - #ifdef CONFIG_PROVE_RCU /* @@ -935,6 +588,8 @@ late_initcall(rcu_verify_early_boot_tests); void rcu_early_boot_tests(void) {} #endif /* CONFIG_PROVE_RCU */ +#include "tasks.h" + #ifndef CONFIG_TINY_RCU /* diff --git a/kernel/reboot.c b/kernel/reboot.c index c4d472b7f1b4..e7b78d5ae1ab 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -26,7 +26,7 @@ int C_A_D = 1; struct pid *cad_pid; EXPORT_SYMBOL(cad_pid); -#if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32) +#if defined(CONFIG_ARM) #define DEFAULT_REBOOT_MODE = REBOOT_HARD #else #define DEFAULT_REBOOT_MODE @@ -250,7 +250,7 @@ void kernel_restart(char *cmd) pr_emerg("Restarting system\n"); else pr_emerg("Restarting system with command '%s'\n", cmd); - kmsg_dump(KMSG_DUMP_RESTART); + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_restart(cmd); } EXPORT_SYMBOL_GPL(kernel_restart); @@ -274,7 +274,7 @@ void kernel_halt(void) migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("System halted\n"); - kmsg_dump(KMSG_DUMP_HALT); + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_halt(); } EXPORT_SYMBOL_GPL(kernel_halt); @@ -292,7 +292,7 @@ void kernel_power_off(void) migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("Power down\n"); - kmsg_dump(KMSG_DUMP_POWEROFF); + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_power_off(); } EXPORT_SYMBOL_GPL(kernel_power_off); diff --git a/kernel/relay.c b/kernel/relay.c index ade14fb7ce2e..72fe443ea78f 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1,7 +1,7 @@ /* * Public API and common code for kernel->userspace relay file support. * - * See Documentation/filesystems/relay.txt for an overview. + * See Documentation/filesystems/relay.rst for an overview. * * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) @@ -91,7 +91,7 @@ static void relay_free_page_array(struct page **array) * * Returns 0 if ok, negative on error * - * Caller should already have grabbed mmap_sem. + * Caller should already have grabbed mmap_lock. */ static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) { @@ -581,6 +581,11 @@ struct rchan *relay_open(const char *base_filename, return NULL; chan->buf = alloc_percpu(struct rchan_buf *); + if (!chan->buf) { + kfree(chan); + return NULL; + } + chan->version = RELAYFS_CHANNEL_VERSION; chan->n_subbufs = n_subbufs; chan->subbuf_size = subbuf_size; @@ -991,14 +996,14 @@ static void relay_file_read_consume(struct rchan_buf *buf, /* * relay_file_read_avail - boolean, are there unconsumed bytes available? */ -static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) +static int relay_file_read_avail(struct rchan_buf *buf) { size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t produced = buf->subbufs_produced; size_t consumed = buf->subbufs_consumed; - relay_file_read_consume(buf, read_pos, 0); + relay_file_read_consume(buf, 0, 0); consumed = buf->subbufs_consumed; @@ -1059,23 +1064,20 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos, /** * relay_file_read_start_pos - find the first available byte to read - * @read_pos: file read position * @buf: relay channel buffer * - * If the @read_pos is in the middle of padding, return the + * If the read_pos is in the middle of padding, return the * position of the first actually available byte, otherwise * return the original value. */ -static size_t relay_file_read_start_pos(size_t read_pos, - struct rchan_buf *buf) +static size_t relay_file_read_start_pos(struct rchan_buf *buf) { size_t read_subbuf, padding, padding_start, padding_end; size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t consumed = buf->subbufs_consumed % n_subbufs; + size_t read_pos = consumed * subbuf_size + buf->bytes_consumed; - if (!read_pos) - read_pos = consumed * subbuf_size + buf->bytes_consumed; read_subbuf = read_pos / subbuf_size; padding = buf->padding[read_subbuf]; padding_start = (read_subbuf + 1) * subbuf_size - padding; @@ -1131,10 +1133,10 @@ static ssize_t relay_file_read(struct file *filp, do { void *from; - if (!relay_file_read_avail(buf, *ppos)) + if (!relay_file_read_avail(buf)) break; - read_start = relay_file_read_start_pos(*ppos, buf); + read_start = relay_file_read_start_pos(buf); avail = relay_file_read_subbuf_avail(read_start, buf); if (!avail) break; @@ -1177,10 +1179,9 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe, } static const struct pipe_buf_operations relay_pipe_buf_ops = { - .confirm = generic_pipe_buf_confirm, - .release = relay_pipe_buf_release, - .steal = generic_pipe_buf_steal, - .get = generic_pipe_buf_get, + .release = relay_pipe_buf_release, + .try_steal = generic_pipe_buf_try_steal, + .get = generic_pipe_buf_get, }; static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) diff --git a/kernel/resource.c b/kernel/resource.c index 76036a41143b..841737bbda9e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1126,6 +1126,7 @@ struct resource * __request_region(struct resource *parent, { DECLARE_WAITQUEUE(wait, current); struct resource *res = alloc_resource(GFP_KERNEL); + struct resource *orig_parent = parent; if (!res) return NULL; @@ -1176,6 +1177,10 @@ struct resource * __request_region(struct resource *parent, break; } write_unlock(&resource_lock); + + if (res && orig_parent == &iomem_resource) + revoke_devmem(res); + return res; } EXPORT_SYMBOL(__request_region); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 21fb5a5662b5..5fc9c9b70862 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -7,6 +7,12 @@ endif # that is not a function of syscall inputs. E.g. involuntary context switches. KCOV_INSTRUMENT := n +# There are numerous data races here, however, most of them are due to plain accesses. +# This would make it even harder for syzbot to find reproducers, because these +# bugs trigger without specific input. Disable by default, but should re-enable +# eventually. +KCSAN_SANITIZE := n + ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is # needed for x86 only. Why this used to be enabled for all architectures is beyond diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9a2fbf98fd6f..84758f34cdb0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6,11 +6,16 @@ * * Copyright (C) 1991-2002 Linus Torvalds */ +#define CREATE_TRACE_POINTS +#include <trace/events/sched.h> +#undef CREATE_TRACE_POINTS + #include "sched.h" #include <linux/nospec.h> #include <linux/kcov.h> +#include <linux/scs.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -20,9 +25,7 @@ #include "../smpboot.h" #include "pelt.h" - -#define CREATE_TRACE_POINTS -#include <trace/events/sched.h> +#include "smp.h" /* * Export tracepoints that act as a bare tracehook (ie: have no trace event @@ -34,6 +37,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -73,6 +79,100 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; + +/* + * Serialization rules: + * + * Lock order: + * + * p->pi_lock + * rq->lock + * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls) + * + * rq1->lock + * rq2->lock where: rq1 < rq2 + * + * Regular state: + * + * Normal scheduling state is serialized by rq->lock. __schedule() takes the + * local CPU's rq->lock, it optionally removes the task from the runqueue and + * always looks at the local rq data structures to find the most elegible task + * to run next. + * + * Task enqueue is also under rq->lock, possibly taken from another CPU. + * Wakeups from another LLC domain might use an IPI to transfer the enqueue to + * the local CPU to avoid bouncing the runqueue state around [ see + * ttwu_queue_wakelist() ] + * + * Task wakeup, specifically wakeups that involve migration, are horribly + * complicated to avoid having to take two rq->locks. + * + * Special state: + * + * System-calls and anything external will use task_rq_lock() which acquires + * both p->pi_lock and rq->lock. As a consequence the state they change is + * stable while holding either lock: + * + * - sched_setaffinity()/ + * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed + * - set_user_nice(): p->se.load, p->*prio + * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio, + * p->se.load, p->rt_priority, + * p->dl.dl_{runtime, deadline, period, flags, bw, density} + * - sched_setnuma(): p->numa_preferred_nid + * - sched_move_task()/ + * cpu_cgroup_fork(): p->sched_task_group + * - uclamp_update_active() p->uclamp* + * + * p->state <- TASK_*: + * + * is changed locklessly using set_current_state(), __set_current_state() or + * set_special_state(), see their respective comments, or by + * try_to_wake_up(). This latter uses p->pi_lock to serialize against + * concurrent self. + * + * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: + * + * is set by activate_task() and cleared by deactivate_task(), under + * rq->lock. Non-zero indicates the task is runnable, the special + * ON_RQ_MIGRATING state is used for migration without holding both + * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). + * + * p->on_cpu <- { 0, 1 }: + * + * is set by prepare_task() and cleared by finish_task() such that it will be + * set before p is scheduled-in and cleared after p is scheduled-out, both + * under rq->lock. Non-zero indicates the task is running on its CPU. + * + * [ The astute reader will observe that it is possible for two tasks on one + * CPU to have ->on_cpu = 1 at the same time. ] + * + * task_cpu(p): is changed by set_task_cpu(), the rules are: + * + * - Don't call set_task_cpu() on a blocked task: + * + * We don't care what CPU we're not running on, this simplifies hotplug, + * the CPU assignment of blocked tasks isn't required to be valid. + * + * - for try_to_wake_up(), called under p->pi_lock: + * + * This allows try_to_wake_up() to only take one rq->lock, see its comment. + * + * - for migration called under rq->lock: + * [ see task_on_rq_migrating() in task_rq_lock() ] + * + * o move_queued_task() + * o detach_task() + * + * - for migration called under double_rq_lock(): + * + * o __migrate_swap_task() + * o push_rt_task() / pull_rt_task() + * o push_dl_task() / pull_dl_task() + * o dl_task_offline_migration() + * + */ + /* * __task_rq_lock - lock the rq @p resides on. */ @@ -219,6 +319,13 @@ void update_rq_clock(struct rq *rq) update_rq_clock_task(rq, delta); } +static inline void +rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func) +{ + csd->flags = 0; + csd->func = func; + csd->info = rq; +} #ifdef CONFIG_SCHED_HRTICK /* @@ -314,16 +421,14 @@ void hrtick_start(struct rq *rq, u64 delay) hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL_PINNED_HARD); } + #endif /* CONFIG_SMP */ static void hrtick_rq_init(struct rq *rq) { #ifdef CONFIG_SMP - rq->hrtick_csd.flags = 0; - rq->hrtick_csd.func = __hrtick_start; - rq->hrtick_csd.info = rq; + rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start); #endif - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); rq->hrtick_timer.function = hrtick; } @@ -632,29 +737,23 @@ void wake_up_nohz_cpu(int cpu) wake_up_idle_cpu(cpu); } -static inline bool got_nohz_idle_kick(void) +static void nohz_csd_func(void *info) { - int cpu = smp_processor_id(); - - if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK)) - return false; - - if (idle_cpu(cpu) && !need_resched()) - return true; + struct rq *rq = info; + int cpu = cpu_of(rq); + unsigned int flags; /* - * We can't run Idle Load Balance on this CPU for this time so we - * cancel it and clear NOHZ_BALANCE_KICK + * Release the rq::nohz_csd. */ - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); - return false; -} - -#else /* CONFIG_NO_HZ_COMMON */ + flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); + WARN_ON(!(flags & NOHZ_KICK_MASK)); -static inline bool got_nohz_idle_kick(void) -{ - return false; + rq->idle_balance = idle_cpu(cpu); + if (rq->idle_balance && !need_resched()) { + rq->nohz_idle_balance = flags; + raise_softirq_irqoff(SCHED_SOFTIRQ); + } } #endif /* CONFIG_NO_HZ_COMMON */ @@ -790,9 +889,46 @@ unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; /* Max allowed maximum utilization */ unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; +/* + * By default RT tasks run at the maximum performance point/capacity of the + * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to + * SCHED_CAPACITY_SCALE. + * + * This knob allows admins to change the default behavior when uclamp is being + * used. In battery powered devices, particularly, running at the maximum + * capacity and frequency will increase energy consumption and shorten the + * battery life. + * + * This knob only affects RT tasks that their uclamp_se->user_defined == false. + * + * This knob will not override the system default sched_util_clamp_min defined + * above. + */ +unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; + /* All clamps are required to be less or equal than these values */ static struct uclamp_se uclamp_default[UCLAMP_CNT]; +/* + * This static key is used to reduce the uclamp overhead in the fast path. It + * primarily disables the call to uclamp_rq_{inc, dec}() in + * enqueue/dequeue_task(). + * + * This allows users to continue to enable uclamp in their kernel config with + * minimum uclamp overhead in the fast path. + * + * As soon as userspace modifies any of the uclamp knobs, the static key is + * enabled, since we have an actual users that make use of uclamp + * functionality. + * + * The knobs that would enable this static key are: + * + * * A task modifying its uclamp value with sched_setattr(). + * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs. + * * An admin modifying the cgroup cpu.uclamp.{min, max} + */ +DEFINE_STATIC_KEY_FALSE(sched_uclamp_used); + /* Integer rounded range for each bucket */ #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) @@ -872,6 +1008,64 @@ unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, return uclamp_idle_value(rq, clamp_id, clamp_value); } +static void __uclamp_update_util_min_rt_default(struct task_struct *p) +{ + unsigned int default_util_min; + struct uclamp_se *uc_se; + + lockdep_assert_held(&p->pi_lock); + + uc_se = &p->uclamp_req[UCLAMP_MIN]; + + /* Only sync if user didn't override the default */ + if (uc_se->user_defined) + return; + + default_util_min = sysctl_sched_uclamp_util_min_rt_default; + uclamp_se_set(uc_se, default_util_min, false); +} + +static void uclamp_update_util_min_rt_default(struct task_struct *p) +{ + struct rq_flags rf; + struct rq *rq; + + if (!rt_task(p)) + return; + + /* Protect updates to p->uclamp_* */ + rq = task_rq_lock(p, &rf); + __uclamp_update_util_min_rt_default(p); + task_rq_unlock(rq, p, &rf); +} + +static void uclamp_sync_util_min_rt_default(void) +{ + struct task_struct *g, *p; + + /* + * copy_process() sysctl_uclamp + * uclamp_min_rt = X; + * write_lock(&tasklist_lock) read_lock(&tasklist_lock) + * // link thread smp_mb__after_spinlock() + * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock); + * sched_post_fork() for_each_process_thread() + * __uclamp_sync_rt() __uclamp_sync_rt() + * + * Ensures that either sched_post_fork() will observe the new + * uclamp_min_rt or for_each_process_thread() will observe the new + * task. + */ + read_lock(&tasklist_lock); + smp_mb__after_spinlock(); + read_unlock(&tasklist_lock); + + rcu_read_lock(); + for_each_process_thread(g, p) + uclamp_update_util_min_rt_default(p); + rcu_read_unlock(); +} + static inline struct uclamp_se uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) { @@ -989,10 +1183,38 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, lockdep_assert_held(&rq->lock); + /* + * If sched_uclamp_used was enabled after task @p was enqueued, + * we could end up with unbalanced call to uclamp_rq_dec_id(). + * + * In this case the uc_se->active flag should be false since no uclamp + * accounting was performed at enqueue time and we can just return + * here. + * + * Need to be careful of the following enqeueue/dequeue ordering + * problem too + * + * enqueue(taskA) + * // sched_uclamp_used gets enabled + * enqueue(taskB) + * dequeue(taskA) + * // Must not decrement bukcet->tasks here + * dequeue(taskB) + * + * where we could end up with stale data in uc_se and + * bucket[uc_se->bucket_id]. + * + * The following check here eliminates the possibility of such race. + */ + if (unlikely(!uc_se->active)) + return; + bucket = &uc_rq->bucket[uc_se->bucket_id]; + SCHED_WARN_ON(!bucket->tasks); if (likely(bucket->tasks)) bucket->tasks--; + uc_se->active = false; /* @@ -1020,6 +1242,15 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { enum uclamp_id clamp_id; + /* + * Avoid any overhead until uclamp is actually used by the userspace. + * + * The condition is constructed such that a NOP is generated when + * sched_uclamp_used is disabled. + */ + if (!static_branch_unlikely(&sched_uclamp_used)) + return; + if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -1035,6 +1266,15 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { enum uclamp_id clamp_id; + /* + * Avoid any overhead until uclamp is actually used by the userspace. + * + * The condition is constructed such that a NOP is generated when + * sched_uclamp_used is disabled. + */ + if (!static_branch_unlikely(&sched_uclamp_used)) + return; + if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -1110,16 +1350,16 @@ static void uclamp_update_root_tg(void) { } #endif int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; - int old_min, old_max; + int old_min, old_max, old_min_rt; int result; mutex_lock(&uclamp_mutex); old_min = sysctl_sched_uclamp_util_min; old_max = sysctl_sched_uclamp_util_max; + old_min_rt = sysctl_sched_uclamp_util_min_rt_default; result = proc_dointvec(table, write, buffer, lenp, ppos); if (result) @@ -1128,7 +1368,9 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, goto done; if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max || - sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) { + sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE || + sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) { + result = -EINVAL; goto undo; } @@ -1144,8 +1386,15 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, update_root_tg = true; } - if (update_root_tg) + if (update_root_tg) { + static_branch_enable(&sched_uclamp_used); uclamp_update_root_tg(); + } + + if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) { + static_branch_enable(&sched_uclamp_used); + uclamp_sync_util_min_rt_default(); + } /* * We update all RUNNABLE tasks only when task groups are in use. @@ -1158,6 +1407,7 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, undo: sysctl_sched_uclamp_util_min = old_min; sysctl_sched_uclamp_util_max = old_max; + sysctl_sched_uclamp_util_min_rt_default = old_min_rt; done: mutex_unlock(&uclamp_mutex); @@ -1180,6 +1430,15 @@ static int uclamp_validate(struct task_struct *p, if (upper_bound > SCHED_CAPACITY_SCALE) return -EINVAL; + /* + * We have valid uclamp attributes; make sure uclamp is enabled. + * + * We need to do that here, because enabling static branches is a + * blocking operation which obviously cannot be done while holding + * scheduler locks. + */ + static_branch_enable(&sched_uclamp_used); + return 0; } @@ -1194,17 +1453,20 @@ static void __setscheduler_uclamp(struct task_struct *p, */ for_each_clamp_id(clamp_id) { struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; - unsigned int clamp_value = uclamp_none(clamp_id); /* Keep using defined clamps across class changes */ if (uc_se->user_defined) continue; - /* By default, RT tasks always get 100% boost */ + /* + * RT by default have a 100% boost value that could be modified + * at runtime. + */ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) - clamp_value = uclamp_none(UCLAMP_MAX); + __uclamp_update_util_min_rt_default(p); + else + uclamp_se_set(uc_se, uclamp_none(clamp_id), false); - uclamp_se_set(uc_se, clamp_value, false); } if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) @@ -1225,6 +1487,10 @@ static void uclamp_fork(struct task_struct *p) { enum uclamp_id clamp_id; + /* + * We don't need to hold task_rq_lock() when updating p->uclamp_* here + * as the task is still at its early fork stages. + */ for_each_clamp_id(clamp_id) p->uclamp[clamp_id].active = false; @@ -1237,19 +1503,33 @@ static void uclamp_fork(struct task_struct *p) } } +static void uclamp_post_fork(struct task_struct *p) +{ + uclamp_update_util_min_rt_default(p); +} + +static void __init init_uclamp_rq(struct rq *rq) +{ + enum uclamp_id clamp_id; + struct uclamp_rq *uc_rq = rq->uclamp; + + for_each_clamp_id(clamp_id) { + uc_rq[clamp_id] = (struct uclamp_rq) { + .value = uclamp_none(clamp_id) + }; + } + + rq->uclamp_flags = 0; +} + static void __init init_uclamp(void) { struct uclamp_se uc_max = {}; enum uclamp_id clamp_id; int cpu; - mutex_init(&uclamp_mutex); - - for_each_possible_cpu(cpu) { - memset(&cpu_rq(cpu)->uclamp, 0, - sizeof(struct uclamp_rq)*UCLAMP_CNT); - cpu_rq(cpu)->uclamp_flags = 0; - } + for_each_possible_cpu(cpu) + init_uclamp_rq(cpu_rq(cpu)); for_each_clamp_id(clamp_id) { uclamp_se_set(&init_task.uclamp_req[clamp_id], @@ -1278,6 +1558,7 @@ static inline int uclamp_validate(struct task_struct *p, static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { } static inline void uclamp_fork(struct task_struct *p) { } +static inline void uclamp_post_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } #endif /* CONFIG_UCLAMP_TASK */ @@ -1311,9 +1592,6 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) void activate_task(struct rq *rq, struct task_struct *p, int flags) { - if (task_contributes_to_load(p)) - rq->nr_uninterruptible--; - enqueue_task(rq, p, flags); p->on_rq = TASK_ON_RQ_QUEUED; @@ -1323,9 +1601,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; - if (task_contributes_to_load(p)) - rq->nr_uninterruptible++; - dequeue_task(rq, p, flags); } @@ -1410,20 +1685,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { - const struct sched_class *class; - - if (p->sched_class == rq->curr->sched_class) { + if (p->sched_class == rq->curr->sched_class) rq->curr->sched_class->check_preempt_curr(rq, p, flags); - } else { - for_each_class(class) { - if (class == rq->curr->sched_class) - break; - if (class == p->sched_class) { - resched_curr(rq); - break; - } - } - } + else if (p->sched_class > rq->curr->sched_class) + resched_curr(rq); /* * A queue event has occurred, and we're going to schedule. In @@ -1474,8 +1739,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, { lockdep_assert_held(&rq->lock); - WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); - dequeue_task(rq, p, DEQUEUE_NOCLOCK); + deactivate_task(rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, new_cpu); rq_unlock(rq, rf); @@ -1483,8 +1747,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, rq_lock(rq, rf); BUG_ON(task_cpu(p) != new_cpu); - enqueue_task(rq, p, 0); - p->on_rq = TASK_ON_RQ_QUEUED; + activate_task(rq, p, 0); check_preempt_curr(rq, p, 0); return rq; @@ -1539,7 +1802,7 @@ static int migration_cpu_stop(void *data) * __migrate_task() such that we will not miss enforcing cpus_ptr * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. */ - sched_ttwu_pending(); + flush_smp_call_function_from_idle(); raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); @@ -1637,7 +1900,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, goto out; } - if (cpumask_equal(p->cpus_ptr, new_mask)) + if (cpumask_equal(&p->cpus_mask, new_mask)) goto out; /* @@ -2236,10 +2499,10 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, lockdep_assert_held(&rq->lock); -#ifdef CONFIG_SMP if (p->sched_contributes_to_load) rq->nr_uninterruptible--; +#ifdef CONFIG_SMP if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; #endif @@ -2249,12 +2512,31 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, } /* - * Called in case the task @p isn't fully descheduled from its runqueue, - * in this case we must do a remote wakeup. Its a 'light' wakeup though, - * since all we need to do is flip p->state to TASK_RUNNING, since - * the task is still ->on_rq. + * Consider @p being inside a wait loop: + * + * for (;;) { + * set_current_state(TASK_UNINTERRUPTIBLE); + * + * if (CONDITION) + * break; + * + * schedule(); + * } + * __set_current_state(TASK_RUNNING); + * + * between set_current_state() and schedule(). In this case @p is still + * runnable, so all that needs doing is change p->state back to TASK_RUNNING in + * an atomic manner. + * + * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq + * then schedule() must still happen and p->state can be changed to + * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we + * need to do a full wakeup with enqueue. + * + * Returns: %true when the wakeup is done, + * %false otherwise. */ -static int ttwu_remote(struct task_struct *p, int wake_flags) +static int ttwu_runnable(struct task_struct *p, int wake_flags) { struct rq_flags rf; struct rq *rq; @@ -2273,75 +2555,63 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) } #ifdef CONFIG_SMP -void sched_ttwu_pending(void) +void sched_ttwu_pending(void *arg) { + struct llist_node *llist = arg; struct rq *rq = this_rq(); - struct llist_node *llist = llist_del_all(&rq->wake_list); struct task_struct *p, *t; struct rq_flags rf; if (!llist) return; + /* + * rq::ttwu_pending racy indication of out-standing wakeups. + * Races such that false-negatives are possible, since they + * are shorter lived that false-positives would be. + */ + WRITE_ONCE(rq->ttwu_pending, 0); + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - llist_for_each_entry_safe(p, t, llist, wake_entry) + llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { + if (WARN_ON_ONCE(p->on_cpu)) + smp_cond_load_acquire(&p->on_cpu, !VAL); + + if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) + set_task_cpu(p, cpu_of(rq)); + ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); + } rq_unlock_irqrestore(rq, &rf); } -void scheduler_ipi(void) +void send_call_function_single_ipi(int cpu) { - /* - * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting - * TIF_NEED_RESCHED remotely (for the first time) will also send - * this IPI. - */ - preempt_fold_need_resched(); - - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) - return; - - /* - * Not all reschedule IPI handlers call irq_enter/irq_exit, since - * traditionally all their work was done from the interrupt return - * path. Now that we actually do some work, we need to make sure - * we do call them. - * - * Some archs already do call them, luckily irq_enter/exit nest - * properly. - * - * Arguably we should visit all archs and update all handlers, - * however a fair share of IPIs are still resched only so this would - * somewhat pessimize the simple resched case. - */ - irq_enter(); - sched_ttwu_pending(); + struct rq *rq = cpu_rq(cpu); - /* - * Check if someone kicked us for doing the nohz idle load balance. - */ - if (unlikely(got_nohz_idle_kick())) { - this_rq()->idle_balance = 1; - raise_softirq_irqoff(SCHED_SOFTIRQ); - } - irq_exit(); + if (!set_nr_if_polling(rq->idle)) + arch_send_call_function_single_ipi(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); } -static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) +/* + * Queue a task on the target CPUs wake_list and wake the CPU via IPI if + * necessary. The wakee CPU on receipt of the IPI will queue the task + * via sched_ttwu_wakeup() for activation so the wakee incurs the cost + * of the wakeup instead of the waker. + */ +static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); - if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { - if (!set_nr_if_polling(rq->idle)) - smp_send_reschedule(cpu); - else - trace_sched_wake_idle_without_ipi(cpu); - } + WRITE_ONCE(rq->ttwu_pending, 1); + __smp_call_single_queue(cpu, &p->wake_entry.llist); } void wake_up_if_idle(int cpu) @@ -2372,6 +2642,49 @@ bool cpus_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } + +static inline bool ttwu_queue_cond(int cpu, int wake_flags) +{ + /* + * If the CPU does not share cache, then queue the task on the + * remote rqs wakelist to avoid accessing remote data. + */ + if (!cpus_share_cache(smp_processor_id(), cpu)) + return true; + + /* + * If the task is descheduling and the only running task on the + * CPU then use the wakelist to offload the task activation to + * the soon-to-be-idle CPU as the current CPU is likely busy. + * nr_running is checked to avoid unnecessary task stacking. + */ + if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) + return true; + + return false; +} + +static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) +{ + if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { + if (WARN_ON_ONCE(cpu == smp_processor_id())) + return false; + + sched_clock_cpu(cpu); /* Sync clocks across CPUs */ + __ttwu_queue_wakelist(p, cpu, wake_flags); + return true; + } + + return false; +} + +#else /* !CONFIG_SMP */ + +static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) +{ + return false; +} + #endif /* CONFIG_SMP */ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) @@ -2379,13 +2692,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) struct rq *rq = cpu_rq(cpu); struct rq_flags rf; -#if defined(CONFIG_SMP) - if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { - sched_clock_cpu(cpu); /* Sync clocks across CPUs */ - ttwu_queue_remote(p, cpu, wake_flags); + if (ttwu_queue_wakelist(p, cpu, wake_flags)) return; - } -#endif rq_lock(rq, &rf); update_rq_clock(rq); @@ -2441,8 +2749,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * migration. However the means are completely different as there is no lock * chain to provide order. Instead we do: * - * 1) smp_store_release(X->on_cpu, 0) - * 2) smp_cond_load_acquire(!X->on_cpu) + * 1) smp_store_release(X->on_cpu, 0) -- finish_task() + * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up() * * Example: * @@ -2482,15 +2790,33 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * @state: the mask of task states that can be woken * @wake_flags: wake modifier flags (WF_*) * - * If (@state & @p->state) @p->state = TASK_RUNNING. + * Conceptually does: + * + * If (@state & @p->state) @p->state = TASK_RUNNING. * * If the task was not queued/runnable, also place it back on a runqueue. * - * Atomic against schedule() which would dequeue a task, also see - * set_current_state(). + * This function is atomic against schedule() which would dequeue the task. + * + * It issues a full memory barrier before accessing @p->state, see the comment + * with set_current_state(). + * + * Uses p->pi_lock to serialize against concurrent wake-ups. * - * This function executes a full memory barrier before accessing the task - * state; see set_current_state(). + * Relies on p->pi_lock stabilizing: + * - p->sched_class + * - p->cpus_ptr + * - p->sched_task_group + * in order to do migration, see its use of select_task_rq()/set_task_cpu(). + * + * Tries really hard to only take one task_rq(p)->lock for performance. + * Takes rq->lock in: + * - ttwu_runnable() -- old rq, unavoidable, see comment there; + * - ttwu_queue() -- new rq, for enqueue of the task; + * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us. + * + * As a consequence we race really badly with just about everything. See the + * many memory barriers and their comments for details. * * Return: %true if @p->state changes (an actual wakeup was done), * %false otherwise. @@ -2506,7 +2832,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) * == smp_processor_id()'. Together this means we can special - * case the whole 'p->on_rq && ttwu_remote()' case below + * case the whole 'p->on_rq && ttwu_runnable()' case below * without taking any locks. * * In particular: @@ -2518,7 +2844,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) goto out; success = 1; - cpu = task_cpu(p); trace_sched_waking(p); p->state = TASK_RUNNING; trace_sched_wakeup(p); @@ -2528,8 +2853,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* * If we are going to wake up a thread waiting for CONDITION we * need to ensure that CONDITION=1 done by the caller can not be - * reordered with p->state check below. This pairs with mb() in - * set_current_state() the waiting thread does. + * reordered with p->state check below. This pairs with smp_store_mb() + * in set_current_state() that the waiting thread does. */ raw_spin_lock_irqsave(&p->pi_lock, flags); smp_mb__after_spinlock(); @@ -2540,7 +2865,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* We're going to change ->state: */ success = 1; - cpu = task_cpu(p); /* * Ensure we load p->on_rq _after_ p->state, otherwise it would @@ -2561,11 +2885,18 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). */ smp_rmb(); - if (p->on_rq && ttwu_remote(p, wake_flags)) + if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) goto unlock; + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } + #ifdef CONFIG_SMP /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be @@ -2585,8 +2916,43 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * Form a control-dep-acquire with p->on_rq == 0 above, to ensure + * schedule()'s deactivate_task() has 'happened' and p will no longer + * care about it's own p->state. See the comment in __schedule(). */ - smp_rmb(); + smp_acquire__after_ctrl_dep(); + + /* + * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq + * == 0), which means we need to do an enqueue, change p->state to + * TASK_WAKING such that we can unlock p->pi_lock before doing the + * enqueue, such as ttwu_queue_wakelist(). + */ + p->state = TASK_WAKING; + + /* + * If the owning (remote) CPU is still in the middle of schedule() with + * this task as prev, considering queueing p on the remote CPUs wake_list + * which potentially sends an IPI instead of spinning on p->on_cpu to + * let the waker make forward progress. This is safe because IRQs are + * disabled and the IPI will deliver after on_cpu is cleared. + * + * Ensure we load task_cpu(p) after p->on_cpu: + * + * set_task_cpu(p, cpu); + * STORE p->cpu = @cpu + * __schedule() (switch to task 'p') + * LOCK rq->lock + * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) + * STORE p->on_cpu = 1 LOAD p->cpu + * + * to ensure we observe the correct CPU on which the task is currently + * scheduling. + */ + if (smp_load_acquire(&p->on_cpu) && + ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) + goto unlock; /* * If the owning (remote) CPU is still in the middle of schedule() with @@ -2599,28 +2965,14 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_cond_load_acquire(&p->on_cpu, !VAL); - p->sched_contributes_to_load = !!task_contributes_to_load(p); - p->state = TASK_WAKING; - - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } - -#else /* CONFIG_SMP */ - - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } - +#else + cpu = task_cpu(p); #endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); @@ -2628,13 +2980,59 @@ unlock: raw_spin_unlock_irqrestore(&p->pi_lock, flags); out: if (success) - ttwu_stat(p, cpu, wake_flags); + ttwu_stat(p, task_cpu(p), wake_flags); preempt_enable(); return success; } /** + * try_invoke_on_locked_down_task - Invoke a function on task in fixed state + * @p: Process for which the function is to be invoked. + * @func: Function to invoke. + * @arg: Argument to function. + * + * If the specified task can be quickly locked into a definite state + * (either sleeping or on a given runqueue), arrange to keep it in that + * state while invoking @func(@arg). This function can use ->on_rq and + * task_curr() to work out what the state is, if required. Given that + * @func can be invoked with a runqueue lock held, it had better be quite + * lightweight. + * + * Returns: + * @false if the task slipped out from under the locks. + * @true if the task was locked onto a runqueue or is sleeping. + * However, @func can override this by returning @false. + */ +bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) +{ + bool ret = false; + struct rq_flags rf; + struct rq *rq; + + lockdep_assert_irqs_enabled(); + raw_spin_lock_irq(&p->pi_lock); + if (p->on_rq) { + rq = __task_rq_lock(p, &rf); + if (task_rq(p) == rq) + ret = func(p, arg); + rq_unlock(rq, &rf); + } else { + switch (p->state) { + case TASK_RUNNING: + case TASK_WAKING: + break; + default: + smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). + if (!p->on_rq) + ret = func(p, arg); + } + } + raw_spin_unlock_irq(&p->pi_lock); + return ret; +} + +/** * wake_up_process - Wake up a specific process * @p: The process to be woken up. * @@ -2702,6 +3100,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->capture_control = NULL; #endif init_numa_balancing(clone_flags, p); +#ifdef CONFIG_SMP + p->wake_entry.u_flags = CSD_TYPE_TTWU; +#endif } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -2718,7 +3119,7 @@ void set_numabalancing_state(bool enabled) #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; @@ -2792,8 +3193,8 @@ static void __init init_schedstats(void) } #ifdef CONFIG_PROC_SYSCTL -int sysctl_schedstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; @@ -2876,6 +3277,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * Silence PROVE_RCU. */ raw_spin_lock_irqsave(&p->pi_lock, flags); + rseq_migrate(p); /* * We're setting the CPU for the first time, we don't migrate, * so use __set_task_cpu(). @@ -2900,6 +3302,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } +void sched_post_fork(struct task_struct *p) +{ + uclamp_post_fork(p); +} + unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) @@ -2940,6 +3347,7 @@ void wake_up_new_task(struct task_struct *p) * as we're not fully set-up yet. */ p->recent_used_cpu = task_cpu(p); + rseq_migrate(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); @@ -3056,8 +3464,10 @@ static inline void prepare_task(struct task_struct *next) /* * Claim the task as running, we do this before switching to it * such that any running task will have this set. + * + * See the ttwu() WF_ON_CPU case and its ordering comment. */ - next->on_cpu = 1; + WRITE_ONCE(next->on_cpu, 1); #endif } @@ -3065,8 +3475,9 @@ static inline void finish_task(struct task_struct *prev) { #ifdef CONFIG_SMP /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely + * This must be the very last reference to @prev from this CPU. After + * p->on_cpu is cleared, the task can be moved to a different CPU. We + * must ensure this doesn't happen until the switch is completely * finished. * * In particular, the load of prev->state in finish_task_switch() must @@ -3565,17 +3976,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -DEFINE_PER_CPU(unsigned long, thermal_pressure); - -void arch_set_thermal_pressure(struct cpumask *cpus, - unsigned long th_pressure) -{ - int cpu; - - for_each_cpu(cpu, cpus) - WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); -} - /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -3859,8 +4259,7 @@ static noinline void __schedule_bug(struct task_struct *prev) if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && in_atomic_preempt_off()) { pr_err("Preemption disabled at:"); - print_ip_sym(preempt_disable_ip); - pr_cont("\n"); + print_ip_sym(KERN_ERR, preempt_disable_ip); } if (panic_on_warn) panic("scheduling while atomic\n"); @@ -3877,6 +4276,9 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) #ifdef CONFIG_SCHED_STACK_END_CHECK if (task_stack_end_corrupted(prev)) panic("corrupted stack end detected inside scheduler\n"); + + if (task_scs_end_corrupted(prev)) + panic("corrupted shadow stack detected inside scheduler\n"); #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP @@ -3899,6 +4301,28 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) schedstat_inc(this_rq()->sched_count); } +static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) +{ +#ifdef CONFIG_SMP + const struct sched_class *class; + /* + * We must do the balancing pass before put_prev_task(), such + * that when we release the rq->lock the task is in the same + * state as before we took rq->lock. + * + * We can terminate the balance pass as soon as we know there is + * a runnable task of @class priority or higher. + */ + for_class_range(class, prev->sched_class, &idle_sched_class) { + if (class->balance(rq, prev, rf)) + break; + } +#endif + + put_prev_task(rq, prev); +} + /* * Pick up the highest-prio task: */ @@ -3914,8 +4338,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * higher scheduling class, because otherwise those loose the * opportunity to pull in more work from other CPUs. */ - if (likely((prev->sched_class == &idle_sched_class || - prev->sched_class == &fair_sched_class) && + if (likely(prev->sched_class <= &fair_sched_class && rq->nr_running == rq->cfs.h_nr_running)) { p = pick_next_task_fair(rq, prev, rf); @@ -3932,22 +4355,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } restart: -#ifdef CONFIG_SMP - /* - * We must do the balancing pass before put_next_task(), such - * that when we release the rq->lock the task is in the same - * state as before we took rq->lock. - * - * We can terminate the balance pass as soon as we know there is - * a runnable task of @class priority or higher. - */ - for_class_range(class, prev->sched_class, &idle_sched_class) { - if (class->balance(rq, prev, rf)) - break; - } -#endif - - put_prev_task(rq, prev); + put_prev_task_balance(rq, prev, rf); for_each_class(class) { p = class->pick_next_task(rq); @@ -4002,6 +4410,7 @@ static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; + unsigned long prev_state; struct rq_flags rf; struct rq *rq; int cpu; @@ -4021,9 +4430,16 @@ static void __sched notrace __schedule(bool preempt) /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) - * done by the caller to avoid the race with signal_wake_up(). + * done by the caller to avoid the race with signal_wake_up(): + * + * __set_current_state(@state) signal_wake_up() + * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) + * wake_up_state(p, state) + * LOCK rq->lock LOCK p->pi_state + * smp_mb__after_spinlock() smp_mb__after_spinlock() + * if (signal_pending_state()) if (p->state & @state) * - * The membarrier system call requires a full memory barrier + * Also, the membarrier system call requires a full memory barrier * after coming from user-space, before storing to rq->curr. */ rq_lock(rq, &rf); @@ -4034,10 +4450,38 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq); switch_count = &prev->nivcsw; - if (!preempt && prev->state) { - if (signal_pending_state(prev->state, prev)) { + + /* + * We must load prev->state once (task_struct::state is volatile), such + * that: + * + * - we form a control dependency vs deactivate_task() below. + * - ptrace_{,un}freeze_traced() can change ->state underneath us. + */ + prev_state = prev->state; + if (!preempt && prev_state) { + if (signal_pending_state(prev_state, prev)) { prev->state = TASK_RUNNING; } else { + prev->sched_contributes_to_load = + (prev_state & TASK_UNINTERRUPTIBLE) && + !(prev_state & TASK_NOLOAD) && + !(prev->flags & PF_FROZEN); + + if (prev->sched_contributes_to_load) + rq->nr_uninterruptible++; + + /* + * __schedule() ttwu() + * prev_state = prev->state; if (p->on_rq && ...) + * if (prev_state) goto out; + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); + * p->state = TASK_WAKING + * + * Where __schedule() and ttwu() have matching control dependencies. + * + * After this, schedule() must not care about p->state any more. + */ deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); if (prev->in_iowait) { @@ -4349,6 +4793,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { + WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); @@ -4461,7 +4906,8 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) */ if (dl_prio(prio)) { if (!dl_prio(p->normal_prio) || - (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { + (pi_task && dl_prio(pi_task->prio) && + dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; queue_flag |= ENQUEUE_REPLENISH; } else @@ -4637,7 +5083,7 @@ int idle_cpu(int cpu) return 0; #ifdef CONFIG_SMP - if (!llist_empty(&rq->wake_list)) + if (rq->ttwu_pending) return 0; #endif @@ -5050,6 +5496,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy, * @policy: new policy. * @param: structure containing the new RT priority. * + * Use sched_set_fifo(), read its comment. + * * Return: 0 on success. An error code otherwise. * * NOTE that the task may be already dead. @@ -5059,13 +5507,11 @@ int sched_setscheduler(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, true); } -EXPORT_SYMBOL_GPL(sched_setscheduler); int sched_setattr(struct task_struct *p, const struct sched_attr *attr) { return __sched_setscheduler(p, attr, true, true); } -EXPORT_SYMBOL_GPL(sched_setattr); int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) { @@ -5090,7 +5536,51 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, false); } -EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); + +/* + * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally + * incapable of resource management, which is the one thing an OS really should + * be doing. + * + * This is of course the reason it is limited to privileged users only. + * + * Worse still; it is fundamentally impossible to compose static priority + * workloads. You cannot take two correctly working static prio workloads + * and smash them together and still expect them to work. + * + * For this reason 'all' FIFO tasks the kernel creates are basically at: + * + * MAX_RT_PRIO / 2 + * + * The administrator _MUST_ configure the system, the kernel simply doesn't + * know enough information to make a sensible choice. + */ +void sched_set_fifo(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_fifo); + +/* + * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. + */ +void sched_set_fifo_low(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = 1 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_fifo_low); + +void sched_set_normal(struct task_struct *p, int nice) +{ + struct sched_attr attr = { + .sched_policy = SCHED_NORMAL, + .sched_nice = nice, + }; + WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_normal); static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) @@ -5381,6 +5871,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, kattr.sched_nice = task_nice(p); #ifdef CONFIG_UCLAMP_TASK + /* + * This could race with another potential updater, but this is fine + * because it'll correctly read the old or the new value. We don't need + * to guarantee who wins the race as long as it doesn't return garbage. + */ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; #endif @@ -5738,7 +6233,7 @@ again: if (task_running(p_rq, p) || p->state) goto out_unlock; - yielded = curr->sched_class->yield_to_task(rq, p, preempt); + yielded = curr->sched_class->yield_to_task(rq, p); if (yielded) { schedstat_inc(rq->yld_count); /* @@ -5953,7 +6448,7 @@ void sched_show_task(struct task_struct *p) (unsigned long)task_thread_info(p)->flags); print_worker_info(KERN_INFO, p); - show_stack(p, NULL); + show_stack(p, NULL, KERN_INFO); put_task_stack(p); } EXPORT_SYMBOL_GPL(sched_show_task); @@ -6040,6 +6535,7 @@ void init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); idle->flags |= PF_IDLE; + scs_task_reset(idle); kasan_unpoison_task_stack(idle); #ifdef CONFIG_SMP @@ -6190,13 +6686,14 @@ void idle_task_exit(void) struct mm_struct *mm = current->active_mm; BUG_ON(cpu_online(smp_processor_id())); + BUG_ON(current != this_rq()->idle); if (mm != &init_mm) { switch_mm(mm, &init_mm, current); - current->active_mm = &init_mm; finish_arch_post_lock_switch(); } - mmdrop(mm); + + /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ } /* @@ -6486,7 +6983,6 @@ int sched_cpu_dying(unsigned int cpu) struct rq_flags rf; /* Handle pending wakeups and then migrate everything off */ - sched_ttwu_pending(); sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); @@ -6571,6 +7067,14 @@ void __init sched_init(void) unsigned long ptr = 0; int i; + /* Make sure the linker didn't screw up */ + BUG_ON(&idle_sched_class + 1 != &fair_sched_class || + &fair_sched_class + 1 != &rt_sched_class || + &rt_sched_class + 1 != &dl_sched_class); +#ifdef CONFIG_SMP + BUG_ON(&dl_sched_class + 1 != &stop_sched_class); +#endif + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -6589,6 +7093,8 @@ void __init sched_init(void) root_task_group.cfs_rq = (struct cfs_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); + root_task_group.shares = ROOT_TASK_GROUP_LOAD; + init_cfs_bandwidth(&root_task_group.cfs_bandwidth); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; @@ -6641,7 +7147,6 @@ void __init sched_init(void) init_rt_rq(&rq->rt); init_dl_rq(&rq->dl); #ifdef CONFIG_FAIR_GROUP_SCHED - root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; /* @@ -6663,7 +7168,6 @@ void __init sched_init(void) * We achieve this by letting root_task_group's tasks sit * directly in rq->cfs (i.e root_task_group->se[] = NULL). */ - init_cfs_bandwidth(&root_task_group.cfs_bandwidth); init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -6691,6 +7195,8 @@ void __init sched_init(void) #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; atomic_set(&rq->nohz_flags, 0); + + rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); #endif #endif /* CONFIG_SMP */ hrtick_rq_init(rq); @@ -6795,8 +7301,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && !preempt_count_equals(preempt_offset)) { pr_err("Preemption disabled at:"); - print_ip_sym(preempt_disable_ip); - pr_cont("\n"); + print_ip_sym(KERN_ERR, preempt_disable_ip); } dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); @@ -7291,6 +7796,8 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, if (req.ret) return req.ret; + static_branch_enable(&sched_uclamp_used); + mutex_lock(&uclamp_mutex); rcu_read_lock(); @@ -7385,6 +7892,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex); const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ +/* More than 203 days if BW_SHIFT equals 20. */ +static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC; static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); @@ -7413,6 +7922,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) return -EINVAL; /* + * Bound quota to defend quota against overflow during bandwidth shift. + */ + if (quota != RUNTIME_INF && quota > max_cfs_runtime) + return -EINVAL; + + /* * Prevent race between setting of cfs_rq->runtime_enabled and * unthrottle_offline_cfs_rqs(). */ @@ -7970,4 +8485,7 @@ const u32 sched_prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -#undef CREATE_TRACE_POINTS +void call_trace_sched_update_nr_running(struct rq *rq, int count) +{ + trace_sched_update_nr_running_tp(rq, count); +} diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9fbb10383434..941c28cf9738 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -5,6 +5,7 @@ * Based on the work by Paul Menage (menage@google.com) and Balbir Singh * (balbir@in.ibm.com). */ +#include <asm/irq_regs.h> #include "sched.h" /* Time spent by the tasks of the CPU accounting group executing in ... */ @@ -339,7 +340,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; int index = CPUACCT_STAT_SYSTEM; - struct pt_regs *regs = task_pt_regs(tsk); + struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk); if (regs && user_mode(regs)) index = CPUACCT_STAT_USER; @@ -347,7 +348,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) rcu_read_lock(); for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) - this_cpu_ptr(ca->cpuusage)->usages[index] += cputime; + __this_cpu_add(ca->cpuusage->usages[index], cputime); rcu_read_unlock(); } @@ -363,7 +364,7 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) rcu_read_lock(); for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca)) - this_cpu_ptr(ca->cpustat)->cpustat[index] += val; + __this_cpu_add(ca->cpustat->cpustat[index], val); rcu_read_unlock(); } diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5cc4012572ec..8cb06c8c7eb1 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -121,6 +121,30 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, if (later_mask && cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { + unsigned long cap, max_cap = 0; + int cpu, max_cpu = -1; + + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return 1; + + /* Ensure the capacity of the CPUs fits the task. */ + for_each_cpu(cpu, later_mask) { + if (!dl_task_fits_capacity(p, cpu)) { + cpumask_clear_cpu(cpu, later_mask); + + cap = capacity_orig_of(cpu); + + if (cap > max_cap || + (cpu == task_cpu(p) && cap == max_cap)) { + max_cap = cap; + max_cpu = cpu; + } + } + } + + if (cpumask_empty(later_mask)) + cpumask_set_cpu(max_cpu, later_mask); + return 1; } else { int best_cpu = cpudl_maximum(cp); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 7fbaee24c824..e39008242cf4 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -210,7 +210,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, unsigned long dl_util, util, irq; struct rq *rq = cpu_rq(cpu); - if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) && + if (!uclamp_is_used() && type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { return max; } @@ -909,11 +909,7 @@ struct cpufreq_governor *cpufreq_default_governor(void) } #endif -static int __init sugov_register(void) -{ - return cpufreq_register_governor(&schedutil_gov); -} -core_initcall(sugov_register); +cpufreq_governor_init(schedutil_gov); #ifdef CONFIG_ENERGY_MODEL extern bool sched_energy_update; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ff9435dee1df..5a55d2300452 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -520,50 +520,6 @@ void account_idle_ticks(unsigned long ticks) } /* - * Perform (stime * rtime) / total, but avoid multiplication overflow by - * losing precision when the numbers are big. - */ -static u64 scale_stime(u64 stime, u64 rtime, u64 total) -{ - u64 scaled; - - for (;;) { - /* Make sure "rtime" is the bigger of stime/rtime */ - if (stime > rtime) - swap(rtime, stime); - - /* Make sure 'total' fits in 32 bits */ - if (total >> 32) - goto drop_precision; - - /* Does rtime (and thus stime) fit in 32 bits? */ - if (!(rtime >> 32)) - break; - - /* Can we just balance rtime/stime rather than dropping bits? */ - if (stime >> 31) - goto drop_precision; - - /* We can grow stime and shrink rtime and try to make them both fit */ - stime <<= 1; - rtime >>= 1; - continue; - -drop_precision: - /* We drop from rtime, it has more bits than stime */ - rtime >>= 1; - total >>= 1; - } - - /* - * Make sure gcc understands that this is a 32x32->64 multiply, - * followed by a 64/32->64 divide. - */ - scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); - return scaled; -} - -/* * Adjust tick based cputime random precision against scheduler runtime * accounting. * @@ -622,7 +578,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, goto update; } - stime = scale_stime(stime, rtime, stime + utime); + stime = mul_u64_u64_div_u64(stime, rtime, stime + utime); update: /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 504d2f51b0d6..3862a28cd05d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -54,15 +54,49 @@ static inline struct dl_bw *dl_bw_of(int i) static inline int dl_bw_cpus(int i) { struct root_domain *rd = cpu_rq(i)->rd; - int cpus = 0; + int cpus; RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), "sched RCU must be held"); + + if (cpumask_subset(rd->span, cpu_active_mask)) + return cpumask_weight(rd->span); + + cpus = 0; + for_each_cpu_and(i, rd->span, cpu_active_mask) cpus++; return cpus; } + +static inline unsigned long __dl_bw_capacity(int i) +{ + struct root_domain *rd = cpu_rq(i)->rd; + unsigned long cap = 0; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + + for_each_cpu_and(i, rd->span, cpu_active_mask) + cap += capacity_orig_of(i); + + return cap; +} + +/* + * XXX Fix: If 'rq->rd == def_root_domain' perform AC against capacity + * of the CPU the task is running on rather rd's \Sum CPU capacity. + */ +static inline unsigned long dl_bw_capacity(int i) +{ + if (!static_branch_unlikely(&sched_asym_cpucapacity) && + capacity_orig_of(i) == SCHED_CAPACITY_SCALE) { + return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT; + } else { + return __dl_bw_capacity(i); + } +} #else static inline struct dl_bw *dl_bw_of(int i) { @@ -73,6 +107,11 @@ static inline int dl_bw_cpus(int i) { return 1; } + +static inline unsigned long dl_bw_capacity(int i) +{ + return SCHED_CAPACITY_SCALE; +} #endif static inline @@ -1098,7 +1137,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) * cannot use the runtime, and so it replenishes the task. This rule * works fine for implicit deadline tasks (deadline == period), and the * CBS was designed for implicit deadline tasks. However, a task with - * constrained deadline (deadine < period) might be awakened after the + * constrained deadline (deadline < period) might be awakened after the * deadline, but before the next period. In this case, replenishing the * task would allow it to run for runtime / deadline. As in this case * deadline < period, CBS enables a task to run for more than the @@ -1604,6 +1643,7 @@ static int select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) { struct task_struct *curr; + bool select_rq; struct rq *rq; if (sd_flag != SD_BALANCE_WAKE) @@ -1623,10 +1663,19 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) * other hand, if it has a shorter deadline, we * try to make it stay here, it might be important. */ - if (unlikely(dl_task(curr)) && - (curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &curr->dl)) && - (p->nr_cpus_allowed > 1)) { + select_rq = unlikely(dl_task(curr)) && + (curr->nr_cpus_allowed < 2 || + !dl_entity_preempt(&p->dl, &curr->dl)) && + p->nr_cpus_allowed > 1; + + /* + * Take the capacity of the CPU into account to + * ensure it fits the requirement of the task. + */ + if (static_branch_unlikely(&sched_asym_cpucapacity)) + select_rq |= !dl_task_fits_capacity(p, cpu); + + if (select_rq) { int target = find_later_rq(p); if (target != -1 && @@ -2430,8 +2479,8 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, } } -const struct sched_class dl_sched_class = { - .next = &rt_sched_class, +const struct sched_class dl_sched_class + __attribute__((section("__dl_sched_class"))) = { .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, @@ -2551,11 +2600,12 @@ void sched_dl_do_global(void) int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr) { - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); u64 period = attr->sched_period ?: attr->sched_deadline; u64 runtime = attr->sched_runtime; u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; - int cpus, err = -1; + int cpus, err = -1, cpu = task_cpu(p); + struct dl_bw *dl_b = dl_bw_of(cpu); + unsigned long cap; if (attr->sched_flags & SCHED_FLAG_SUGOV) return 0; @@ -2570,15 +2620,17 @@ int sched_dl_overflow(struct task_struct *p, int policy, * allocated bandwidth of the container. */ raw_spin_lock(&dl_b->lock); - cpus = dl_bw_cpus(task_cpu(p)); + cpus = dl_bw_cpus(cpu); + cap = dl_bw_capacity(cpu); + if (dl_policy(policy) && !task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, 0, new_bw)) { + !__dl_overflow(dl_b, cap, 0, new_bw)) { if (hrtimer_active(&p->dl.inactive_timer)) __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + !__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) { /* * XXX this is slightly incorrect: when the task * utilization decreases, we should delay the total @@ -2635,6 +2687,14 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) } /* + * Default limits for DL period; on the top end we guard against small util + * tasks still getting rediculous long effective runtimes, on the bottom end we + * guard against timer DoS. + */ +unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */ +unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */ + +/* * This function validates the new parameters of a -deadline task. * We ask for the deadline not being zero, and greater or equal * than the runtime, as well as the period of being zero or @@ -2646,6 +2706,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) */ bool __checkparam_dl(const struct sched_attr *attr) { + u64 period, max, min; + /* special dl tasks don't actually use any parameter */ if (attr->sched_flags & SCHED_FLAG_SUGOV) return true; @@ -2669,12 +2731,21 @@ bool __checkparam_dl(const struct sched_attr *attr) attr->sched_period & (1ULL << 63)) return false; + period = attr->sched_period; + if (!period) + period = attr->sched_deadline; + /* runtime <= deadline <= period (if period != 0) */ - if ((attr->sched_period != 0 && - attr->sched_period < attr->sched_deadline) || + if (period < attr->sched_deadline || attr->sched_deadline < attr->sched_runtime) return false; + max = (u64)READ_ONCE(sysctl_sched_dl_period_max) * NSEC_PER_USEC; + min = (u64)READ_ONCE(sysctl_sched_dl_period_min) * NSEC_PER_USEC; + + if (period < min || period > max) + return false; + return true; } @@ -2692,6 +2763,7 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_bw = 0; dl_se->dl_density = 0; + dl_se->dl_boosted = 0; dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; @@ -2714,19 +2786,19 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) #ifdef CONFIG_SMP int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) { + unsigned long flags, cap; unsigned int dest_cpu; struct dl_bw *dl_b; bool overflow; - int cpus, ret; - unsigned long flags; + int ret; dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); rcu_read_lock_sched(); dl_b = dl_bw_of(dest_cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(dest_cpu); - overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); + cap = dl_bw_capacity(dest_cpu); + overflow = __dl_overflow(dl_b, cap, 0, p->dl.dl_bw); if (overflow) { ret = -EBUSY; } else { @@ -2736,6 +2808,8 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo * We will free resources in the source root_domain * later on (see set_cpus_allowed_dl()). */ + int cpus = dl_bw_cpus(dest_cpu); + __dl_add(dl_b, p->dl.dl_bw, cpus); ret = 0; } @@ -2768,16 +2842,15 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, bool dl_cpu_busy(unsigned int cpu) { - unsigned long flags; + unsigned long flags, cap; struct dl_bw *dl_b; bool overflow; - int cpus; rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(cpu); - overflow = __dl_overflow(dl_b, cpus, 0, 0); + cap = dl_bw_capacity(cpu); + overflow = __dl_overflow(dl_b, cap, 0, 0); raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 239970b991c0..36c54265bb2b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -258,7 +258,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax); set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); /* &table[8] is terminator */ @@ -437,7 +437,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) else SEQ_printf(m, " %c", task_state_to_char(p)); - SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", + SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ", p->comm, task_pid_nr(p), SPLIT_NS(p->se.vruntime), (long long)(p->nvcsw + p->nivcsw), @@ -464,10 +464,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) SEQ_printf(m, "\n"); SEQ_printf(m, "runnable tasks:\n"); - SEQ_printf(m, " S task PID tree-key switches prio" + SEQ_printf(m, " S task PID tree-key switches prio" " wait-time sum-exec sum-sleep\n"); SEQ_printf(m, "-------------------------------------------------------" - "----------------------------------------------------\n"); + "------------------------------------------------------\n"); rcu_read_lock(); for_each_process_thread(g, p) { @@ -638,7 +638,6 @@ do { \ P(nr_running); P(nr_switches); - P(nr_load_updates); P(nr_uninterruptible); PN(next_balance); SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 538ba5d94e99..1a68a0536add 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -22,8 +22,6 @@ */ #include "sched.h" -#include <trace/events/sched.h> - /* * Targeted preemption latency for CPU-bound tasks: * @@ -191,7 +189,7 @@ static void update_sysctl(void) #undef SET_SYSCTL } -void sched_init_granularity(void) +void __init sched_init_granularity(void) { update_sysctl(); } @@ -645,8 +643,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) */ int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); unsigned int factor = get_update_sysctl_factor(); @@ -807,7 +804,7 @@ void post_init_entity_util_avg(struct task_struct *p) } } - sa->runnable_avg = cpu_scale; + sa->runnable_avg = sa->util_avg; if (p->sched_class != &fair_sched_class) { /* @@ -1094,7 +1091,7 @@ struct numa_group { * more by CPU use than by memory faults. */ unsigned long *faults_cpu; - unsigned long faults[0]; + unsigned long faults[]; }; /* @@ -2771,7 +2768,7 @@ static void task_numa_work(struct callback_head *work) return; - if (!down_read_trylock(&mm->mmap_sem)) + if (!mmap_read_trylock(mm)) return; vma = find_vma(mm, start); if (!vma) { @@ -2839,7 +2836,7 @@ out: mm->numa_scan_offset = start; else reset_ptenuma_scan(p); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); /* * Make sure tasks use at least 32x as much time to run other code @@ -2908,7 +2905,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) /* * We don't care about NUMA placement if we don't have memory. */ - if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) + if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) return; /* @@ -3095,7 +3092,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, #ifdef CONFIG_SMP do { - u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib; + u32 divider = get_pelt_divider(&se->avg); se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); } while (0); @@ -3441,52 +3438,50 @@ static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; + u32 divider; /* Nothing to update */ if (!delta) return; /* - * The relation between sum and avg is: - * - * LOAD_AVG_MAX - 1024 + sa->period_contrib - * - * however, the PELT windows are not aligned between grq and gse. + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. */ + divider = get_pelt_divider(&cfs_rq->avg); /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; - se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; + se->avg.util_sum = se->avg.util_avg * divider; /* Update parent cfs_rq utilization */ add_positive(&cfs_rq->avg.util_avg, delta); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; } static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; + u32 divider; /* Nothing to update */ if (!delta) return; /* - * The relation between sum and avg is: - * - * LOAD_AVG_MAX - 1024 + sa->period_contrib - * - * however, the PELT windows are not aligned between grq and gse. + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. */ + divider = get_pelt_divider(&cfs_rq->avg); /* Set new sched_entity's runnable */ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; - se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX; + se->avg.runnable_sum = se->avg.runnable_avg * divider; /* Update parent cfs_rq runnable */ add_positive(&cfs_rq->avg.runnable_avg, delta); - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX; + cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; } static inline void @@ -3496,19 +3491,26 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq unsigned long load_avg; u64 load_sum = 0; s64 delta_sum; + u32 divider; if (!runnable_sum) return; gcfs_rq->prop_runnable_sum = 0; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + divider = get_pelt_divider(&cfs_rq->avg); + if (runnable_sum >= 0) { /* * Add runnable; clip at LOAD_AVG_MAX. Reflects that until * the CPU is saturated running == runnable. */ runnable_sum += se->avg.load_sum; - runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX); + runnable_sum = min_t(long, runnable_sum, divider); } else { /* * Estimate the new unweighted runnable_sum of the gcfs_rq by @@ -3533,7 +3535,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq runnable_sum = max(runnable_sum, running_sum); load_sum = (s64)se_weight(se) * runnable_sum; - load_avg = div_s64(load_sum, LOAD_AVG_MAX); + load_avg = div_s64(load_sum, divider); delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; delta_avg = load_avg - se->avg.load_avg; @@ -3646,7 +3648,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) if (cfs_rq->removed.nr) { unsigned long r; - u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + u32 divider = get_pelt_divider(&cfs_rq->avg); raw_spin_lock(&cfs_rq->removed.lock); swap(cfs_rq->removed.util_avg, removed_util); @@ -3697,7 +3699,11 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + u32 divider = get_pelt_divider(&cfs_rq->avg); /* * When we attach the @se to the @cfs_rq, we must align the decay @@ -3873,6 +3879,8 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); + static inline unsigned long task_util(struct task_struct *p) { return READ_ONCE(p->se.avg.util_avg); @@ -3916,6 +3924,8 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, enqueued = cfs_rq->avg.util_est.enqueued; enqueued += _task_util_est(p); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + + trace_sched_util_est_cfs_tp(cfs_rq); } /* @@ -3946,6 +3956,8 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p)); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); + trace_sched_util_est_cfs_tp(cfs_rq); + /* * Skip update of task's estimated utilization when the task has not * yet completed an activation, e.g. being migrated. @@ -4011,6 +4023,8 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; done: WRITE_ONCE(p->se.avg.util_est, ue); + + trace_sched_util_est_se_tp(&p->se); } static inline int task_fits_capacity(struct task_struct *p, long capacity) @@ -4033,7 +4047,11 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) return; } - rq->misfit_task_load = task_h_load(p); + /* + * Make sure that misfit_task_load will not be null even if + * task_h_load() returns 0. + */ + rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); } #else /* CONFIG_SMP */ @@ -4054,7 +4072,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline int idle_balance(struct rq *rq, struct rq_flags *rf) +static inline int newidle_balance(struct rq *rq, struct rq_flags *rf) { return 0; } @@ -4588,16 +4606,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) } /* returns 0 on failure to allocate runtime */ -static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b, + struct cfs_rq *cfs_rq, u64 target_runtime) { - struct task_group *tg = cfs_rq->tg; - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount; + u64 min_amount, amount = 0; + + lockdep_assert_held(&cfs_b->lock); /* note: this is a positive sum as runtime_remaining <= 0 */ - min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; + min_amount = target_runtime - cfs_rq->runtime_remaining; - raw_spin_lock(&cfs_b->lock); if (cfs_b->quota == RUNTIME_INF) amount = min_amount; else { @@ -4609,13 +4627,25 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } - raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += amount; return cfs_rq->runtime_remaining > 0; } +/* returns 0 on failure to allocate runtime */ +static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + int ret; + + raw_spin_lock(&cfs_b->lock); + ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice()); + raw_spin_unlock(&cfs_b->lock); + + return ret; +} + static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ @@ -4704,13 +4734,33 @@ static int tg_throttle_down(struct task_group *tg, void *data) return 0; } -static void throttle_cfs_rq(struct cfs_rq *cfs_rq) +static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta, dequeue = 1; - bool empty; + + raw_spin_lock(&cfs_b->lock); + /* This will start the period timer if necessary */ + if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) { + /* + * We have raced with bandwidth becoming available, and if we + * actually throttled the timer might not unthrottle us for an + * entire period. We additionally needed to make sure that any + * subsequent check_cfs_rq_runtime calls agree not to throttle + * us, as we may commit to do cfs put_prev+pick_next, so we ask + * for 1ns of runtime rather than just check cfs_b. + */ + dequeue = 0; + } else { + list_add_tail_rcu(&cfs_rq->throttled_list, + &cfs_b->throttled_cfs_rq); + } + raw_spin_unlock(&cfs_b->lock); + + if (!dequeue) + return false; /* Throttle no longer required. */ se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -4744,29 +4794,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) if (!se) sub_nr_running(rq, task_delta); - cfs_rq->throttled = 1; - cfs_rq->throttled_clock = rq_clock(rq); - raw_spin_lock(&cfs_b->lock); - empty = list_empty(&cfs_b->throttled_cfs_rq); - - /* - * Add to the _head_ of the list, so that an already-started - * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is - * not running add to the tail so that later runqueues don't get starved. - */ - if (cfs_b->distribute_running) - list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - else - list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - /* - * If we're the first throttled task, make sure the bandwidth - * timer is running. + * Note: distribution will already see us throttled via the + * throttled-list. rq->lock protects completion. */ - if (empty) - start_cfs_bandwidth(cfs_b); - - raw_spin_unlock(&cfs_b->lock); + cfs_rq->throttled = 1; + cfs_rq->throttled_clock = rq_clock(rq); + return true; } void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) @@ -4933,14 +4967,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u /* * This check is repeated as we release cfs_b->lock while we unthrottle. */ - while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { - cfs_b->distribute_running = 1; + while (throttled && cfs_b->runtime > 0) { raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); - cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); } @@ -5054,10 +5086,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) /* confirm we're still not at a refresh boundary */ raw_spin_lock_irqsave(&cfs_b->lock, flags); cfs_b->slack_started = false; - if (cfs_b->distribute_running) { - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); - return; - } if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { raw_spin_unlock_irqrestore(&cfs_b->lock, flags); @@ -5067,9 +5095,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - if (runtime) - cfs_b->distribute_running = 1; - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); if (!runtime) @@ -5078,7 +5103,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); - cfs_b->distribute_running = 0; raw_spin_unlock_irqrestore(&cfs_b->lock, flags); } @@ -5139,8 +5163,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (cfs_rq_throttled(cfs_rq)) return true; - throttle_cfs_rq(cfs_rq); - return true; + return throttle_cfs_rq(cfs_rq); } static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) @@ -5170,6 +5193,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (!overrun) break; + idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); + if (++count > 3) { u64 new, old = ktime_to_ns(cfs_b->period); @@ -5199,8 +5224,6 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) /* reset count so we don't come right back in here */ count = 0; } - - idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); } if (idle) cfs_b->period_active = 0; @@ -5221,7 +5244,6 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->period_timer.function = sched_cfs_period_timer; hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; - cfs_b->distribute_running = 0; cfs_b->slack_started = false; } @@ -5506,28 +5528,27 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) list_add_leaf_cfs_rq(cfs_rq); } -enqueue_throttle: - if (!se) { - add_nr_running(rq, 1); - /* - * Since new tasks are assigned an initial util_avg equal to - * half of the spare capacity of their CPU, tiny tasks have the - * ability to cross the overutilized threshold, which will - * result in the load balancer ruining all the task placement - * done by EAS. As a way to mitigate that effect, do not account - * for the first enqueue operation of new tasks during the - * overutilized flag detection. - * - * A better way of solving this problem would be to wait for - * the PELT signals of tasks to converge before taking them - * into account, but that is not straightforward to implement, - * and the following generally works well enough in practice. - */ - if (flags & ENQUEUE_WAKEUP) - update_overutilized_status(rq); + /* At this point se is NULL and we are at root level*/ + add_nr_running(rq, 1); - } + /* + * Since new tasks are assigned an initial util_avg equal to + * half of the spare capacity of their CPU, tiny tasks have the + * ability to cross the overutilized threshold, which will + * result in the load balancer ruining all the task placement + * done by EAS. As a way to mitigate that effect, do not account + * for the first enqueue operation of new tasks during the + * overutilized flag detection. + * + * A better way of solving this problem would be to wait for + * the PELT signals of tasks to converge before taking them + * into account, but that is not straightforward to implement, + * and the following generally works well enough in practice. + */ + if (flags & ENQUEUE_WAKEUP) + update_overutilized_status(rq); +enqueue_throttle: if (cfs_bandwidth_used()) { /* * When bandwidth control is enabled; the cfs_rq_throttled() @@ -5605,14 +5626,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } -dequeue_throttle: - if (!se) - sub_nr_running(rq, 1); + /* At this point se is NULL and we are at root level*/ + sub_nr_running(rq, 1); /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) rq->next_balance = jiffies; +dequeue_throttle: util_est_dequeue(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -5737,7 +5758,7 @@ static int wake_wide(struct task_struct *p) { unsigned int master = current->wakee_flips; unsigned int slave = p->wakee_flips; - int factor = this_cpu_read(sd_llc_size); + int factor = __this_cpu_read(sd_llc_size); if (master < slave) swap(master, slave); @@ -5846,8 +5867,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, } static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag); +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); /* * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. @@ -5930,7 +5950,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p continue; } - group = find_idlest_group(sd, p, cpu, sd_flag); + group = find_idlest_group(sd, p, cpu); if (!group) { sd = sd->child; continue; @@ -6489,7 +6509,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) max_util = max(max_util, cpu_util); } - return em_pd_energy(pd->em_pd, max_util, sum_util); + return em_cpu_energy(pd->em_pd, max_util, sum_util); } /* @@ -6671,9 +6691,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f rcu_read_lock(); for_each_domain(cpu, tmp) { - if (!(tmp->flags & SD_LOAD_BALANCE)) - break; - /* * If both 'cpu' and 'prev_cpu' are part of this domain, * cpu is a valid SD_WAKE_AFFINE target. @@ -7152,7 +7169,7 @@ static void yield_task_fair(struct rq *rq) set_skip_buddy(se); } -static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) +static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; @@ -7633,7 +7650,14 @@ static int detach_tasks(struct lb_env *env) switch (env->migration_type) { case migrate_load: - load = task_h_load(p); + /* + * Depending of the number of CPUs and tasks and the + * cgroup hierarchy, task_h_load() can return a null + * value. Make sure that env->imbalance decreases + * otherwise detach_tasks() will stop only after + * detaching up to loop_max tasks. + */ + load = max_t(unsigned long, task_h_load(p), 1); if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) @@ -8033,7 +8057,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) }; } -static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) +static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long max = arch_scale_cpu_capacity(cpu); @@ -8065,7 +8089,7 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = scale_rt_capacity(sd, cpu); + unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); @@ -8584,7 +8608,7 @@ static int idle_cpu_without(int cpu, struct task_struct *p) */ #ifdef CONFIG_SMP - if (!llist_empty(&rq->wake_list)) + if (rq->ttwu_pending) return 0; #endif @@ -8687,8 +8711,14 @@ static bool update_pick_idlest(struct sched_group *idlest, case group_has_spare: /* Select group with most idle CPUs */ - if (idlest_sgs->idle_cpus >= sgs->idle_cpus) + if (idlest_sgs->idle_cpus > sgs->idle_cpus) + return false; + + /* Select group with lowest group_util */ + if (idlest_sgs->idle_cpus == sgs->idle_cpus && + idlest_sgs->group_util <= sgs->group_util) return false; + break; } @@ -8702,8 +8732,7 @@ static bool update_pick_idlest(struct sched_group *idlest, * Assumes p is allowed on at least one CPU in sd. */ static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag) +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) { struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups; struct sg_lb_stats local_sgs, tmp_sgs; @@ -9434,7 +9463,7 @@ static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) { struct sched_group *sg = env->sd->groups; - int cpu, balance_cpu = -1; + int cpu; /* * Ensure the balancing environment is consistent; can happen @@ -9455,18 +9484,12 @@ static int should_we_balance(struct lb_env *env) if (!idle_cpu(cpu)) continue; - balance_cpu = cpu; - break; + /* Are we the first idle CPU? */ + return cpu == env->dst_cpu; } - if (balance_cpu == -1) - balance_cpu = group_balance_cpu(sg); - - /* - * First idle CPU or the first CPU(busiest) in this sched group - * is eligible for doing load balancing at this and above domains. - */ - return balance_cpu == env->dst_cpu; + /* Are we the first CPU of this group ? */ + return group_balance_cpu(sg) == env->dst_cpu; } /* @@ -9819,9 +9842,8 @@ static int active_load_balance_cpu_stop(void *data) /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); for_each_domain(target_cpu, sd) { - if ((sd->flags & SD_LOAD_BALANCE) && - cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) - break; + if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + break; } if (likely(sd)) { @@ -9910,9 +9932,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) } max_cost += sd->max_newidle_lb_cost; - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - /* * Stop the load balance at this level. There is another * CPU in our sched group which is doing load balancing more @@ -10022,24 +10041,32 @@ static void kick_ilb(unsigned int flags) { int ilb_cpu; - nohz.next_balance++; + /* + * Increase nohz.next_balance only when if full ilb is triggered but + * not if we only update stats. + */ + if (flags & NOHZ_BALANCE_KICK) + nohz.next_balance = jiffies+1; ilb_cpu = find_new_ilb(); if (ilb_cpu >= nr_cpu_ids) return; + /* + * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets + * the first flag owns it; cleared by nohz_csd_func(). + */ flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu)); if (flags & NOHZ_KICK_MASK) return; /* - * Use smp_send_reschedule() instead of resched_cpu(). - * This way we generate a sched IPI on the target CPU which + * This way we generate an IPI on the target CPU which * is idle. And the softirq performing nohz idle load balance * will be run before returning from the IPI. */ - smp_send_reschedule(ilb_cpu); + smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd); } /* @@ -10340,6 +10367,14 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, } } + /* + * next_balance will be updated only when there is a need. + * When the CPU is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + nohz.next_balance = next_balance; + /* Newly idle CPU doesn't need an update */ if (idle != CPU_NEWLY_IDLE) { update_blocked_averages(this_cpu); @@ -10360,14 +10395,6 @@ abort: if (has_blocked_load) WRITE_ONCE(nohz.has_blocked, 1); - /* - * next_balance will be updated only when there is a need. - * When the CPU is attached to null domain for ex, it will not be - * updated. - */ - if (likely(update_next_balance)) - nohz.next_balance = next_balance; - return ret; } @@ -10377,20 +10404,14 @@ abort: */ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { - int this_cpu = this_rq->cpu; - unsigned int flags; + unsigned int flags = this_rq->nohz_idle_balance; - if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK)) + if (!flags) return false; - if (idle != CPU_IDLE) { - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); - return false; - } + this_rq->nohz_idle_balance = 0; - /* could be _relaxed() */ - flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); - if (!(flags & NOHZ_KICK_MASK)) + if (idle != CPU_IDLE) return false; _nohz_idle_balance(this_rq, flags, idle); @@ -10450,7 +10471,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } * 0 - failed, no new tasks * > 0 - success, new (fair) tasks present */ -int newidle_balance(struct rq *this_rq, struct rq_flags *rf) +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; @@ -10501,9 +10522,6 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf) int continue_balancing = 1; u64 t0, domain_cost; - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { update_next_balance(sd, &next_balance); break; @@ -11119,8 +11137,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task /* * All the scheduling class methods: */ -const struct sched_class fair_sched_class = { - .next = &idle_sched_class, +const struct sched_class fair_sched_class + __attribute__((section("__fair_sched_class"))) = { .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, @@ -11293,3 +11311,9 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd) #endif } EXPORT_SYMBOL_GPL(sched_trace_rd_span); + +int sched_trace_rq_nr_running(struct rq *rq) +{ + return rq ? rq->nr_running : -1; +} +EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index b743bf38f08f..6bf34986f45c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -96,6 +96,15 @@ void __cpuidle default_idle_call(void) } } +static int call_cpuidle_s2idle(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + if (current_clr_polling_and_test()) + return -EBUSY; + + return cpuidle_enter_s2idle(drv, dev); +} + static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, int next_state) { @@ -171,11 +180,9 @@ static void cpuidle_idle_call(void) if (idle_should_enter_s2idle()) { rcu_idle_enter(); - entered_state = cpuidle_enter_s2idle(drv, dev); - if (entered_state > 0) { - local_irq_enable(); + entered_state = call_cpuidle_s2idle(drv, dev); + if (entered_state > 0) goto exit_idle; - } rcu_idle_exit(); @@ -289,7 +296,11 @@ static void do_idle(void) */ smp_mb__after_atomic(); - sched_ttwu_pending(); + /* + * RCU relies on this call to be done outside of an RCU read-side + * critical section. + */ + flush_smp_call_function_from_idle(); schedule_idle(); if (unlikely(klp_patch_pending(current))) @@ -442,11 +453,6 @@ prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) BUG(); } -static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) -{ - return 0; -} - static void update_curr_idle(struct rq *rq) { } @@ -454,8 +460,8 @@ static void update_curr_idle(struct rq *rq) /* * Simple, special scheduling class for the per-CPU idle tasks: */ -const struct sched_class idle_sched_class = { - /* .next is NULL */ +const struct sched_class idle_sched_class + __attribute__((section("__idle_sched_class"))) = { /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ @@ -475,8 +481,6 @@ const struct sched_class idle_sched_class = { .task_tick = task_tick_idle, - .get_rr_interval = get_rr_interval_idle, - .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, .update_curr = update_curr_idle, diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 808244f3ddd9..5a6ea03f9882 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -140,7 +140,8 @@ static int __init housekeeping_nohz_full_setup(char *str) { unsigned int flags; - flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; + flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | + HK_FLAG_MISC | HK_FLAG_KTHREAD; return housekeeping_setup(str, flags); } diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index de22da666ac7..d2a655643a02 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -347,7 +347,7 @@ static inline void calc_global_nohz(void) { } * * Called from the global timer code. */ -void calc_global_load(unsigned long ticks) +void calc_global_load(void) { unsigned long sample_window; long active, delta; diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index b647d04d9c8b..2c613e1cff3a 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -28,8 +28,6 @@ #include "sched.h" #include "pelt.h" -#include <trace/events/sched.h> - /* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) @@ -83,8 +81,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) return c1 + c2 + c3; } -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) - /* * Accumulate the three separate parts of the sum; d1 the remainder * of the last (incomplete) period, d2 the span of full periods and d3 @@ -237,10 +233,34 @@ ___update_load_sum(u64 now, struct sched_avg *sa, return 1; } +/* + * When syncing *_avg with *_sum, we must take into account the current + * position in the PELT segment otherwise the remaining part of the segment + * will be considered as idle time whereas it's not yet elapsed and this will + * generate unwanted oscillation in the range [1002..1024[. + * + * The max value of *_sum varies with the position in the time segment and is + * equals to : + * + * LOAD_AVG_MAX*y + sa->period_contrib + * + * which can be simplified into: + * + * LOAD_AVG_MAX - 1024 + sa->period_contrib + * + * because LOAD_AVG_MAX*y == LOAD_AVG_MAX-1024 + * + * The same care must be taken when a sched entity is added, updated or + * removed from a cfs_rq and we need to update sched_avg. Scheduler entities + * and the cfs rq, to which they are attached, have the same position in the + * time segment because they use the same clock. This means that we can use + * the period_contrib of cfs_rq when updating the sched_avg of a sched_entity + * if it's more convenient. + */ static __always_inline void ___update_load_avg(struct sched_avg *sa, unsigned long load) { - u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + u32 divider = get_pelt_divider(sa); /* * Step 2: update *_avg. diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index eb034d9f024d..795e43e02afc 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -37,6 +37,11 @@ update_irq_load_avg(struct rq *rq, u64 running) } #endif +static inline u32 get_pelt_divider(struct sched_avg *avg) +{ + return LOAD_AVG_MAX - 1024 + avg->period_contrib; +} + /* * When a task is dequeued, its estimated utilization should not be update if * its util_avg has not been updated at least once. diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 8f45cdb6463b..967732c0766c 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -190,7 +190,6 @@ static void group_init(struct psi_group *group) INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); mutex_init(&group->avgs_lock); /* Init trigger-related members */ - atomic_set(&group->poll_scheduled, 0); mutex_init(&group->trigger_lock); INIT_LIST_HEAD(&group->triggers); memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); @@ -199,7 +198,7 @@ static void group_init(struct psi_group *group) memset(group->polling_total, 0, sizeof(group->polling_total)); group->polling_next_update = ULLONG_MAX; group->polling_until = 0; - rcu_assign_pointer(group->poll_kworker, NULL); + rcu_assign_pointer(group->poll_task, NULL); } void __init psi_init(void) @@ -547,47 +546,38 @@ static u64 update_triggers(struct psi_group *group, u64 now) return now + group->poll_min_period; } -/* - * Schedule polling if it's not already scheduled. It's safe to call even from - * hotpath because even though kthread_queue_delayed_work takes worker->lock - * spinlock that spinlock is never contended due to poll_scheduled atomic - * preventing such competition. - */ +/* Schedule polling if it's not already scheduled. */ static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) { - struct kthread_worker *kworker; + struct task_struct *task; - /* Do not reschedule if already scheduled */ - if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0) + /* + * Do not reschedule if already scheduled. + * Possible race with a timer scheduled after this check but before + * mod_timer below can be tolerated because group->polling_next_update + * will keep updates on schedule. + */ + if (timer_pending(&group->poll_timer)) return; rcu_read_lock(); - kworker = rcu_dereference(group->poll_kworker); + task = rcu_dereference(group->poll_task); /* * kworker might be NULL in case psi_trigger_destroy races with * psi_task_change (hotpath) which can't use locks */ - if (likely(kworker)) - kthread_queue_delayed_work(kworker, &group->poll_work, delay); - else - atomic_set(&group->poll_scheduled, 0); + if (likely(task)) + mod_timer(&group->poll_timer, jiffies + delay); rcu_read_unlock(); } -static void psi_poll_work(struct kthread_work *work) +static void psi_poll_work(struct psi_group *group) { - struct kthread_delayed_work *dwork; - struct psi_group *group; u32 changed_states; u64 now; - dwork = container_of(work, struct kthread_delayed_work, work); - group = container_of(dwork, struct psi_group, poll_work); - - atomic_set(&group->poll_scheduled, 0); - mutex_lock(&group->trigger_lock); now = sched_clock(); @@ -623,6 +613,32 @@ out: mutex_unlock(&group->trigger_lock); } +static int psi_poll_worker(void *data) +{ + struct psi_group *group = (struct psi_group *)data; + + sched_set_fifo_low(current); + + while (true) { + wait_event_interruptible(group->poll_wait, + atomic_cmpxchg(&group->poll_wakeup, 1, 0) || + kthread_should_stop()); + if (kthread_should_stop()) + break; + + psi_poll_work(group); + } + return 0; +} + +static void poll_timer_fn(struct timer_list *t) +{ + struct psi_group *group = from_timer(group, t, poll_timer); + + atomic_set(&group->poll_wakeup, 1); + wake_up_interruptible(&group->poll_wait); +} + static void record_times(struct psi_group_cpu *groupc, int cpu, bool memstall_tick) { @@ -1099,22 +1115,20 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, mutex_lock(&group->trigger_lock); - if (!rcu_access_pointer(group->poll_kworker)) { - struct sched_param param = { - .sched_priority = 1, - }; - struct kthread_worker *kworker; + if (!rcu_access_pointer(group->poll_task)) { + struct task_struct *task; - kworker = kthread_create_worker(0, "psimon"); - if (IS_ERR(kworker)) { + task = kthread_create(psi_poll_worker, group, "psimon"); + if (IS_ERR(task)) { kfree(t); mutex_unlock(&group->trigger_lock); - return ERR_CAST(kworker); + return ERR_CAST(task); } - sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); - kthread_init_delayed_work(&group->poll_work, - psi_poll_work); - rcu_assign_pointer(group->poll_kworker, kworker); + atomic_set(&group->poll_wakeup, 0); + init_waitqueue_head(&group->poll_wait); + wake_up_process(task); + timer_setup(&group->poll_timer, poll_timer_fn, 0); + rcu_assign_pointer(group->poll_task, task); } list_add(&t->node, &group->triggers); @@ -1132,7 +1146,7 @@ static void psi_trigger_destroy(struct kref *ref) { struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); struct psi_group *group = t->group; - struct kthread_worker *kworker_to_destroy = NULL; + struct task_struct *task_to_destroy = NULL; if (static_branch_likely(&psi_disabled)) return; @@ -1158,13 +1172,13 @@ static void psi_trigger_destroy(struct kref *ref) period = min(period, div_u64(tmp->win.size, UPDATES_PER_WINDOW)); group->poll_min_period = period; - /* Destroy poll_kworker when the last trigger is destroyed */ + /* Destroy poll_task when the last trigger is destroyed */ if (group->poll_states == 0) { group->polling_until = 0; - kworker_to_destroy = rcu_dereference_protected( - group->poll_kworker, + task_to_destroy = rcu_dereference_protected( + group->poll_task, lockdep_is_held(&group->trigger_lock)); - rcu_assign_pointer(group->poll_kworker, NULL); + rcu_assign_pointer(group->poll_task, NULL); } } @@ -1172,25 +1186,23 @@ static void psi_trigger_destroy(struct kref *ref) /* * Wait for both *trigger_ptr from psi_trigger_replace and - * poll_kworker RCUs to complete their read-side critical sections - * before destroying the trigger and optionally the poll_kworker + * poll_task RCUs to complete their read-side critical sections + * before destroying the trigger and optionally the poll_task */ synchronize_rcu(); /* * Destroy the kworker after releasing trigger_lock to prevent a * deadlock while waiting for psi_poll_work to acquire trigger_lock */ - if (kworker_to_destroy) { + if (task_to_destroy) { /* * After the RCU grace period has expired, the worker - * can no longer be found through group->poll_kworker. + * can no longer be found through group->poll_task. * But it might have been already scheduled before * that - deschedule it cleanly before destroying it. */ - kthread_cancel_delayed_work_sync(&group->poll_work); - atomic_set(&group->poll_scheduled, 0); - - kthread_destroy_worker(kworker_to_destroy); + del_timer_sync(&group->poll_timer); + kthread_stop(task_to_destroy); } kfree(t); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index df11d88c9895..f215eea6a966 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -9,6 +9,8 @@ int sched_rr_timeslice = RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; +/* More than 4 hours if BW_SHIFT equals 20. */ +static const u64 max_rt_runtime = MAX_BW; static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); @@ -2427,8 +2429,8 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) return 0; } -const struct sched_class rt_sched_class = { - .next = &fair_sched_class, +const struct sched_class rt_sched_class + __attribute__((section("__rt_sched_class"))) = { .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, @@ -2585,6 +2587,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg, if (rt_period == 0) return -EINVAL; + /* + * Bound quota to defend quota against overflow during bandwidth shift. + */ + if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime) + return -EINVAL; + mutex_lock(&rt_constraints_mutex); err = __rt_schedulable(tg, rt_period, rt_runtime); if (err) @@ -2702,7 +2710,9 @@ static int sched_rt_global_validate(void) return -EINVAL; if ((sysctl_sched_rt_runtime != RUNTIME_INF) && - (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) + ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) || + ((u64)sysctl_sched_rt_runtime * + NSEC_PER_USEC > max_rt_runtime))) return -EINVAL; return 0; @@ -2714,9 +2724,8 @@ static void sched_rt_do_global(void) def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); } -int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int sched_rt_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int old_period, old_runtime; static DEFINE_MUTEX(mutex); @@ -2754,9 +2763,8 @@ undo: return ret; } -int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int sched_rr_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; static DEFINE_MUTEX(mutex); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index db3a57675ccf..3fd283892761 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -67,6 +67,7 @@ #include <linux/tsacct_kern.h> #include <asm/tlb.h> +#include <asm-generic/vmlinux.lds.h> #ifdef CONFIG_PARAVIRT # include <asm/paravirt.h> @@ -75,6 +76,8 @@ #include "cpupri.h" #include "cpudeadline.h" +#include <trace/events/sched.h> + #ifdef CONFIG_SCHED_DEBUG # define SCHED_WARN_ON(x) WARN_ONCE(x, #x) #else @@ -96,6 +99,7 @@ extern atomic_long_t calc_load_tasks; extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); +extern void call_trace_sched_update_nr_running(struct rq *rq, int count); /* * Helpers for converting nanosecond timing to jiffy resolution */ @@ -310,11 +314,26 @@ void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) __dl_update(dl_b, -((s32)tsk_bw / cpus)); } -static inline -bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +static inline bool __dl_overflow(struct dl_bw *dl_b, unsigned long cap, + u64 old_bw, u64 new_bw) { return dl_b->bw != -1 && - dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; + cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw; +} + +/* + * Verify the fitness of task @p to run on @cpu taking into account the + * CPU original capacity and the runtime/deadline ratio of the task. + * + * The function will return true if the CPU original capacity of the + * @cpu scaled by SCHED_CAPACITY_SCALE >= runtime/deadline ratio of the + * task and false otherwise. + */ +static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu) +{ + unsigned long cap = arch_scale_cpu_capacity(cpu); + + return cap_scale(p->dl.dl_deadline, cap) >= p->dl.dl_runtime; } extern void init_dl_bw(struct dl_bw *dl_b); @@ -349,7 +368,6 @@ struct cfs_bandwidth { u8 idle; u8 period_active; - u8 distribute_running; u8 slack_started; struct hrtimer period_timer; struct hrtimer slack_timer; @@ -863,6 +881,8 @@ struct uclamp_rq { unsigned int value; struct uclamp_bucket bucket[UCLAMP_BUCKETS]; }; + +DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); #endif /* CONFIG_UCLAMP_TASK */ /* @@ -890,12 +910,15 @@ struct rq { #ifdef CONFIG_SMP unsigned long last_blocked_load_update_tick; unsigned int has_blocked_load; + call_single_data_t nohz_csd; #endif /* CONFIG_SMP */ unsigned int nohz_tick_stopped; - atomic_t nohz_flags; + atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ - unsigned long nr_load_updates; +#ifdef CONFIG_SMP + unsigned int ttwu_pending; +#endif u64 nr_switches; #ifdef CONFIG_UCLAMP_TASK @@ -951,6 +974,7 @@ struct rq { struct callback_head *balance_callback; + unsigned char nohz_idle_balance; unsigned char idle_balance; unsigned long misfit_task_load; @@ -979,7 +1003,7 @@ struct rq { /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; -#endif +#endif /* CONFIG_SMP */ #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; @@ -1020,10 +1044,6 @@ struct rq { unsigned int ttwu_local; #endif -#ifdef CONFIG_SMP - struct llist_head wake_list; -#endif - #ifdef CONFIG_CPU_IDLE /* Must be inspected within a rcu lock section */ struct cpuidle_state *idle_state; @@ -1183,6 +1203,16 @@ struct rq_flags { #endif }; +/* + * Lockdep annotation that avoids accidental unlocks; it's like a + * sticky/continuous lockdep_assert_held(). + * + * This avoids code that has access to 'struct rq *rq' (basically everything in + * the scheduler) from accidentally unlocking the rq if they do not also have a + * copy of the (on-stack) 'struct rq_flags rf'. + * + * Also see Documentation/locking/lockdep-design.rst. + */ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) { rf->cookie = lockdep_pin_lock(&rq->lock); @@ -1367,8 +1397,6 @@ queue_balance_callback(struct rq *rq, rq->balance_callback = head; } -extern void sched_ttwu_pending(void); - #define rcu_dereference_check_sched_domain(p) \ rcu_dereference_check((p), \ lockdep_is_held(&sched_domains_mutex)) @@ -1461,7 +1489,7 @@ struct sched_group { * by attaching extra space to the end of the structure, * depending on how many CPUs the kernel has booted up with) */ - unsigned long cpumask[0]; + unsigned long cpumask[]; }; static inline struct cpumask *sched_group_span(struct sched_group *sg) @@ -1504,15 +1532,11 @@ static inline void unregister_sched_domain_sysctl(void) } #endif -extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf); +extern void flush_smp_call_function_from_idle(void); -#else - -static inline void sched_ttwu_pending(void) { } - -static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; } - -#endif /* CONFIG_SMP */ +#else /* !CONFIG_SMP: */ +static inline void flush_smp_call_function_from_idle(void) { } +#endif #include "stats.h" #include "autogroup.h" @@ -1688,7 +1712,8 @@ static inline int task_on_rq_migrating(struct task_struct *p) */ #define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ #define WF_FORK 0x02 /* Child wakeup after fork */ -#define WF_MIGRATED 0x4 /* Internal use, task got migrated */ +#define WF_MIGRATED 0x04 /* Internal use, task got migrated */ +#define WF_ON_CPU 0x08 /* Wakee is on_cpu */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ -1745,7 +1770,6 @@ extern const u32 sched_prio_to_wmult[40]; #define RETRY_TASK ((void *)-1UL) struct sched_class { - const struct sched_class *next; #ifdef CONFIG_UCLAMP_TASK int uclamp_enabled; @@ -1754,7 +1778,7 @@ struct sched_class { void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); - bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt); + bool (*yield_to_task)(struct rq *rq, struct task_struct *p); void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); @@ -1802,7 +1826,7 @@ struct sched_class { #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_change_group)(struct task_struct *p, int type); #endif -}; +} __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { @@ -1816,17 +1840,18 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) next->sched_class->set_next_task(rq, next, false); } -#ifdef CONFIG_SMP -#define sched_class_highest (&stop_sched_class) -#else -#define sched_class_highest (&dl_sched_class) -#endif +/* Defined in include/asm-generic/vmlinux.lds.h */ +extern struct sched_class __begin_sched_classes[]; +extern struct sched_class __end_sched_classes[]; + +#define sched_class_highest (__end_sched_classes - 1) +#define sched_class_lowest (__begin_sched_classes - 1) #define for_class_range(class, _from, _to) \ - for (class = (_from); class != (_to); class = class->next) + for (class = (_from); class != (_to); class--) #define for_each_class(class) \ - for_class_range(class, sched_class_highest, NULL) + for_class_range(class, sched_class_highest, sched_class_lowest) extern const struct sched_class stop_sched_class; extern const struct sched_class dl_sched_class; @@ -1918,6 +1943,8 @@ extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) #define RATIO_SHIFT 8 +#define MAX_BW_BITS (64 - BW_SHIFT) +#define MAX_BW ((1ULL << MAX_BW_BITS) - 1) unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); @@ -1934,12 +1961,7 @@ extern int __init sched_tick_offload_init(void); */ static inline void sched_update_tick_dependency(struct rq *rq) { - int cpu; - - if (!tick_nohz_full_enabled()) - return; - - cpu = cpu_of(rq); + int cpu = cpu_of(rq); if (!tick_nohz_full_cpu(cpu)) return; @@ -1959,6 +1981,9 @@ static inline void add_nr_running(struct rq *rq, unsigned count) unsigned prev_nr = rq->nr_running; rq->nr_running = prev_nr + count; + if (trace_sched_update_nr_running_tp_enabled()) { + call_trace_sched_update_nr_running(rq, count); + } #ifdef CONFIG_SMP if (prev_nr < 2 && rq->nr_running >= 2) { @@ -1973,6 +1998,10 @@ static inline void add_nr_running(struct rq *rq, unsigned count) static inline void sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; + if (trace_sched_update_nr_running_tp_enabled()) { + call_trace_sched_update_nr_running(rq, count); + } + /* Check if we still need preemption */ sched_update_tick_dependency(rq); } @@ -2020,6 +2049,16 @@ void arch_scale_freq_tick(void) #endif #ifndef arch_scale_freq_capacity +/** + * arch_scale_freq_capacity - get the frequency scale factor of a given CPU. + * @cpu: the CPU in question. + * + * Return: the frequency scale factor normalized against SCHED_CAPACITY_SCALE, i.e. + * + * f_curr + * ------ * SCHED_CAPACITY_SCALE + * f_max + */ static __always_inline unsigned long arch_scale_freq_capacity(int cpu) { @@ -2353,12 +2392,35 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #ifdef CONFIG_UCLAMP_TASK unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); +/** + * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values. + * @rq: The rq to clamp against. Must not be NULL. + * @util: The util value to clamp. + * @p: The task to clamp against. Can be NULL if you want to clamp + * against @rq only. + * + * Clamps the passed @util to the max(@rq, @p) effective uclamp values. + * + * If sched_uclamp_used static key is disabled, then just return the util + * without any clamping since uclamp aggregation at the rq level in the fast + * path is disabled, rendering this operation a NOP. + * + * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It + * will return the correct effective uclamp value of the task even if the + * static key is disabled. + */ static __always_inline unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, struct task_struct *p) { - unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); - unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + unsigned long min_util; + unsigned long max_util; + + if (!static_branch_likely(&sched_uclamp_used)) + return util; + + min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); + max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); if (p) { min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN)); @@ -2375,6 +2437,19 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, return clamp(util, min_util, max_util); } + +/* + * When uclamp is compiled in, the aggregation at rq level is 'turned off' + * by default in the fast path and only gets turned on once userspace performs + * an operation that requires it. + * + * Returns true if userspace opted-in to use uclamp and aggregation at rq level + * hence is active. + */ +static inline bool uclamp_is_used(void) +{ + return static_branch_likely(&sched_uclamp_used); +} #else /* CONFIG_UCLAMP_TASK */ static inline unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, @@ -2382,6 +2457,11 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, { return util; } + +static inline bool uclamp_is_used(void) +{ + return false; +} #endif /* CONFIG_UCLAMP_TASK */ #ifdef arch_scale_freq_capacity diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h new file mode 100644 index 000000000000..9620e323162c --- /dev/null +++ b/kernel/sched/smp.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Scheduler internal SMP callback types and methods between the scheduler + * and other internal parts of the core kernel: + */ + +extern void sched_ttwu_pending(void *arg); + +extern void send_call_function_single_ipi(int cpu); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 4c9e9975684f..394bc8126a1e 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -102,12 +102,6 @@ prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) BUG(); /* how!?, what priority? */ } -static unsigned int -get_rr_interval_stop(struct rq *rq, struct task_struct *task) -{ - return 0; -} - static void update_curr_stop(struct rq *rq) { } @@ -115,8 +109,8 @@ static void update_curr_stop(struct rq *rq) /* * Simple, special scheduling class for the per-CPU stop tasks: */ -const struct sched_class stop_sched_class = { - .next = &dl_sched_class, +const struct sched_class stop_sched_class + __attribute__((section("__stop_sched_class"))) = { .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, @@ -136,8 +130,6 @@ const struct sched_class stop_sched_class = { .task_tick = task_tick_stop, - .get_rr_interval = get_rr_interval_stop, - .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, .update_curr = update_curr_stop, diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8344757bba6e..007b0a6b0152 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -33,14 +33,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_clear(groupmask); printk(KERN_DEBUG "%*s domain-%d: ", level, "", level); - - if (!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not load-balance\n"); - if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); - return -1; - } - printk(KERN_CONT "span=%*pbl level=%s\n", cpumask_pr_args(sched_domain_span(sd)), sd->name); @@ -151,8 +143,7 @@ static int sd_degenerate(struct sched_domain *sd) return 1; /* Following flags need at least 2 groups */ - if (sd->flags & (SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | + if (sd->flags & (SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | @@ -183,15 +174,14 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) /* Flags needing groups don't count if only 1 group in parent */ if (parent->groups == parent->groups->next) { - pflags &= ~(SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_ASYM_CPUCAPACITY | - SD_SHARE_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING | - SD_SHARE_POWERDOMAIN); + pflags &= ~(SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC | + SD_ASYM_CPUCAPACITY | + SD_SHARE_CPUCAPACITY | + SD_SHARE_PKG_RESOURCES | + SD_PREFER_SIBLING | + SD_SHARE_POWERDOMAIN); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -209,7 +199,7 @@ bool sched_energy_update; #ifdef CONFIG_PROC_SYSCTL int sched_energy_aware_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret, state; @@ -282,10 +272,10 @@ static void perf_domain_debug(const struct cpumask *cpu_map, printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map)); while (pd) { - printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }", + printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }", cpumask_first(perf_domain_span(pd)), cpumask_pr_args(perf_domain_span(pd)), - em_pd_nr_cap_states(pd->em_pd)); + em_pd_nr_perf_states(pd->em_pd)); pd = pd->next; } @@ -323,26 +313,26 @@ static void sched_energy_set(bool has_eas) * * The complexity of the Energy Model is defined as: * - * C = nr_pd * (nr_cpus + nr_cs) + * C = nr_pd * (nr_cpus + nr_ps) * * with parameters defined as: * - nr_pd: the number of performance domains * - nr_cpus: the number of CPUs - * - nr_cs: the sum of the number of capacity states of all performance + * - nr_ps: the sum of the number of performance states of all performance * domains (for example, on a system with 2 performance domains, - * with 10 capacity states each, nr_cs = 2 * 10 = 20). + * with 10 performance states each, nr_ps = 2 * 10 = 20). * * It is generally not a good idea to use such a model in the wake-up path on * very complex platforms because of the associated scheduling overheads. The * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs - * with per-CPU DVFS and less than 8 capacity states each, for example. + * with per-CPU DVFS and less than 8 performance states each, for example. */ #define EM_MAX_COMPLEXITY 2048 extern struct cpufreq_governor schedutil_gov; static bool build_perf_domains(const struct cpumask *cpu_map) { - int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map); + int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map); struct perf_domain *pd = NULL, *tmp; int cpu = cpumask_first(cpu_map); struct root_domain *rd = cpu_rq(cpu)->rd; @@ -394,15 +384,15 @@ static bool build_perf_domains(const struct cpumask *cpu_map) pd = tmp; /* - * Count performance domains and capacity states for the + * Count performance domains and performance states for the * complexity check. */ nr_pd++; - nr_cs += em_pd_nr_cap_states(pd->em_pd); + nr_ps += em_pd_nr_perf_states(pd->em_pd); } /* Bail out if the Energy Model complexity is too high. */ - if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) { + if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) { WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n", cpumask_pr_args(cpu_map)); goto free; @@ -1338,7 +1328,7 @@ sd_init(struct sched_domain_topology_level *tl, sd_flags = (*tl->sd_flags)(); if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, "wrong sd_flags in topology description\n")) - sd_flags &= ~TOPOLOGY_SD_FLAGS; + sd_flags &= TOPOLOGY_SD_FLAGS; /* Apply detected topology flags */ sd_flags |= dflags; @@ -1351,8 +1341,7 @@ sd_init(struct sched_domain_topology_level *tl, .cache_nice_tries = 0, - .flags = 1*SD_LOAD_BALANCE - | 1*SD_BALANCE_NEWIDLE + .flags = 1*SD_BALANCE_NEWIDLE | 1*SD_BALANCE_EXEC | 1*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index ba059fbfc53a..01f5d3020589 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -389,7 +389,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i int ret = default_wake_function(wq_entry, mode, sync, key); if (ret) - list_del_init(&wq_entry->entry); + list_del_init_careful(&wq_entry->entry); return ret; } diff --git a/kernel/scs.c b/kernel/scs.c new file mode 100644 index 000000000000..5d4d9bbdec36 --- /dev/null +++ b/kernel/scs.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shadow Call Stack support. + * + * Copyright (C) 2019 Google LLC + */ + +#include <linux/kasan.h> +#include <linux/mm.h> +#include <linux/scs.h> +#include <linux/slab.h> +#include <linux/vmstat.h> + +static struct kmem_cache *scs_cache; + +static void __scs_account(void *s, int account) +{ + struct page *scs_page = virt_to_page(s); + + mod_zone_page_state(page_zone(scs_page), NR_KERNEL_SCS_KB, + account * (SCS_SIZE / SZ_1K)); +} + +static void *scs_alloc(int node) +{ + void *s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node); + + if (!s) + return NULL; + + *__scs_magic(s) = SCS_END_MAGIC; + + /* + * Poison the allocation to catch unintentional accesses to + * the shadow stack when KASAN is enabled. + */ + kasan_poison_object_data(scs_cache, s); + __scs_account(s, 1); + return s; +} + +static void scs_free(void *s) +{ + __scs_account(s, -1); + kasan_unpoison_object_data(scs_cache, s); + kmem_cache_free(scs_cache, s); +} + +void __init scs_init(void) +{ + scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, 0, 0, NULL); +} + +int scs_prepare(struct task_struct *tsk, int node) +{ + void *s = scs_alloc(node); + + if (!s) + return -ENOMEM; + + task_scs(tsk) = task_scs_sp(tsk) = s; + return 0; +} + +static void scs_check_usage(struct task_struct *tsk) +{ + static unsigned long highest; + + unsigned long *p, prev, curr = highest, used = 0; + + if (!IS_ENABLED(CONFIG_DEBUG_STACK_USAGE)) + return; + + for (p = task_scs(tsk); p < __scs_magic(tsk); ++p) { + if (!READ_ONCE_NOCHECK(*p)) + break; + used += sizeof(*p); + } + + while (used > curr) { + prev = cmpxchg_relaxed(&highest, curr, used); + + if (prev == curr) { + pr_info("%s (%d): highest shadow stack usage: %lu bytes\n", + tsk->comm, task_pid_nr(tsk), used); + break; + } + + curr = prev; + } +} + +void scs_release(struct task_struct *tsk) +{ + void *s = task_scs(tsk); + + if (!s) + return; + + WARN(task_scs_end_corrupted(tsk), + "corrupted shadow stack detected when freeing task\n"); + scs_check_usage(tsk); + scs_free(s); +} diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 55a6184f5990..3ee59ce0a323 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -13,6 +13,7 @@ * Mode 2 allows user-defined system call filters in the form * of Berkeley Packet Filters/Linux Socket Filters. */ +#define pr_fmt(fmt) "seccomp: " fmt #include <linux/refcount.h> #include <linux/audit.h> @@ -41,6 +42,15 @@ #include <linux/tracehook.h> #include <linux/uaccess.h> #include <linux/anon_inodes.h> +#include <linux/lockdep.h> + +/* + * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the + * wrong direction flag in the ioctl number. This is the broken one, + * which the kernel needs to keep supporting until all userspaces stop + * using the wrong command number. + */ +#define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR SECCOMP_IOR(2, __u64) enum notify_state { SECCOMP_NOTIFY_INIT, @@ -77,10 +87,42 @@ struct seccomp_knotif { long val; u32 flags; - /* Signals when this has entered SECCOMP_NOTIFY_REPLIED */ + /* + * Signals when this has changed states, such as the listener + * dying, a new seccomp addfd message, or changing to REPLIED + */ struct completion ready; struct list_head list; + + /* outstanding addfd requests */ + struct list_head addfd; +}; + +/** + * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages + * + * @file: A reference to the file to install in the other task + * @fd: The fd number to install it at. If the fd number is -1, it means the + * installing process should allocate the fd as normal. + * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC + * is allowed. + * @ret: The return value of the installing process. It is set to the fd num + * upon success (>= 0). + * @completion: Indicates that the installing process has completed fd + * installation, or gone away (either due to successful + * reply, or signal) + * + */ +struct seccomp_kaddfd { + struct file *file; + int fd; + unsigned int flags; + + /* To only be set on reply */ + int ret; + struct completion completion; + struct list_head list; }; /** @@ -94,27 +136,35 @@ struct seccomp_knotif { * filter->notify_lock. * @next_id: The id of the next request. * @notifications: A list of struct seccomp_knotif elements. - * @wqh: A wait queue for poll. */ struct notification { struct semaphore request; u64 next_id; struct list_head notifications; - wait_queue_head_t wqh; }; /** * struct seccomp_filter - container for seccomp BPF programs * - * @usage: reference count to manage the object lifetime. - * get/put helpers should be used when accessing an instance - * outside of a lifetime-guarded section. In general, this - * is only needed for handling filters shared across tasks. + * @refs: Reference count to manage the object lifetime. + * A filter's reference count is incremented for each directly + * attached task, once for the dependent filter, and if + * requested for the user notifier. When @refs reaches zero, + * the filter can be freed. + * @users: A filter's @users count is incremented for each directly + * attached task (filter installation, fork(), thread_sync), + * and once for the dependent filter (tracked in filter->prev). + * When it reaches zero it indicates that no direct or indirect + * users of that filter exist. No new tasks can get associated with + * this filter after reaching 0. The @users count is always smaller + * or equal to @refs. Hence, reaching 0 for @users does not mean + * the filter can be freed. * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate * @notif: the struct that holds all notification related information * @notify_lock: A lock for all notification-related accesses. + * @wqh: A wait queue for poll if a notifier is in use. * * seccomp_filter objects are organized in a tree linked via the @prev * pointer. For any task, it appears to be a singly-linked list starting @@ -124,15 +174,17 @@ struct notification { * how namespaces work. * * seccomp_filter objects should never be modified after being attached - * to a task_struct (other than @usage). + * to a task_struct (other than @refs). */ struct seccomp_filter { - refcount_t usage; + refcount_t refs; + refcount_t users; bool log; struct seccomp_filter *prev; struct bpf_prog *prog; struct notification *notif; struct mutex notify_lock; + wait_queue_head_t wqh; }; /* Limit any path through the tree to 256KB worth of instructions. */ @@ -366,6 +418,59 @@ static inline pid_t seccomp_can_sync_threads(void) return 0; } +static inline void seccomp_filter_free(struct seccomp_filter *filter) +{ + if (filter) { + bpf_prog_destroy(filter->prog); + kfree(filter); + } +} + +static void __seccomp_filter_orphan(struct seccomp_filter *orig) +{ + while (orig && refcount_dec_and_test(&orig->users)) { + if (waitqueue_active(&orig->wqh)) + wake_up_poll(&orig->wqh, EPOLLHUP); + orig = orig->prev; + } +} + +static void __put_seccomp_filter(struct seccomp_filter *orig) +{ + /* Clean up single-reference branches iteratively. */ + while (orig && refcount_dec_and_test(&orig->refs)) { + struct seccomp_filter *freeme = orig; + orig = orig->prev; + seccomp_filter_free(freeme); + } +} + +static void __seccomp_filter_release(struct seccomp_filter *orig) +{ + /* Notify about any unused filters in the task's former filter tree. */ + __seccomp_filter_orphan(orig); + /* Finally drop all references to the task's former tree. */ + __put_seccomp_filter(orig); +} + +/** + * seccomp_filter_release - Detach the task from its filter tree, + * drop its reference count, and notify + * about unused filters + * + * This function should only be called when the task is exiting as + * it detaches it from its filter tree. As such, READ_ONCE() and + * barriers are not needed here, as would normally be needed. + */ +void seccomp_filter_release(struct task_struct *tsk) +{ + struct seccomp_filter *orig = tsk->seccomp.filter; + + /* Detach task from its filter tree. */ + tsk->seccomp.filter = NULL; + __seccomp_filter_release(orig); +} + /** * seccomp_sync_threads: sets all threads to use current's filter * @@ -390,14 +495,19 @@ static inline void seccomp_sync_threads(unsigned long flags) /* Get a task reference for the new leaf node. */ get_seccomp_filter(caller); + /* * Drop the task reference to the shared ancestor since * current's path will hold a reference. (This also * allows a put before the assignment.) */ - put_seccomp_filter(thread); + __seccomp_filter_release(thread->seccomp.filter); + + /* Make our new filter tree visible. */ smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); + atomic_set(&thread->seccomp.filter_count, + atomic_read(&thread->seccomp.filter_count)); /* * Don't let an unprivileged task work around @@ -461,7 +571,9 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) return ERR_PTR(ret); } - refcount_set(&sfilter->usage, 1); + refcount_set(&sfilter->refs, 1); + refcount_set(&sfilter->users, 1); + init_waitqueue_head(&sfilter->wqh); return sfilter; } @@ -544,6 +656,7 @@ static long seccomp_attach_filter(unsigned int flags, */ filter->prev = current->seccomp.filter; current->seccomp.filter = filter; + atomic_inc(¤t->seccomp.filter_count); /* Now that the new filter is in place, synchronize to all threads. */ if (flags & SECCOMP_FILTER_FLAG_TSYNC) @@ -554,7 +667,7 @@ static long seccomp_attach_filter(unsigned int flags, static void __get_seccomp_filter(struct seccomp_filter *filter) { - refcount_inc(&filter->usage); + refcount_inc(&filter->refs); } /* get_seccomp_filter - increments the reference count of the filter on @tsk */ @@ -564,30 +677,7 @@ void get_seccomp_filter(struct task_struct *tsk) if (!orig) return; __get_seccomp_filter(orig); -} - -static inline void seccomp_filter_free(struct seccomp_filter *filter) -{ - if (filter) { - bpf_prog_destroy(filter->prog); - kfree(filter); - } -} - -static void __put_seccomp_filter(struct seccomp_filter *orig) -{ - /* Clean up single-reference branches iteratively. */ - while (orig && refcount_dec_and_test(&orig->usage)) { - struct seccomp_filter *freeme = orig; - orig = orig->prev; - seccomp_filter_free(freeme); - } -} - -/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ -void put_seccomp_filter(struct task_struct *tsk) -{ - __put_seccomp_filter(tsk->seccomp.filter); + refcount_inc(&orig->users); } static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason) @@ -684,20 +774,20 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, */ static const int mode1_syscalls[] = { __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, - 0, /* null terminated */ + -1, /* negative terminated */ }; static void __secure_computing_strict(int this_syscall) { - const int *syscall_whitelist = mode1_syscalls; + const int *allowed_syscalls = mode1_syscalls; #ifdef CONFIG_COMPAT if (in_compat_syscall()) - syscall_whitelist = get_compat_mode1_syscalls(); + allowed_syscalls = get_compat_mode1_syscalls(); #endif do { - if (*syscall_whitelist == this_syscall) + if (*allowed_syscalls == this_syscall) return; - } while (*++syscall_whitelist); + } while (*++allowed_syscalls != -1); #ifdef SECCOMP_DEBUG dump_stack(); @@ -735,6 +825,17 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter) return filter->notif->next_id++; } +static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd) +{ + /* + * Remove the notification, and reset the list pointers, indicating + * that it has been handled. + */ + list_del_init(&addfd->list); + addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags); + complete(&addfd->completion); +} + static int seccomp_do_user_notification(int this_syscall, struct seccomp_filter *match, const struct seccomp_data *sd) @@ -743,6 +844,7 @@ static int seccomp_do_user_notification(int this_syscall, u32 flags = 0; long ret = 0; struct seccomp_knotif n = {}; + struct seccomp_kaddfd *addfd, *tmp; mutex_lock(&match->notify_lock); err = -ENOSYS; @@ -755,25 +857,43 @@ static int seccomp_do_user_notification(int this_syscall, n.id = seccomp_next_notify_id(match); init_completion(&n.ready); list_add(&n.list, &match->notif->notifications); + INIT_LIST_HEAD(&n.addfd); up(&match->notif->request); - wake_up_poll(&match->notif->wqh, EPOLLIN | EPOLLRDNORM); + wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM); mutex_unlock(&match->notify_lock); /* * This is where we wait for a reply from userspace. */ +wait: err = wait_for_completion_interruptible(&n.ready); mutex_lock(&match->notify_lock); if (err == 0) { + /* Check if we were woken up by a addfd message */ + addfd = list_first_entry_or_null(&n.addfd, + struct seccomp_kaddfd, list); + if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) { + seccomp_handle_addfd(addfd); + mutex_unlock(&match->notify_lock); + goto wait; + } ret = n.val; err = n.error; flags = n.flags; } + /* If there were any pending addfd calls, clear them out */ + list_for_each_entry_safe(addfd, tmp, &n.addfd, list) { + /* The process went away before we got a chance to handle it */ + addfd->ret = -ESRCH; + list_del_init(&addfd->list); + complete(&addfd->completion); + } + /* * Note that it's possible the listener died in between the time when - * we were notified of a respons (or a signal) and when we were able to + * we were notified of a response (or a signal) and when we were able to * re-acquire the lock, so only delete from the list if the * notification actually exists. * @@ -1011,6 +1131,11 @@ static int seccomp_notify_release(struct inode *inode, struct file *file) knotif->error = -ENOSYS; knotif->val = 0; + /* + * We do not need to wake up any pending addfd messages, as + * the notifier will do that for us, as this just looks + * like a standard reply. + */ complete(&knotif->ready); } @@ -1021,6 +1146,23 @@ static int seccomp_notify_release(struct inode *inode, struct file *file) return 0; } +/* must be called with notif_lock held */ +static inline struct seccomp_knotif * +find_notification(struct seccomp_filter *filter, u64 id) +{ + struct seccomp_knotif *cur; + + lockdep_assert_held(&filter->notify_lock); + + list_for_each_entry(cur, &filter->notif->notifications, list) { + if (cur->id == id) + return cur; + } + + return NULL; +} + + static long seccomp_notify_recv(struct seccomp_filter *filter, void __user *buf) { @@ -1064,7 +1206,7 @@ static long seccomp_notify_recv(struct seccomp_filter *filter, unotif.data = *(knotif->data); knotif->state = SECCOMP_NOTIFY_SENT; - wake_up_poll(&filter->notif->wqh, EPOLLOUT | EPOLLWRNORM); + wake_up_poll(&filter->wqh, EPOLLOUT | EPOLLWRNORM); ret = 0; out: mutex_unlock(&filter->notify_lock); @@ -1078,15 +1220,8 @@ out: * may have died when we released the lock, so we need to make * sure it's still around. */ - knotif = NULL; mutex_lock(&filter->notify_lock); - list_for_each_entry(cur, &filter->notif->notifications, list) { - if (cur->id == unotif.id) { - knotif = cur; - break; - } - } - + knotif = find_notification(filter, unotif.id); if (knotif) { knotif->state = SECCOMP_NOTIFY_INIT; up(&filter->notif->request); @@ -1101,7 +1236,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter, void __user *buf) { struct seccomp_notif_resp resp = {}; - struct seccomp_knotif *knotif = NULL, *cur; + struct seccomp_knotif *knotif; long ret; if (copy_from_user(&resp, buf, sizeof(resp))) @@ -1118,13 +1253,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter, if (ret < 0) return ret; - list_for_each_entry(cur, &filter->notif->notifications, list) { - if (cur->id == resp.id) { - knotif = cur; - break; - } - } - + knotif = find_notification(filter, resp.id); if (!knotif) { ret = -ENOENT; goto out; @@ -1150,7 +1279,7 @@ out: static long seccomp_notify_id_valid(struct seccomp_filter *filter, void __user *buf) { - struct seccomp_knotif *knotif = NULL; + struct seccomp_knotif *knotif; u64 id; long ret; @@ -1161,17 +1290,109 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter, if (ret < 0) return ret; - ret = -ENOENT; - list_for_each_entry(knotif, &filter->notif->notifications, list) { - if (knotif->id == id) { - if (knotif->state == SECCOMP_NOTIFY_SENT) - ret = 0; - goto out; - } + knotif = find_notification(filter, id); + if (knotif && knotif->state == SECCOMP_NOTIFY_SENT) + ret = 0; + else + ret = -ENOENT; + + mutex_unlock(&filter->notify_lock); + return ret; +} + +static long seccomp_notify_addfd(struct seccomp_filter *filter, + struct seccomp_notif_addfd __user *uaddfd, + unsigned int size) +{ + struct seccomp_notif_addfd addfd; + struct seccomp_knotif *knotif; + struct seccomp_kaddfd kaddfd; + int ret; + + BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0); + BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST); + + if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE) + return -EINVAL; + + ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size); + if (ret) + return ret; + + if (addfd.newfd_flags & ~O_CLOEXEC) + return -EINVAL; + + if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD) + return -EINVAL; + + if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD)) + return -EINVAL; + + kaddfd.file = fget(addfd.srcfd); + if (!kaddfd.file) + return -EBADF; + + kaddfd.flags = addfd.newfd_flags; + kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ? + addfd.newfd : -1; + init_completion(&kaddfd.completion); + + ret = mutex_lock_interruptible(&filter->notify_lock); + if (ret < 0) + goto out; + + knotif = find_notification(filter, addfd.id); + if (!knotif) { + ret = -ENOENT; + goto out_unlock; } -out: + /* + * We do not want to allow for FD injection to occur before the + * notification has been picked up by a userspace handler, or after + * the notification has been replied to. + */ + if (knotif->state != SECCOMP_NOTIFY_SENT) { + ret = -EINPROGRESS; + goto out_unlock; + } + + list_add(&kaddfd.list, &knotif->addfd); + complete(&knotif->ready); + mutex_unlock(&filter->notify_lock); + + /* Now we wait for it to be processed or be interrupted */ + ret = wait_for_completion_interruptible(&kaddfd.completion); + if (ret == 0) { + /* + * We had a successful completion. The other side has already + * removed us from the addfd queue, and + * wait_for_completion_interruptible has a memory barrier upon + * success that lets us read this value directly without + * locking. + */ + ret = kaddfd.ret; + goto out; + } + + mutex_lock(&filter->notify_lock); + /* + * Even though we were woken up by a signal and not a successful + * completion, a completion may have happened in the mean time. + * + * We need to check again if the addfd request has been handled, + * and if not, we will remove it from the queue. + */ + if (list_empty(&kaddfd.list)) + ret = kaddfd.ret; + else + list_del(&kaddfd.list); + +out_unlock: mutex_unlock(&filter->notify_lock); +out: + fput(kaddfd.file); + return ret; } @@ -1181,13 +1402,22 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, struct seccomp_filter *filter = file->private_data; void __user *buf = (void __user *)arg; + /* Fixed-size ioctls */ switch (cmd) { case SECCOMP_IOCTL_NOTIF_RECV: return seccomp_notify_recv(filter, buf); case SECCOMP_IOCTL_NOTIF_SEND: return seccomp_notify_send(filter, buf); + case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR: case SECCOMP_IOCTL_NOTIF_ID_VALID: return seccomp_notify_id_valid(filter, buf); + } + + /* Extensible Argument ioctls */ +#define EA_IOCTL(cmd) ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK)) + switch (EA_IOCTL(cmd)) { + case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD): + return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd)); default: return -EINVAL; } @@ -1200,7 +1430,7 @@ static __poll_t seccomp_notify_poll(struct file *file, __poll_t ret = 0; struct seccomp_knotif *cur; - poll_wait(file, &filter->notif->wqh, poll_tab); + poll_wait(file, &filter->wqh, poll_tab); if (mutex_lock_interruptible(&filter->notify_lock) < 0) return EPOLLERR; @@ -1216,6 +1446,9 @@ static __poll_t seccomp_notify_poll(struct file *file, mutex_unlock(&filter->notify_lock); + if (refcount_read(&filter->users) == 0) + ret |= EPOLLHUP; + return ret; } @@ -1244,7 +1477,6 @@ static struct file *init_listener(struct seccomp_filter *filter) sema_init(&filter->notif->request, 0); filter->notif->next_id = get_random_u64(); INIT_LIST_HEAD(&filter->notif->notifications); - init_waitqueue_head(&filter->notif->wqh); ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops, filter, O_RDWR); @@ -1776,7 +2008,7 @@ static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged, } static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -1822,7 +2054,7 @@ static int __init seccomp_sysctl_init(void) hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table); if (!hdr) - pr_warn("seccomp: sysctl registration failed\n"); + pr_warn("sysctl registration failed\n"); else kmemleak_not_leak(hdr); diff --git a/kernel/signal.c b/kernel/signal.c index 284fc1600063..6f16f7c5d375 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -719,7 +719,7 @@ static int dequeue_synchronous_signal(kernel_siginfo_t *info) * Return the first synchronous signal in the queue. */ list_for_each_entry(q, &pending->list, list) { - /* Synchronous signals have a postive si_code */ + /* Synchronous signals have a positive si_code */ if ((q->info.si_code > SI_USER) && (sigmask(q->info.si_signo) & SYNCHRONOUS_MASK)) { sync = q; @@ -2529,9 +2529,6 @@ bool get_signal(struct ksignal *ksig) struct signal_struct *signal = current->signal; int signr; - if (unlikely(current->task_works)) - task_work_run(); - if (unlikely(uprobe_deny_signal())) return false; @@ -2544,6 +2541,13 @@ bool get_signal(struct ksignal *ksig) relock: spin_lock_irq(&sighand->siglock); + current->jobctl &= ~JOBCTL_TASK_WORK; + if (unlikely(current->task_works)) { + spin_unlock_irq(&sighand->siglock); + task_work_run(); + goto relock; + } + /* * Every stopped thread goes here after wakeup. Check to see if * we should notify the parent, prepare_signal(SIGCONT) encodes @@ -3235,94 +3239,94 @@ int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from) } #ifdef CONFIG_COMPAT -int copy_siginfo_to_user32(struct compat_siginfo __user *to, - const struct kernel_siginfo *from) -#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) -{ - return __copy_siginfo_to_user32(to, from, in_x32_syscall()); -} -int __copy_siginfo_to_user32(struct compat_siginfo __user *to, - const struct kernel_siginfo *from, bool x32_ABI) -#endif +/** + * copy_siginfo_to_external32 - copy a kernel siginfo into a compat user siginfo + * @to: compat siginfo destination + * @from: kernel siginfo source + * + * Note: This function does not work properly for the SIGCHLD on x32, but + * fortunately it doesn't have to. The only valid callers for this function are + * copy_siginfo_to_user32, which is overriden for x32 and the coredump code. + * The latter does not care because SIGCHLD will never cause a coredump. + */ +void copy_siginfo_to_external32(struct compat_siginfo *to, + const struct kernel_siginfo *from) { - struct compat_siginfo new; - memset(&new, 0, sizeof(new)); + memset(to, 0, sizeof(*to)); - new.si_signo = from->si_signo; - new.si_errno = from->si_errno; - new.si_code = from->si_code; + to->si_signo = from->si_signo; + to->si_errno = from->si_errno; + to->si_code = from->si_code; switch(siginfo_layout(from->si_signo, from->si_code)) { case SIL_KILL: - new.si_pid = from->si_pid; - new.si_uid = from->si_uid; + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; break; case SIL_TIMER: - new.si_tid = from->si_tid; - new.si_overrun = from->si_overrun; - new.si_int = from->si_int; + to->si_tid = from->si_tid; + to->si_overrun = from->si_overrun; + to->si_int = from->si_int; break; case SIL_POLL: - new.si_band = from->si_band; - new.si_fd = from->si_fd; + to->si_band = from->si_band; + to->si_fd = from->si_fd; break; case SIL_FAULT: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif break; case SIL_FAULT_MCEERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif - new.si_addr_lsb = from->si_addr_lsb; + to->si_addr_lsb = from->si_addr_lsb; break; case SIL_FAULT_BNDERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif - new.si_lower = ptr_to_compat(from->si_lower); - new.si_upper = ptr_to_compat(from->si_upper); + to->si_lower = ptr_to_compat(from->si_lower); + to->si_upper = ptr_to_compat(from->si_upper); break; case SIL_FAULT_PKUERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif - new.si_pkey = from->si_pkey; + to->si_pkey = from->si_pkey; break; case SIL_CHLD: - new.si_pid = from->si_pid; - new.si_uid = from->si_uid; - new.si_status = from->si_status; -#ifdef CONFIG_X86_X32_ABI - if (x32_ABI) { - new._sifields._sigchld_x32._utime = from->si_utime; - new._sifields._sigchld_x32._stime = from->si_stime; - } else -#endif - { - new.si_utime = from->si_utime; - new.si_stime = from->si_stime; - } + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; + to->si_status = from->si_status; + to->si_utime = from->si_utime; + to->si_stime = from->si_stime; break; case SIL_RT: - new.si_pid = from->si_pid; - new.si_uid = from->si_uid; - new.si_int = from->si_int; + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; + to->si_int = from->si_int; break; case SIL_SYS: - new.si_call_addr = ptr_to_compat(from->si_call_addr); - new.si_syscall = from->si_syscall; - new.si_arch = from->si_arch; + to->si_call_addr = ptr_to_compat(from->si_call_addr); + to->si_syscall = from->si_syscall; + to->si_arch = from->si_arch; break; } +} +int __copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + struct compat_siginfo new; + + copy_siginfo_to_external32(&new, from); if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) return -EFAULT; - return 0; } diff --git a/kernel/smp.c b/kernel/smp.c index 786092aabdcd..d0ae8eb6bf8b 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -22,11 +22,9 @@ #include <linux/hypervisor.h> #include "smpboot.h" +#include "sched/smp.h" -enum { - CSD_FLAG_LOCK = 0x01, - CSD_FLAG_SYNCHRONOUS = 0x02, -}; +#define CSD_TYPE(_csd) ((_csd)->flags & CSD_FLAG_TYPE_MASK) struct call_function_data { call_single_data_t __percpu *csd; @@ -84,6 +82,7 @@ int smpcfd_dying_cpu(unsigned int cpu) * still pending. */ flush_smp_call_function_queue(false); + irq_work_run(); return 0; } @@ -134,15 +133,33 @@ static __always_inline void csd_unlock(call_single_data_t *csd) static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); +void __smp_call_single_queue(int cpu, struct llist_node *node) +{ + /* + * The list addition should be visible before sending the IPI + * handler locks the list to pull the entry off it because of + * normal cache coherency rules implied by spinlocks. + * + * If IPIs can go out of order to the cache coherency protocol + * in an architecture, sufficient synchronisation should be added + * to arch code to make it appear to obey cache coherency WRT + * locking and barrier primitives. Generic code isn't really + * equipped to do the right thing... + */ + if (llist_add(node, &per_cpu(call_single_queue, cpu))) + send_call_function_single_ipi(cpu); +} + /* * Insert a previously allocated call_single_data_t element * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ -static int generic_exec_single(int cpu, call_single_data_t *csd, - smp_call_func_t func, void *info) +static int generic_exec_single(int cpu, call_single_data_t *csd) { if (cpu == smp_processor_id()) { + smp_call_func_t func = csd->func; + void *info = csd->info; unsigned long flags; /* @@ -156,28 +173,12 @@ static int generic_exec_single(int cpu, call_single_data_t *csd, return 0; } - if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) { csd_unlock(csd); return -ENXIO; } - csd->func = func; - csd->info = info; - - /* - * The list addition should be visible before sending the IPI - * handler locks the list to pull the entry off it because of - * normal cache coherency rules implied by spinlocks. - * - * If IPIs can go out of order to the cache coherency protocol - * in an architecture, sufficient synchronisation should be added - * to arch code to make it appear to obey cache coherency WRT - * locking and barrier primitives. Generic code isn't really - * equipped to do the right thing... - */ - if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) - arch_send_call_function_single_ipi(cpu); + __smp_call_single_queue(cpu, &csd->llist); return 0; } @@ -209,9 +210,9 @@ void generic_smp_call_function_single_interrupt(void) */ static void flush_smp_call_function_queue(bool warn_cpu_offline) { - struct llist_head *head; - struct llist_node *entry; call_single_data_t *csd, *csd_next; + struct llist_node *entry, *prev; + struct llist_head *head; static bool warned; lockdep_assert_irqs_disabled(); @@ -230,32 +231,99 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) * We don't have to use the _safe() variant here * because we are not invoking the IPI handlers yet. */ - llist_for_each_entry(csd, entry, llist) - pr_warn("IPI callback %pS sent to offline CPU\n", - csd->func); + llist_for_each_entry(csd, entry, llist) { + switch (CSD_TYPE(csd)) { + case CSD_TYPE_ASYNC: + case CSD_TYPE_SYNC: + case CSD_TYPE_IRQ_WORK: + pr_warn("IPI callback %pS sent to offline CPU\n", + csd->func); + break; + + case CSD_TYPE_TTWU: + pr_warn("IPI task-wakeup sent to offline CPU\n"); + break; + + default: + pr_warn("IPI callback, unknown type %d, sent to offline CPU\n", + CSD_TYPE(csd)); + break; + } + } } + /* + * First; run all SYNC callbacks, people are waiting for us. + */ + prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, llist) { - smp_call_func_t func = csd->func; - void *info = csd->info; - /* Do we wait until *after* callback? */ - if (csd->flags & CSD_FLAG_SYNCHRONOUS) { + if (CSD_TYPE(csd) == CSD_TYPE_SYNC) { + smp_call_func_t func = csd->func; + void *info = csd->info; + + if (prev) { + prev->next = &csd_next->llist; + } else { + entry = &csd_next->llist; + } + func(info); csd_unlock(csd); } else { - csd_unlock(csd); - func(info); + prev = &csd->llist; } } + if (!entry) + return; + /* - * Handle irq works queued remotely by irq_work_queue_on(). - * Smp functions above are typically synchronous so they - * better run first since some other CPUs may be busy waiting - * for them. + * Second; run all !SYNC callbacks. */ - irq_work_run(); + prev = NULL; + llist_for_each_entry_safe(csd, csd_next, entry, llist) { + int type = CSD_TYPE(csd); + + if (type != CSD_TYPE_TTWU) { + if (prev) { + prev->next = &csd_next->llist; + } else { + entry = &csd_next->llist; + } + + if (type == CSD_TYPE_ASYNC) { + smp_call_func_t func = csd->func; + void *info = csd->info; + + csd_unlock(csd); + func(info); + } else if (type == CSD_TYPE_IRQ_WORK) { + irq_work_single(csd); + } + + } else { + prev = &csd->llist; + } + } + + /* + * Third; only CSD_TYPE_TTWU is left, issue those. + */ + if (entry) + sched_ttwu_pending(entry); +} + +void flush_smp_call_function_from_idle(void) +{ + unsigned long flags; + + if (llist_empty(this_cpu_ptr(&call_single_queue))) + return; + + local_irq_save(flags); + flush_smp_call_function_queue(true); + local_irq_restore(flags); } /* @@ -271,7 +339,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, { call_single_data_t *csd; call_single_data_t csd_stack = { - .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS, + .flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, }; int this_cpu; int err; @@ -305,7 +373,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, csd_lock(csd); } - err = generic_exec_single(cpu, csd, func, info); + csd->func = func; + csd->info = info; + + err = generic_exec_single(cpu, csd); if (wait) csd_lock_wait(csd); @@ -351,7 +422,7 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd) csd->flags = CSD_FLAG_LOCK; smp_wmb(); - err = generic_exec_single(cpu, csd, csd->func, csd->info); + err = generic_exec_single(cpu, csd); out: preempt_enable(); @@ -466,7 +537,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, csd_lock(csd); if (wait) - csd->flags |= CSD_FLAG_SYNCHRONOUS; + csd->flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) @@ -563,8 +634,7 @@ static int __init nrcpus(char *str) { int nr_cpus; - get_option(&str, &nr_cpus); - if (nr_cpus > 0 && nr_cpus < nr_cpu_ids) + if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids) nr_cpu_ids = nr_cpus; return 0; @@ -620,7 +690,7 @@ void __init smp_init(void) * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead * of local_irq_disable/enable(). */ -void on_each_cpu(void (*func) (void *info), void *info, int wait) +void on_each_cpu(smp_call_func_t func, void *info, int wait) { unsigned long flags; diff --git a/kernel/softirq.c b/kernel/softirq.c index a47c6dd57452..bf88d7f62433 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -107,6 +107,12 @@ static bool ksoftirqd_running(unsigned long pending) * where hardirqs are disabled legitimately: */ #ifdef CONFIG_TRACE_IRQFLAGS + +DEFINE_PER_CPU(int, hardirqs_enabled); +DEFINE_PER_CPU(int, hardirq_context); +EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled); +EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context); + void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { unsigned long flags; @@ -224,7 +230,7 @@ static inline bool lockdep_softirq_start(void) { bool in_hardirq = false; - if (lockdep_hardirq_context(current)) { + if (lockdep_hardirq_context()) { in_hardirq = true; lockdep_hardirq_exit(); } @@ -339,12 +345,11 @@ asmlinkage __visible void do_softirq(void) local_irq_restore(flags); } -/* - * Enter an interrupt context. +/** + * irq_enter_rcu - Enter an interrupt context with RCU watching */ -void irq_enter(void) +void irq_enter_rcu(void) { - rcu_irq_enter(); if (is_idle_task(current) && !in_interrupt()) { /* * Prevent raise_softirq from needlessly waking up ksoftirqd @@ -354,10 +359,18 @@ void irq_enter(void) tick_irq_enter(); _local_bh_enable(); } - __irq_enter(); } +/** + * irq_enter - Enter an interrupt context including RCU update + */ +void irq_enter(void) +{ + rcu_irq_enter(); + irq_enter_rcu(); +} + static inline void invoke_softirq(void) { if (ksoftirqd_running(local_softirq_pending())) @@ -397,10 +410,7 @@ static inline void tick_irq_exit(void) #endif } -/* - * Exit an interrupt context. Process softirqs if needed and possible: - */ -void irq_exit(void) +static inline void __irq_exit_rcu(void) { #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED local_irq_disable(); @@ -413,6 +423,28 @@ void irq_exit(void) invoke_softirq(); tick_irq_exit(); +} + +/** + * irq_exit_rcu() - Exit an interrupt context without updating RCU + * + * Also processes softirqs if needed and possible. + */ +void irq_exit_rcu(void) +{ + __irq_exit_rcu(); + /* must be last! */ + lockdep_hardirq_exit(); +} + +/** + * irq_exit - Exit an interrupt context, update RCU and lockdep + * + * Also processes softirqs if needed and possible. + */ +void irq_exit(void) +{ + __irq_exit_rcu(); rcu_irq_exit(); /* must be last! */ lockdep_hardirq_exit(); @@ -521,7 +553,10 @@ static void tasklet_action_common(struct softirq_action *a, if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) BUG(); - t->func(t->data); + if (t->use_callback) + t->callback(t); + else + t->func(t->data); tasklet_unlock(t); continue; } @@ -547,6 +582,18 @@ static __latent_entropy void tasklet_hi_action(struct softirq_action *a) tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); } +void tasklet_setup(struct tasklet_struct *t, + void (*callback)(struct tasklet_struct *)) +{ + t->next = NULL; + t->state = 0; + atomic_set(&t->count, 0); + t->callback = callback; + t->use_callback = true; + t->data = 0; +} +EXPORT_SYMBOL(tasklet_setup); + void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data) { @@ -554,6 +601,7 @@ void tasklet_init(struct tasklet_struct *t, t->state = 0; atomic_set(&t->count, 0); t->func = func; + t->use_callback = false; t->data = data; } EXPORT_SYMBOL(tasklet_init); diff --git a/kernel/stackleak.c b/kernel/stackleak.c index b193a59fc05b..a8fc9ae1d03d 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -104,19 +104,9 @@ asmlinkage void notrace stackleak_erase(void) } NOKPROBE_SYMBOL(stackleak_erase); -void __used notrace stackleak_track_stack(void) +void __used __no_caller_saved_registers notrace stackleak_track_stack(void) { - /* - * N.B. stackleak_erase() fills the kernel stack with the poison value, - * which has the register width. That code assumes that the value - * of 'lowest_stack' is aligned on the register width boundary. - * - * That is true for x86 and x86_64 because of the kernel stack - * alignment on these platforms (for details, see 'cc_stack_align' in - * arch/x86/Makefile). Take care of that when you port STACKLEAK to - * new platforms. - */ - unsigned long sp = (unsigned long)&sp; + unsigned long sp = current_stack_pointer; /* * Having CONFIG_STACKLEAK_TRACK_MIN_SIZE larger than @@ -125,6 +115,8 @@ void __used notrace stackleak_track_stack(void) */ BUILD_BUG_ON(CONFIG_STACKLEAK_TRACK_MIN_SIZE > STACKLEAK_SEARCH_DEPTH); + /* 'lowest_stack' should be aligned on the register width boundary */ + sp = ALIGN(sp, sizeof(unsigned long)); if (sp < current->lowest_stack && sp >= (unsigned long)task_stack_page(current) + sizeof(unsigned long)) { diff --git a/kernel/sys.c b/kernel/sys.c index d325f3ab624a..ca11af9d815d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -393,6 +393,10 @@ long __sys_setregid(gid_t rgid, gid_t egid) new->sgid = new->egid; new->fsgid = new->egid; + retval = security_task_fix_setgid(new, old, LSM_SETID_RE); + if (retval < 0) + goto error; + return commit_creds(new); error: @@ -435,6 +439,10 @@ long __sys_setgid(gid_t gid) else goto error; + retval = security_task_fix_setgid(new, old, LSM_SETID_ID); + if (retval < 0) + goto error; + return commit_creds(new); error: @@ -756,6 +764,10 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) new->sgid = ksgid; new->fsgid = new->egid; + retval = security_task_fix_setgid(new, old, LSM_SETID_RES); + if (retval < 0) + goto error; + return commit_creds(new); error: @@ -862,7 +874,8 @@ long __sys_setfsgid(gid_t gid) ns_capable(old->user_ns, CAP_SETGID)) { if (!gid_eq(kgid, old->fsgid)) { new->fsgid = kgid; - goto change_okay; + if (security_task_fix_setgid(new,old,LSM_SETID_FS) == 0) + goto change_okay; } } @@ -1846,7 +1859,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (exe_file) { struct vm_area_struct *vma; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) { if (!vma->vm_file) continue; @@ -1855,7 +1868,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) goto exit_err; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); fput(exe_file); } @@ -1869,7 +1882,7 @@ exit: fdput(exe); return err; exit_err: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); fput(exe_file); goto exit; } @@ -1994,12 +2007,15 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.exe_fd != (u32)-1) { /* - * Make sure the caller has the rights to - * change /proc/pid/exe link: only local sys admin should - * be allowed to. + * Check if the current user is checkpoint/restore capable. + * At the time of this writing, it checks for CAP_SYS_ADMIN + * or CAP_CHECKPOINT_RESTORE. + * Note that a user with access to ptrace can masquerade an + * arbitrary program as any executable, even setuid ones. + * This may have implications in the tomoyo subsystem. */ - if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN)) - return -EINVAL; + if (!checkpoint_restore_ns_capable(current_user_ns())) + return -EPERM; error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); if (error) @@ -2007,10 +2023,10 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data } /* - * arg_lock protects concurent updates but we still need mmap_sem for + * arg_lock protects concurent updates but we still need mmap_lock for * read to exclude races with sys_brk. */ - down_read(&mm->mmap_sem); + mmap_read_lock(mm); /* * We don't validate if these members are pointing to @@ -2049,7 +2065,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ @@ -2122,10 +2138,10 @@ static int prctl_set_mm(int opt, unsigned long addr, /* * arg_lock protects concurent updates of arg boundaries, we need - * mmap_sem for a) concurrent sys_brk, b) finding VMA for addr + * mmap_lock for a) concurrent sys_brk, b) finding VMA for addr * validation. */ - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_vma(mm, addr); spin_lock(&mm->arg_lock); @@ -2217,7 +2233,7 @@ static int prctl_set_mm(int opt, unsigned long addr, error = 0; out: spin_unlock(&mm->arg_lock); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return error; } @@ -2262,7 +2278,7 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, return -EINVAL; } -#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE) +#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) @@ -2442,13 +2458,13 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_THP_DISABLE: if (arg3 || arg4 || arg5) return -EINVAL; - if (down_write_killable(&me->mm->mmap_sem)) + if (mmap_write_lock_killable(me->mm)) return -EINTR; if (arg2) set_bit(MMF_DISABLE_THP, &me->mm->flags); else clear_bit(MMF_DISABLE_THP, &me->mm->flags); - up_write(&me->mm->mmap_sem); + mmap_write_unlock(me->mm); break; case PR_MPX_ENABLE_MANAGEMENT: case PR_MPX_DISABLE_MANAGEMENT: @@ -2634,6 +2650,7 @@ struct compat_sysinfo { COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) { struct sysinfo s; + struct compat_sysinfo s_32; do_sysinfo(&s); @@ -2658,23 +2675,23 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) s.freehigh >>= bitcount; } - if (!access_ok(info, sizeof(struct compat_sysinfo)) || - __put_user(s.uptime, &info->uptime) || - __put_user(s.loads[0], &info->loads[0]) || - __put_user(s.loads[1], &info->loads[1]) || - __put_user(s.loads[2], &info->loads[2]) || - __put_user(s.totalram, &info->totalram) || - __put_user(s.freeram, &info->freeram) || - __put_user(s.sharedram, &info->sharedram) || - __put_user(s.bufferram, &info->bufferram) || - __put_user(s.totalswap, &info->totalswap) || - __put_user(s.freeswap, &info->freeswap) || - __put_user(s.procs, &info->procs) || - __put_user(s.totalhigh, &info->totalhigh) || - __put_user(s.freehigh, &info->freehigh) || - __put_user(s.mem_unit, &info->mem_unit)) + memset(&s_32, 0, sizeof(s_32)); + s_32.uptime = s.uptime; + s_32.loads[0] = s.loads[0]; + s_32.loads[1] = s.loads[1]; + s_32.loads[2] = s.loads[2]; + s_32.totalram = s.totalram; + s_32.freeram = s.freeram; + s_32.sharedram = s.sharedram; + s_32.bufferram = s.bufferram; + s_32.totalswap = s.totalswap; + s_32.freeswap = s.freeswap; + s_32.procs = s.procs; + s_32.totalhigh = s.totalhigh; + s_32.freehigh = s.freehigh; + s_32.mem_unit = s.mem_unit; + if (copy_to_user(info, &s_32, sizeof(s_32))) return -EFAULT; - return 0; } #endif /* CONFIG_COMPAT */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a176d8727a3..1b4d2dc270a5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -68,6 +68,9 @@ #include <linux/bpf.h> #include <linux/mount.h> #include <linux/userfaultfd_k.h> +#include <linux/coredump.h> +#include <linux/latencytop.h> +#include <linux/pid.h> #include "../lib/kstrtox.h" @@ -103,22 +106,6 @@ #if defined(CONFIG_SYSCTL) -/* External variables not in a header file. */ -extern int suid_dumpable; -#ifdef CONFIG_COREDUMP -extern int core_uses_pid; -extern char core_pattern[]; -extern unsigned int core_pipe_limit; -#endif -extern int pid_max; -extern int pid_max_min, pid_max_max; -extern int percpu_pagelist_fraction; -extern int latencytop_enabled; -extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; -#ifndef CONFIG_MMU -extern int sysctl_nr_trim_pages; -#endif - /* Constants used for minimum and maximum */ #ifdef CONFIG_LOCKUP_DETECTOR static int sixty = 60; @@ -131,6 +118,7 @@ static unsigned long zero_ul; static unsigned long one_ul = 1; static unsigned long long_max = LONG_MAX; static int one_hundred = 100; +static int two_hundred = 200; static int one_thousand = 1000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; @@ -160,24 +148,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); #ifdef CONFIG_INOTIFY_USER #include <linux/inotify.h> #endif -#ifdef CONFIG_SPARC -#endif - -#ifdef CONFIG_PARISC -extern int pwrsw_enabled; -#endif - -#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW -extern int unaligned_enabled; -#endif - -#ifdef CONFIG_IA64 -extern int unaligned_dump_stack; -#endif - -#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN -extern int no_unaligned_warning; -#endif #ifdef CONFIG_PROC_SYSCTL @@ -207,102 +177,1480 @@ enum sysctl_writes_mode { }; static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; +#endif /* CONFIG_PROC_SYSCTL */ + +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) +int sysctl_legacy_va_layout; +#endif + +#ifdef CONFIG_SCHED_DEBUG +static int min_sched_granularity_ns = 100000; /* 100 usecs */ +static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ +static int min_wakeup_granularity_ns; /* 0 usecs */ +static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ +#ifdef CONFIG_SMP +static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; +#endif /* CONFIG_SMP */ +#endif /* CONFIG_SCHED_DEBUG */ -static int proc_do_cad_pid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_taint(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); #ifdef CONFIG_COMPACTION -static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos); +static int min_extfrag_threshold; +static int max_extfrag_threshold = 1000; #endif + +#endif /* CONFIG_SYSCTL */ + +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) +static int bpf_stats_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct static_key *key = (struct static_key *)table->data; + static int saved_val; + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&bpf_stats_enabled_mutex); + val = saved_val; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret && val != saved_val) { + if (val) + static_key_slow_inc(key); + else + static_key_slow_dec(key); + saved_val = val; + } + mutex_unlock(&bpf_stats_enabled_mutex); + return ret; +} #endif +/* + * /proc/sys support + */ + +#ifdef CONFIG_PROC_SYSCTL + +static int _proc_do_string(char *data, int maxlen, int write, + char *buffer, size_t *lenp, loff_t *ppos) +{ + size_t len; + char c, *p; + + if (!data || !maxlen || !*lenp) { + *lenp = 0; + return 0; + } + + if (write) { + if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) { + /* Only continue writes not past the end of buffer. */ + len = strlen(data); + if (len > maxlen - 1) + len = maxlen - 1; + + if (*ppos > len) + return 0; + len = *ppos; + } else { + /* Start writing from beginning of buffer. */ + len = 0; + } + + *ppos += *lenp; + p = buffer; + while ((p - buffer) < *lenp && len < maxlen - 1) { + c = *(p++); + if (c == 0 || c == '\n') + break; + data[len++] = c; + } + data[len] = 0; + } else { + len = strlen(data); + if (len > maxlen) + len = maxlen; + + if (*ppos > len) { + *lenp = 0; + return 0; + } + + data += *ppos; + len -= *ppos; + + if (len > *lenp) + len = *lenp; + if (len) + memcpy(buffer, data, len); + if (len < *lenp) { + buffer[len] = '\n'; + len++; + } + *lenp = len; + *ppos += len; + } + return 0; +} + +static void warn_sysctl_write(struct ctl_table *table) +{ + pr_warn_once("%s wrote to %s when file position was not 0!\n" + "This will not be supported in the future. To silence this\n" + "warning, set kernel.sysctl_writes_strict = -1\n", + current->comm, table->procname); +} + +/** + * proc_first_pos_non_zero_ignore - check if first position is allowed + * @ppos: file position + * @table: the sysctl table + * + * Returns true if the first position is non-zero and the sysctl_writes_strict + * mode indicates this is not allowed for numeric input types. String proc + * handlers can ignore the return value. + */ +static bool proc_first_pos_non_zero_ignore(loff_t *ppos, + struct ctl_table *table) +{ + if (!*ppos) + return false; + + switch (sysctl_writes_strict) { + case SYSCTL_WRITES_STRICT: + return true; + case SYSCTL_WRITES_WARN: + warn_sysctl_write(table); + return false; + default: + return false; + } +} + +/** + * proc_dostring - read a string sysctl + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes a string from/to the user buffer. If the kernel + * buffer provided is not large enough to hold the string, the + * string is truncated. The copied string is %NULL-terminated. + * If the string is being read by the user process, it is copied + * and a newline '\n' is added. It is truncated if the buffer is + * not large enough. + * + * Returns 0 on success. + */ +int proc_dostring(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + if (write) + proc_first_pos_non_zero_ignore(ppos, table); + + return _proc_do_string(table->data, table->maxlen, write, buffer, lenp, + ppos); +} + +static size_t proc_skip_spaces(char **buf) +{ + size_t ret; + char *tmp = skip_spaces(*buf); + ret = tmp - *buf; + *buf = tmp; + return ret; +} + +static void proc_skip_char(char **buf, size_t *size, const char v) +{ + while (*size) { + if (**buf != v) + break; + (*size)--; + (*buf)++; + } +} + +/** + * strtoul_lenient - parse an ASCII formatted integer from a buffer and only + * fail on overflow + * + * @cp: kernel buffer containing the string to parse + * @endp: pointer to store the trailing characters + * @base: the base to use + * @res: where the parsed integer will be stored + * + * In case of success 0 is returned and @res will contain the parsed integer, + * @endp will hold any trailing characters. + * This function will fail the parse on overflow. If there wasn't an overflow + * the function will defer the decision what characters count as invalid to the + * caller. + */ +static int strtoul_lenient(const char *cp, char **endp, unsigned int base, + unsigned long *res) +{ + unsigned long long result; + unsigned int rv; + + cp = _parse_integer_fixup_radix(cp, &base); + rv = _parse_integer(cp, base, &result); + if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result)) + return -ERANGE; + + cp += rv; + + if (endp) + *endp = (char *)cp; + + *res = (unsigned long)result; + return 0; +} + +#define TMPBUFLEN 22 +/** + * proc_get_long - reads an ASCII formatted integer from a user buffer + * + * @buf: a kernel buffer + * @size: size of the kernel buffer + * @val: this is where the number will be stored + * @neg: set to %TRUE if number is negative + * @perm_tr: a vector which contains the allowed trailers + * @perm_tr_len: size of the perm_tr vector + * @tr: pointer to store the trailer character + * + * In case of success %0 is returned and @buf and @size are updated with + * the amount of bytes read. If @tr is non-NULL and a trailing + * character exists (size is non-zero after returning from this + * function), @tr is updated with the trailing character. + */ +static int proc_get_long(char **buf, size_t *size, + unsigned long *val, bool *neg, + const char *perm_tr, unsigned perm_tr_len, char *tr) +{ + int len; + char *p, tmp[TMPBUFLEN]; + + if (!*size) + return -EINVAL; + + len = *size; + if (len > TMPBUFLEN - 1) + len = TMPBUFLEN - 1; + + memcpy(tmp, *buf, len); + + tmp[len] = 0; + p = tmp; + if (*p == '-' && *size > 1) { + *neg = true; + p++; + } else + *neg = false; + if (!isdigit(*p)) + return -EINVAL; + + if (strtoul_lenient(p, &p, 0, val)) + return -EINVAL; + + len = p - tmp; + + /* We don't know if the next char is whitespace thus we may accept + * invalid integers (e.g. 1234...a) or two integers instead of one + * (e.g. 123...1). So lets not allow such large numbers. */ + if (len == TMPBUFLEN - 1) + return -EINVAL; + + if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len)) + return -EINVAL; + + if (tr && (len < *size)) + *tr = *p; + + *buf += len; + *size -= len; + + return 0; +} + +/** + * proc_put_long - converts an integer to a decimal ASCII formatted string + * + * @buf: the user buffer + * @size: the size of the user buffer + * @val: the integer to be converted + * @neg: sign of the number, %TRUE for negative + * + * In case of success @buf and @size are updated with the amount of bytes + * written. + */ +static void proc_put_long(void **buf, size_t *size, unsigned long val, bool neg) +{ + int len; + char tmp[TMPBUFLEN], *p = tmp; + + sprintf(p, "%s%lu", neg ? "-" : "", val); + len = strlen(tmp); + if (len > *size) + len = *size; + memcpy(*buf, tmp, len); + *size -= len; + *buf += len; +} +#undef TMPBUFLEN + +static void proc_put_char(void **buf, size_t *size, char c) +{ + if (*size) { + char **buffer = (char **)buf; + **buffer = c; + + (*size)--; + (*buffer)++; + *buf = *buffer; + } +} + +static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + if (*negp) { + if (*lvalp > (unsigned long) INT_MAX + 1) + return -EINVAL; + *valp = -*lvalp; + } else { + if (*lvalp > (unsigned long) INT_MAX) + return -EINVAL; + *valp = *lvalp; + } + } else { + int val = *valp; + if (val < 0) { + *negp = true; + *lvalp = -(unsigned long)val; + } else { + *negp = false; + *lvalp = (unsigned long)val; + } + } + return 0; +} + +static int do_proc_douintvec_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) +{ + if (write) { + if (*lvalp > UINT_MAX) + return -EINVAL; + *valp = *lvalp; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long)val; + } + return 0; +} + +static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; + +static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, + int write, void *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(bool *negp, unsigned long *lvalp, int *valp, + int write, void *data), + void *data) +{ + int *i, vleft, first = 1, err = 0; + size_t left; + char *p; + + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + i = (int *) tbl_data; + vleft = table->maxlen / sizeof(*i); + left = *lenp; + + if (!conv) + conv = do_proc_dointvec_conv; + + if (write) { + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto out; + + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + p = buffer; + } + + for (; left && vleft--; i++, first=0) { + unsigned long lval; + bool neg; + + if (write) { + left -= proc_skip_spaces(&p); + + if (!left) + break; + err = proc_get_long(&p, &left, &lval, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err) + break; + if (conv(&neg, &lval, i, 1, data)) { + err = -EINVAL; + break; + } + } else { + if (conv(&neg, &lval, i, 0, data)) { + err = -EINVAL; + break; + } + if (!first) + proc_put_char(&buffer, &left, '\t'); + proc_put_long(&buffer, &left, lval, neg); + } + } + + if (!write && !first && left && !err) + proc_put_char(&buffer, &left, '\n'); + if (write && !err && left) + left -= proc_skip_spaces(&p); + if (write && first) + return err ? : -EINVAL; + *lenp -= left; +out: + *ppos += *lenp; + return err; +} + +static int do_proc_dointvec(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(bool *negp, unsigned long *lvalp, int *valp, + int write, void *data), + void *data) +{ + return __do_proc_dointvec(table->data, table, write, + buffer, lenp, ppos, conv, data); +} + +static int do_proc_douintvec_w(unsigned int *tbl_data, + struct ctl_table *table, + void *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned long lval; + int err = 0; + size_t left; + bool neg; + char *p = buffer; + + left = *lenp; + + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto bail_early; + + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + + left -= proc_skip_spaces(&p); + if (!left) { + err = -EINVAL; + goto out_free; + } + + err = proc_get_long(&p, &left, &lval, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err || neg) { + err = -EINVAL; + goto out_free; + } + + if (conv(&lval, tbl_data, 1, data)) { + err = -EINVAL; + goto out_free; + } + + if (!err && left) + left -= proc_skip_spaces(&p); + +out_free: + if (err) + return -EINVAL; + + return 0; + + /* This is in keeping with old __do_proc_dointvec() */ +bail_early: + *ppos += *lenp; + return err; +} + +static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned long lval; + int err = 0; + size_t left; + + left = *lenp; + + if (conv(&lval, tbl_data, 0, data)) { + err = -EINVAL; + goto out; + } + + proc_put_long(&buffer, &left, lval, false); + if (!left) + goto out; + + proc_put_char(&buffer, &left, '\n'); + +out: + *lenp -= left; + *ppos += *lenp; + + return err; +} + +static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, + int write, void *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned int *i, vleft; + + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + i = (unsigned int *) tbl_data; + vleft = table->maxlen / sizeof(*i); + + /* + * Arrays are not supported, keep this simple. *Do not* add + * support for them. + */ + if (vleft != 1) { + *lenp = 0; + return -EINVAL; + } + + if (!conv) + conv = do_proc_douintvec_conv; + + if (write) + return do_proc_douintvec_w(i, table, buffer, lenp, ppos, + conv, data); + return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data); +} + +static int do_proc_douintvec(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + return __do_proc_douintvec(table->data, table, write, + buffer, lenp, ppos, conv, data); +} + +/** + * proc_dointvec - read a vector of integers + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * Returns 0 on success. + */ +int proc_dointvec(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); +} + +#ifdef CONFIG_COMPACTION +static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, + int write, void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, old; + + if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write) + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + old = *(int *)table->data; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret) + return ret; + if (old != *(int *)table->data) + pr_warn_once("sysctl attribute %s changed by %s[%d]\n", + table->procname, current->comm, + task_pid_nr(current)); + return ret; +} +#endif + +/** + * proc_douintvec - read a vector of unsigned integers + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer + * values from/to the user buffer, treated as an ASCII string. + * + * Returns 0 on success. + */ +int proc_douintvec(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_conv, NULL); +} + +/* + * Taint values can only be increased + * This means we can safely use a temporary. + */ +static int proc_taint(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + unsigned long tmptaint = get_taint(); + int err; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &tmptaint; + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + + if (write) { + int i; + + /* + * If we are relying on panic_on_taint not producing + * false positives due to userspace input, bail out + * before setting the requested taint flags. + */ + if (panic_on_taint_nousertaint && (tmptaint & panic_on_taint)) + return -EINVAL; + + /* + * Poor man's atomic or. Not worth adding a primitive + * to everyone's atomic.h for this + */ + for (i = 0; i < TAINT_FLAGS_COUNT; i++) + if ((1UL << i) & tmptaint) + add_taint(i, LOCKDEP_STILL_OK); + } + + return err; +} + #ifdef CONFIG_PRINTK static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos) +{ + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +#endif + +/** + * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure + * @min: pointer to minimum allowable value + * @max: pointer to maximum allowable value + * + * The do_proc_dointvec_minmax_conv_param structure provides the + * minimum and maximum values for doing range checking for those sysctl + * parameters that use the proc_dointvec_minmax() handler. + */ +struct do_proc_dointvec_minmax_conv_param { + int *min; + int *max; +}; + +static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + int tmp, ret; + struct do_proc_dointvec_minmax_conv_param *param = data; + /* + * If writing, first do so via a temporary local int so we can + * bounds-check it before touching *valp. + */ + int *ip = write ? &tmp : valp; + + ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data); + if (ret) + return ret; + + if (write) { + if ((param->min && *param->min > tmp) || + (param->max && *param->max < tmp)) + return -EINVAL; + *valp = tmp; + } + + return 0; +} + +/** + * proc_dointvec_minmax - read a vector of integers with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success or -EINVAL on write when the range check fails. + */ +int proc_dointvec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_dointvec_minmax_conv_param param = { + .min = (int *) table->extra1, + .max = (int *) table->extra2, + }; + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dointvec_minmax_conv, ¶m); +} + +/** + * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure + * @min: pointer to minimum allowable value + * @max: pointer to maximum allowable value + * + * The do_proc_douintvec_minmax_conv_param structure provides the + * minimum and maximum values for doing range checking for those sysctl + * parameters that use the proc_douintvec_minmax() handler. + */ +struct do_proc_douintvec_minmax_conv_param { + unsigned int *min; + unsigned int *max; +}; + +static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) +{ + int ret; + unsigned int tmp; + struct do_proc_douintvec_minmax_conv_param *param = data; + /* write via temporary local uint for bounds-checking */ + unsigned int *up = write ? &tmp : valp; + + ret = do_proc_douintvec_conv(lvalp, up, write, data); + if (ret) + return ret; + + if (write) { + if ((param->min && *param->min > tmp) || + (param->max && *param->max < tmp)) + return -ERANGE; + + *valp = tmp; + } + + return 0; +} + +/** + * proc_douintvec_minmax - read a vector of unsigned ints with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer + * values from/to the user buffer, treated as an ASCII string. Negative + * strings are not allowed. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). There is a final sanity + * check for UINT_MAX to avoid having to support wrap around uses from + * userspace. + * + * Returns 0 on success or -ERANGE on write when the range check fails. + */ +int proc_douintvec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_douintvec_minmax_conv_param param = { + .min = (unsigned int *) table->extra1, + .max = (unsigned int *) table->extra2, + }; + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_minmax_conv, ¶m); +} + +static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) +{ + if (write) { + unsigned int val; + + val = round_pipe_size(*lvalp); + if (val == 0) + return -EINVAL; + + *valp = val; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long) val; + } + + return 0; +} + +static int proc_dopipe_max_size(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_dopipe_max_size_conv, NULL); +} + +static void validate_coredump_safety(void) +{ +#ifdef CONFIG_COREDUMP + if (suid_dumpable == SUID_DUMP_ROOT && + core_pattern[0] != '/' && core_pattern[0] != '|') { + printk(KERN_WARNING +"Unsafe core_pattern used with fs.suid_dumpable=2.\n" +"Pipe handler or fully qualified core dump path required.\n" +"Set kernel.core_pattern before fs.suid_dumpable.\n" + ); + } #endif +} static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!error) + validate_coredump_safety(); + return error; +} + #ifdef CONFIG_COREDUMP static int proc_dostring_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dostring(table, write, buffer, lenp, ppos); + if (!error) + validate_coredump_safety(); + return error; +} #endif -static int proc_dopipe_max_size(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); #ifdef CONFIG_MAGIC_SYSRQ static int sysrq_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -#endif + void *buffer, size_t *lenp, loff_t *ppos) +{ + int tmp, ret; -static struct ctl_table kern_table[]; -static struct ctl_table vm_table[]; -static struct ctl_table fs_table[]; -static struct ctl_table debug_table[]; -static struct ctl_table dev_table[]; -extern struct ctl_table random_table[]; -#ifdef CONFIG_EPOLL -extern struct ctl_table epoll_table[]; -#endif + tmp = sysrq_mask(); -#ifdef CONFIG_FW_LOADER_USER_HELPER -extern struct ctl_table firmware_config_table[]; -#endif + ret = __do_proc_dointvec(&tmp, table, write, buffer, + lenp, ppos, NULL, NULL); + if (ret || !write) + return ret; -#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ - defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) -int sysctl_legacy_va_layout; + if (write) + sysrq_toggle_support(tmp); + + return 0; +} #endif -/* The default sysctl tables: */ +static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, + int write, void *buffer, size_t *lenp, loff_t *ppos, + unsigned long convmul, unsigned long convdiv) +{ + unsigned long *i, *min, *max; + int vleft, first = 1, err = 0; + size_t left; + char *p; -static struct ctl_table sysctl_base_table[] = { - { - .procname = "kernel", - .mode = 0555, - .child = kern_table, - }, - { - .procname = "vm", - .mode = 0555, - .child = vm_table, - }, - { - .procname = "fs", - .mode = 0555, - .child = fs_table, - }, - { - .procname = "debug", - .mode = 0555, - .child = debug_table, - }, - { - .procname = "dev", - .mode = 0555, - .child = dev_table, - }, - { } -}; + if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } -#ifdef CONFIG_SCHED_DEBUG -static int min_sched_granularity_ns = 100000; /* 100 usecs */ -static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ -static int min_wakeup_granularity_ns; /* 0 usecs */ -static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ -#ifdef CONFIG_SMP -static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; -static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; -#endif /* CONFIG_SMP */ -#endif /* CONFIG_SCHED_DEBUG */ + i = (unsigned long *) data; + min = (unsigned long *) table->extra1; + max = (unsigned long *) table->extra2; + vleft = table->maxlen / sizeof(unsigned long); + left = *lenp; -#ifdef CONFIG_COMPACTION -static int min_extfrag_threshold; -static int max_extfrag_threshold = 1000; -#endif + if (write) { + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto out; + + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + p = buffer; + } + + for (; left && vleft--; i++, first = 0) { + unsigned long val; + + if (write) { + bool neg; + + left -= proc_skip_spaces(&p); + if (!left) + break; + + err = proc_get_long(&p, &left, &val, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err) + break; + if (neg) + continue; + val = convmul * val / convdiv; + if ((min && val < *min) || (max && val > *max)) { + err = -EINVAL; + break; + } + *i = val; + } else { + val = convdiv * (*i) / convmul; + if (!first) + proc_put_char(&buffer, &left, '\t'); + proc_put_long(&buffer, &left, val, false); + } + } + + if (!write && !first && left && !err) + proc_put_char(&buffer, &left, '\n'); + if (write && !err) + left -= proc_skip_spaces(&p); + if (write && first) + return err ? : -EINVAL; + *lenp -= left; +out: + *ppos += *lenp; + return err; +} + +static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, + unsigned long convdiv) +{ + return __do_proc_doulongvec_minmax(table->data, table, write, + buffer, lenp, ppos, convmul, convdiv); +} + +/** + * proc_doulongvec_minmax - read a vector of long integers with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long + * values from/to the user buffer, treated as an ASCII string. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. + */ +int proc_doulongvec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); +} + +/** + * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long + * values from/to the user buffer, treated as an ASCII string. The values + * are treated as milliseconds, and converted to jiffies when they are stored. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. + */ +int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_doulongvec_minmax(table, write, buffer, + lenp, ppos, HZ, 1000l); +} + + +static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + if (*lvalp > INT_MAX / HZ) + return 1; + *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); + } else { + int val = *valp; + unsigned long lval; + if (val < 0) { + *negp = true; + lval = -(unsigned long)val; + } else { + *negp = false; + lval = (unsigned long)val; + } + *lvalp = lval / HZ; + } + return 0; +} + +static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ) + return 1; + *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); + } else { + int val = *valp; + unsigned long lval; + if (val < 0) { + *negp = true; + lval = -(unsigned long)val; + } else { + *negp = false; + lval = (unsigned long)val; + } + *lvalp = jiffies_to_clock_t(lval); + } + return 0; +} + +static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp); + + if (jif > INT_MAX) + return 1; + *valp = (int)jif; + } else { + int val = *valp; + unsigned long lval; + if (val < 0) { + *negp = true; + lval = -(unsigned long)val; + } else { + *negp = false; + lval = (unsigned long)val; + } + *lvalp = jiffies_to_msecs(lval); + } + return 0; +} + +/** + * proc_dointvec_jiffies - read a vector of integers as seconds + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in seconds, and are converted into + * jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_jiffies(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table,write,buffer,lenp,ppos, + do_proc_dointvec_jiffies_conv,NULL); +} + +/** + * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: pointer to the file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/USER_HZ seconds, and + * are converted into jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table,write,buffer,lenp,ppos, + do_proc_dointvec_userhz_jiffies_conv,NULL); +} + +/** + * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * @ppos: the current position in the file + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/1000 seconds, and + * are converted into jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dointvec_ms_jiffies_conv, NULL); +} + +static int proc_do_cad_pid(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + struct pid *new_pid; + pid_t tmp; + int r; + + tmp = pid_vnr(cad_pid); + + r = __do_proc_dointvec(&tmp, table, write, buffer, + lenp, ppos, NULL, NULL); + if (r || !write) + return r; + + new_pid = find_get_pid(tmp); + if (!new_pid) + return -ESRCH; + + put_pid(xchg(&cad_pid, new_pid)); + return 0; +} + +/** + * proc_do_large_bitmap - read/write from/to a large bitmap + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * The bitmap is stored at table->data and the bitmap length (in bits) + * in table->maxlen. + * + * We use a range comma separated format (e.g. 1,3-4,10-10) so that + * large bitmaps may be represented in a compact manner. Writing into + * the file will clear the bitmap then update it with the given input. + * + * Returns 0 on success. + */ +int proc_do_large_bitmap(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int err = 0; + bool first = 1; + size_t left = *lenp; + unsigned long bitmap_len = table->maxlen; + unsigned long *bitmap = *(unsigned long **) table->data; + unsigned long *tmp_bitmap = NULL; + char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; + + if (!bitmap || !bitmap_len || !left || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + if (write) { + char *p = buffer; + size_t skipped = 0; + + if (left > PAGE_SIZE - 1) { + left = PAGE_SIZE - 1; + /* How much of the buffer we'll skip this pass */ + skipped = *lenp - left; + } + + tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL); + if (!tmp_bitmap) + return -ENOMEM; + proc_skip_char(&p, &left, '\n'); + while (!err && left) { + unsigned long val_a, val_b; + bool neg; + size_t saved_left; + + /* In case we stop parsing mid-number, we can reset */ + saved_left = left; + err = proc_get_long(&p, &left, &val_a, &neg, tr_a, + sizeof(tr_a), &c); + /* + * If we consumed the entirety of a truncated buffer or + * only one char is left (may be a "-"), then stop here, + * reset, & come back for more. + */ + if ((left <= 1) && skipped) { + left = saved_left; + break; + } + + if (err) + break; + if (val_a >= bitmap_len || neg) { + err = -EINVAL; + break; + } + + val_b = val_a; + if (left) { + p++; + left--; + } + + if (c == '-') { + err = proc_get_long(&p, &left, &val_b, + &neg, tr_b, sizeof(tr_b), + &c); + /* + * If we consumed all of a truncated buffer or + * then stop here, reset, & come back for more. + */ + if (!left && skipped) { + left = saved_left; + break; + } + + if (err) + break; + if (val_b >= bitmap_len || neg || + val_a > val_b) { + err = -EINVAL; + break; + } + if (left) { + p++; + left--; + } + } + + bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1); + first = 0; + proc_skip_char(&p, &left, '\n'); + } + left += skipped; + } else { + unsigned long bit_a, bit_b = 0; + + while (left) { + bit_a = find_next_bit(bitmap, bitmap_len, bit_b); + if (bit_a >= bitmap_len) + break; + bit_b = find_next_zero_bit(bitmap, bitmap_len, + bit_a + 1) - 1; + + if (!first) + proc_put_char(&buffer, &left, ','); + proc_put_long(&buffer, &left, bit_a, false); + if (bit_a != bit_b) { + proc_put_char(&buffer, &left, '-'); + proc_put_long(&buffer, &left, bit_b, false); + } + + first = 0; bit_b++; + } + proc_put_char(&buffer, &left, '\n'); + } + + if (!err) { + if (write) { + if (*ppos) + bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); + else + bitmap_copy(bitmap, tmp_bitmap, bitmap_len); + } + *lenp -= left; + *ppos += *lenp; + } + + bitmap_free(tmp_bitmap); + return err; +} + +#else /* CONFIG_PROC_SYSCTL */ + +int proc_dostring(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_douintvec(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_douintvec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_jiffies(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_doulongvec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_do_large_bitmap(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +#endif /* CONFIG_PROC_SYSCTL */ + +#if defined(CONFIG_SYSCTL) +int proc_do_static_key(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct static_key *key = (struct static_key *)table->data; + static DEFINE_MUTEX(static_key_mutex); + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&static_key_mutex); + val = static_key_enabled(key); + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + if (val) + static_key_enable(key); + else + static_key_disable(key); + } + mutex_unlock(&static_key_mutex); + return ret; +} static struct ctl_table kern_table[] = { { @@ -432,6 +1780,20 @@ static struct ctl_table kern_table[] = { .proc_handler = sched_rt_handler, }, { + .procname = "sched_deadline_period_max_us", + .data = &sysctl_sched_dl_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_deadline_period_min_us", + .data = &sysctl_sched_dl_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "sched_rr_timeslice_ms", .data = &sysctl_sched_rr_timeslice, .maxlen = sizeof(int), @@ -453,6 +1815,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sysctl_sched_uclamp_handler, }, + { + .procname = "sched_util_clamp_min_rt_default", + .data = &sysctl_sched_uclamp_util_min_rt_default, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, #endif #ifdef CONFIG_SCHED_AUTOGROUP { @@ -613,7 +1982,7 @@ static struct ctl_table kern_table[] = { .procname = "soft-power", .data = &pwrsw_enabled, .maxlen = sizeof (int), - .mode = 0644, + .mode = 0644, .proc_handler = proc_dointvec, }, #endif @@ -801,6 +2170,17 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif +#ifdef CONFIG_SMP + { + .procname = "oops_all_cpu_backtrace", + .data = &sysctl_oops_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ { .procname = "pid_max", .data = &pid_max, @@ -994,30 +2374,32 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#if defined(CONFIG_X86) + +#if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \ + defined(CONFIG_DEBUG_STACKOVERFLOW) { - .procname = "panic_on_unrecovered_nmi", - .data = &panic_on_unrecovered_nmi, + .procname = "panic_on_stackoverflow", + .data = &sysctl_panic_on_stackoverflow, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, +#endif +#if defined(CONFIG_X86) { - .procname = "panic_on_io_nmi", - .data = &panic_on_io_nmi, + .procname = "panic_on_unrecovered_nmi", + .data = &panic_on_unrecovered_nmi, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, -#ifdef CONFIG_DEBUG_STACKOVERFLOW { - .procname = "panic_on_stackoverflow", - .data = &sysctl_panic_on_stackoverflow, + .procname = "panic_on_io_nmi", + .data = &panic_on_io_nmi, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, -#endif { .procname = "bootloader_type", .data = &bootloader_type, @@ -1072,7 +2454,7 @@ static struct ctl_table kern_table[] = { .procname = "ignore-unaligned-usertrap", .data = &no_unaligned_warning, .maxlen = sizeof (int), - .mode = 0644, + .mode = 0644, .proc_handler = proc_dointvec, }, #endif @@ -1086,6 +2468,17 @@ static struct ctl_table kern_table[] = { }, #endif #ifdef CONFIG_DETECT_HUNG_TASK +#ifdef CONFIG_SMP + { + .procname = "hung_task_all_cpu_backtrace", + .data = &sysctl_hung_task_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ { .procname = "hung_task_panic", .data = &sysctl_hung_task_panic, @@ -1244,7 +2637,7 @@ static struct ctl_table kern_table[] = { .data = &bpf_stats_enabled_key.key, .maxlen = sizeof(bpf_stats_enabled_key), .mode = 0644, - .proc_handler = proc_do_static_key, + .proc_handler = bpf_stats_handler, }, #endif #if defined(CONFIG_TREE_RCU) @@ -1320,7 +2713,7 @@ static struct ctl_table vm_table[] = { .proc_handler = overcommit_kbytes_handler, }, { - .procname = "page-cluster", + .procname = "page-cluster", .data = &page_cluster, .maxlen = sizeof(int), .mode = 0644, @@ -1391,7 +2784,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = &two_hundred, }, #ifdef CONFIG_HUGETLB_PAGE { @@ -1491,7 +2884,7 @@ static struct ctl_table vm_table[] = { .data = &watermark_boost_factor, .maxlen = sizeof(watermark_boost_factor), .mode = 0644, - .proc_handler = watermark_boost_factor_sysctl_handler, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { @@ -1959,6 +3352,35 @@ static struct ctl_table dev_table[] = { { } }; +static struct ctl_table sysctl_base_table[] = { + { + .procname = "kernel", + .mode = 0555, + .child = kern_table, + }, + { + .procname = "vm", + .mode = 0555, + .child = vm_table, + }, + { + .procname = "fs", + .mode = 0555, + .child = fs_table, + }, + { + .procname = "debug", + .mode = 0555, + .child = debug_table, + }, + { + .procname = "dev", + .mode = 0555, + .child = dev_table, + }, + { } +}; + int __init sysctl_init(void) { struct ctl_table_header *hdr; @@ -1967,1474 +3389,7 @@ int __init sysctl_init(void) kmemleak_not_leak(hdr); return 0; } - #endif /* CONFIG_SYSCTL */ - -/* - * /proc/sys support - */ - -#ifdef CONFIG_PROC_SYSCTL - -static int _proc_do_string(char *data, int maxlen, int write, - char __user *buffer, - size_t *lenp, loff_t *ppos) -{ - size_t len; - char __user *p; - char c; - - if (!data || !maxlen || !*lenp) { - *lenp = 0; - return 0; - } - - if (write) { - if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) { - /* Only continue writes not past the end of buffer. */ - len = strlen(data); - if (len > maxlen - 1) - len = maxlen - 1; - - if (*ppos > len) - return 0; - len = *ppos; - } else { - /* Start writing from beginning of buffer. */ - len = 0; - } - - *ppos += *lenp; - p = buffer; - while ((p - buffer) < *lenp && len < maxlen - 1) { - if (get_user(c, p++)) - return -EFAULT; - if (c == 0 || c == '\n') - break; - data[len++] = c; - } - data[len] = 0; - } else { - len = strlen(data); - if (len > maxlen) - len = maxlen; - - if (*ppos > len) { - *lenp = 0; - return 0; - } - - data += *ppos; - len -= *ppos; - - if (len > *lenp) - len = *lenp; - if (len) - if (copy_to_user(buffer, data, len)) - return -EFAULT; - if (len < *lenp) { - if (put_user('\n', buffer + len)) - return -EFAULT; - len++; - } - *lenp = len; - *ppos += len; - } - return 0; -} - -static void warn_sysctl_write(struct ctl_table *table) -{ - pr_warn_once("%s wrote to %s when file position was not 0!\n" - "This will not be supported in the future. To silence this\n" - "warning, set kernel.sysctl_writes_strict = -1\n", - current->comm, table->procname); -} - -/** - * proc_first_pos_non_zero_ignore - check if first position is allowed - * @ppos: file position - * @table: the sysctl table - * - * Returns true if the first position is non-zero and the sysctl_writes_strict - * mode indicates this is not allowed for numeric input types. String proc - * handlers can ignore the return value. - */ -static bool proc_first_pos_non_zero_ignore(loff_t *ppos, - struct ctl_table *table) -{ - if (!*ppos) - return false; - - switch (sysctl_writes_strict) { - case SYSCTL_WRITES_STRICT: - return true; - case SYSCTL_WRITES_WARN: - warn_sysctl_write(table); - return false; - default: - return false; - } -} - -/** - * proc_dostring - read a string sysctl - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes a string from/to the user buffer. If the kernel - * buffer provided is not large enough to hold the string, the - * string is truncated. The copied string is %NULL-terminated. - * If the string is being read by the user process, it is copied - * and a newline '\n' is added. It is truncated if the buffer is - * not large enough. - * - * Returns 0 on success. - */ -int proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - if (write) - proc_first_pos_non_zero_ignore(ppos, table); - - return _proc_do_string((char *)(table->data), table->maxlen, write, - (char __user *)buffer, lenp, ppos); -} - -static size_t proc_skip_spaces(char **buf) -{ - size_t ret; - char *tmp = skip_spaces(*buf); - ret = tmp - *buf; - *buf = tmp; - return ret; -} - -static void proc_skip_char(char **buf, size_t *size, const char v) -{ - while (*size) { - if (**buf != v) - break; - (*size)--; - (*buf)++; - } -} - -/** - * strtoul_lenient - parse an ASCII formatted integer from a buffer and only - * fail on overflow - * - * @cp: kernel buffer containing the string to parse - * @endp: pointer to store the trailing characters - * @base: the base to use - * @res: where the parsed integer will be stored - * - * In case of success 0 is returned and @res will contain the parsed integer, - * @endp will hold any trailing characters. - * This function will fail the parse on overflow. If there wasn't an overflow - * the function will defer the decision what characters count as invalid to the - * caller. - */ -static int strtoul_lenient(const char *cp, char **endp, unsigned int base, - unsigned long *res) -{ - unsigned long long result; - unsigned int rv; - - cp = _parse_integer_fixup_radix(cp, &base); - rv = _parse_integer(cp, base, &result); - if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result)) - return -ERANGE; - - cp += rv; - - if (endp) - *endp = (char *)cp; - - *res = (unsigned long)result; - return 0; -} - -#define TMPBUFLEN 22 -/** - * proc_get_long - reads an ASCII formatted integer from a user buffer - * - * @buf: a kernel buffer - * @size: size of the kernel buffer - * @val: this is where the number will be stored - * @neg: set to %TRUE if number is negative - * @perm_tr: a vector which contains the allowed trailers - * @perm_tr_len: size of the perm_tr vector - * @tr: pointer to store the trailer character - * - * In case of success %0 is returned and @buf and @size are updated with - * the amount of bytes read. If @tr is non-NULL and a trailing - * character exists (size is non-zero after returning from this - * function), @tr is updated with the trailing character. - */ -static int proc_get_long(char **buf, size_t *size, - unsigned long *val, bool *neg, - const char *perm_tr, unsigned perm_tr_len, char *tr) -{ - int len; - char *p, tmp[TMPBUFLEN]; - - if (!*size) - return -EINVAL; - - len = *size; - if (len > TMPBUFLEN - 1) - len = TMPBUFLEN - 1; - - memcpy(tmp, *buf, len); - - tmp[len] = 0; - p = tmp; - if (*p == '-' && *size > 1) { - *neg = true; - p++; - } else - *neg = false; - if (!isdigit(*p)) - return -EINVAL; - - if (strtoul_lenient(p, &p, 0, val)) - return -EINVAL; - - len = p - tmp; - - /* We don't know if the next char is whitespace thus we may accept - * invalid integers (e.g. 1234...a) or two integers instead of one - * (e.g. 123...1). So lets not allow such large numbers. */ - if (len == TMPBUFLEN - 1) - return -EINVAL; - - if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len)) - return -EINVAL; - - if (tr && (len < *size)) - *tr = *p; - - *buf += len; - *size -= len; - - return 0; -} - -/** - * proc_put_long - converts an integer to a decimal ASCII formatted string - * - * @buf: the user buffer - * @size: the size of the user buffer - * @val: the integer to be converted - * @neg: sign of the number, %TRUE for negative - * - * In case of success %0 is returned and @buf and @size are updated with - * the amount of bytes written. - */ -static int proc_put_long(void __user **buf, size_t *size, unsigned long val, - bool neg) -{ - int len; - char tmp[TMPBUFLEN], *p = tmp; - - sprintf(p, "%s%lu", neg ? "-" : "", val); - len = strlen(tmp); - if (len > *size) - len = *size; - if (copy_to_user(*buf, tmp, len)) - return -EFAULT; - *size -= len; - *buf += len; - return 0; -} -#undef TMPBUFLEN - -static int proc_put_char(void __user **buf, size_t *size, char c) -{ - if (*size) { - char __user **buffer = (char __user **)buf; - if (put_user(c, *buffer)) - return -EFAULT; - (*size)--, (*buffer)++; - *buf = *buffer; - } - return 0; -} - -static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - if (*negp) { - if (*lvalp > (unsigned long) INT_MAX + 1) - return -EINVAL; - *valp = -*lvalp; - } else { - if (*lvalp > (unsigned long) INT_MAX) - return -EINVAL; - *valp = *lvalp; - } - } else { - int val = *valp; - if (val < 0) { - *negp = true; - *lvalp = -(unsigned long)val; - } else { - *negp = false; - *lvalp = (unsigned long)val; - } - } - return 0; -} - -static int do_proc_douintvec_conv(unsigned long *lvalp, - unsigned int *valp, - int write, void *data) -{ - if (write) { - if (*lvalp > UINT_MAX) - return -EINVAL; - *valp = *lvalp; - } else { - unsigned int val = *valp; - *lvalp = (unsigned long)val; - } - return 0; -} - -static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; - -static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos, - int (*conv)(bool *negp, unsigned long *lvalp, int *valp, - int write, void *data), - void *data) -{ - int *i, vleft, first = 1, err = 0; - size_t left; - char *kbuf = NULL, *p; - - if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { - *lenp = 0; - return 0; - } - - i = (int *) tbl_data; - vleft = table->maxlen / sizeof(*i); - left = *lenp; - - if (!conv) - conv = do_proc_dointvec_conv; - - if (write) { - if (proc_first_pos_non_zero_ignore(ppos, table)) - goto out; - - if (left > PAGE_SIZE - 1) - left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - } - - for (; left && vleft--; i++, first=0) { - unsigned long lval; - bool neg; - - if (write) { - left -= proc_skip_spaces(&p); - - if (!left) - break; - err = proc_get_long(&p, &left, &lval, &neg, - proc_wspace_sep, - sizeof(proc_wspace_sep), NULL); - if (err) - break; - if (conv(&neg, &lval, i, 1, data)) { - err = -EINVAL; - break; - } - } else { - if (conv(&neg, &lval, i, 0, data)) { - err = -EINVAL; - break; - } - if (!first) - err = proc_put_char(&buffer, &left, '\t'); - if (err) - break; - err = proc_put_long(&buffer, &left, lval, neg); - if (err) - break; - } - } - - if (!write && !first && left && !err) - err = proc_put_char(&buffer, &left, '\n'); - if (write && !err && left) - left -= proc_skip_spaces(&p); - if (write) { - kfree(kbuf); - if (first) - return err ? : -EINVAL; - } - *lenp -= left; -out: - *ppos += *lenp; - return err; -} - -static int do_proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(bool *negp, unsigned long *lvalp, int *valp, - int write, void *data), - void *data) -{ - return __do_proc_dointvec(table->data, table, write, - buffer, lenp, ppos, conv, data); -} - -static int do_proc_douintvec_w(unsigned int *tbl_data, - struct ctl_table *table, - void __user *buffer, - size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data) -{ - unsigned long lval; - int err = 0; - size_t left; - bool neg; - char *kbuf = NULL, *p; - - left = *lenp; - - if (proc_first_pos_non_zero_ignore(ppos, table)) - goto bail_early; - - if (left > PAGE_SIZE - 1) - left = PAGE_SIZE - 1; - - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return -EINVAL; - - left -= proc_skip_spaces(&p); - if (!left) { - err = -EINVAL; - goto out_free; - } - - err = proc_get_long(&p, &left, &lval, &neg, - proc_wspace_sep, - sizeof(proc_wspace_sep), NULL); - if (err || neg) { - err = -EINVAL; - goto out_free; - } - - if (conv(&lval, tbl_data, 1, data)) { - err = -EINVAL; - goto out_free; - } - - if (!err && left) - left -= proc_skip_spaces(&p); - -out_free: - kfree(kbuf); - if (err) - return -EINVAL; - - return 0; - - /* This is in keeping with old __do_proc_dointvec() */ -bail_early: - *ppos += *lenp; - return err; -} - -static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, - size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data) -{ - unsigned long lval; - int err = 0; - size_t left; - - left = *lenp; - - if (conv(&lval, tbl_data, 0, data)) { - err = -EINVAL; - goto out; - } - - err = proc_put_long(&buffer, &left, lval, false); - if (err || !left) - goto out; - - err = proc_put_char(&buffer, &left, '\n'); - -out: - *lenp -= left; - *ppos += *lenp; - - return err; -} - -static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data) -{ - unsigned int *i, vleft; - - if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { - *lenp = 0; - return 0; - } - - i = (unsigned int *) tbl_data; - vleft = table->maxlen / sizeof(*i); - - /* - * Arrays are not supported, keep this simple. *Do not* add - * support for them. - */ - if (vleft != 1) { - *lenp = 0; - return -EINVAL; - } - - if (!conv) - conv = do_proc_douintvec_conv; - - if (write) - return do_proc_douintvec_w(i, table, buffer, lenp, ppos, - conv, data); - return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data); -} - -static int do_proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data) -{ - return __do_proc_douintvec(table->data, table, write, - buffer, lenp, ppos, conv, data); -} - -/** - * proc_dointvec - read a vector of integers - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * - * Returns 0 on success. - */ -int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); -} - -#ifdef CONFIG_COMPACTION -static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - int ret, old; - - if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write) - return proc_dointvec_minmax(table, write, buffer, lenp, ppos); - - old = *(int *)table->data; - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (ret) - return ret; - if (old != *(int *)table->data) - pr_warn_once("sysctl attribute %s changed by %s[%d]\n", - table->procname, current->comm, - task_pid_nr(current)); - return ret; -} -#endif - -/** - * proc_douintvec - read a vector of unsigned integers - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer - * values from/to the user buffer, treated as an ASCII string. - * - * Returns 0 on success. - */ -int proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_douintvec(table, write, buffer, lenp, ppos, - do_proc_douintvec_conv, NULL); -} - -/* - * Taint values can only be increased - * This means we can safely use a temporary. - */ -static int proc_taint(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table t; - unsigned long tmptaint = get_taint(); - int err; - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - t = *table; - t.data = &tmptaint; - err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); - if (err < 0) - return err; - - if (write) { - /* - * Poor man's atomic or. Not worth adding a primitive - * to everyone's atomic.h for this - */ - int i; - for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { - if ((tmptaint >> i) & 1) - add_taint(i, LOCKDEP_STILL_OK); - } - } - - return err; -} - -#ifdef CONFIG_PRINTK -static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - return proc_dointvec_minmax(table, write, buffer, lenp, ppos); -} -#endif - -/** - * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure - * @min: pointer to minimum allowable value - * @max: pointer to maximum allowable value - * - * The do_proc_dointvec_minmax_conv_param structure provides the - * minimum and maximum values for doing range checking for those sysctl - * parameters that use the proc_dointvec_minmax() handler. - */ -struct do_proc_dointvec_minmax_conv_param { - int *min; - int *max; -}; - -static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - int tmp, ret; - struct do_proc_dointvec_minmax_conv_param *param = data; - /* - * If writing, first do so via a temporary local int so we can - * bounds-check it before touching *valp. - */ - int *ip = write ? &tmp : valp; - - ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data); - if (ret) - return ret; - - if (write) { - if ((param->min && *param->min > tmp) || - (param->max && *param->max < tmp)) - return -EINVAL; - *valp = tmp; - } - - return 0; -} - -/** - * proc_dointvec_minmax - read a vector of integers with min/max values - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * - * This routine will ensure the values are within the range specified by - * table->extra1 (min) and table->extra2 (max). - * - * Returns 0 on success or -EINVAL on write when the range check fails. - */ -int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct do_proc_dointvec_minmax_conv_param param = { - .min = (int *) table->extra1, - .max = (int *) table->extra2, - }; - return do_proc_dointvec(table, write, buffer, lenp, ppos, - do_proc_dointvec_minmax_conv, ¶m); -} - -/** - * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure - * @min: pointer to minimum allowable value - * @max: pointer to maximum allowable value - * - * The do_proc_douintvec_minmax_conv_param structure provides the - * minimum and maximum values for doing range checking for those sysctl - * parameters that use the proc_douintvec_minmax() handler. - */ -struct do_proc_douintvec_minmax_conv_param { - unsigned int *min; - unsigned int *max; -}; - -static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, - unsigned int *valp, - int write, void *data) -{ - int ret; - unsigned int tmp; - struct do_proc_douintvec_minmax_conv_param *param = data; - /* write via temporary local uint for bounds-checking */ - unsigned int *up = write ? &tmp : valp; - - ret = do_proc_douintvec_conv(lvalp, up, write, data); - if (ret) - return ret; - - if (write) { - if ((param->min && *param->min > tmp) || - (param->max && *param->max < tmp)) - return -ERANGE; - - *valp = tmp; - } - - return 0; -} - -/** - * proc_douintvec_minmax - read a vector of unsigned ints with min/max values - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer - * values from/to the user buffer, treated as an ASCII string. Negative - * strings are not allowed. - * - * This routine will ensure the values are within the range specified by - * table->extra1 (min) and table->extra2 (max). There is a final sanity - * check for UINT_MAX to avoid having to support wrap around uses from - * userspace. - * - * Returns 0 on success or -ERANGE on write when the range check fails. - */ -int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct do_proc_douintvec_minmax_conv_param param = { - .min = (unsigned int *) table->extra1, - .max = (unsigned int *) table->extra2, - }; - return do_proc_douintvec(table, write, buffer, lenp, ppos, - do_proc_douintvec_minmax_conv, ¶m); -} - -static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, - unsigned int *valp, - int write, void *data) -{ - if (write) { - unsigned int val; - - val = round_pipe_size(*lvalp); - if (val == 0) - return -EINVAL; - - *valp = val; - } else { - unsigned int val = *valp; - *lvalp = (unsigned long) val; - } - - return 0; -} - -static int proc_dopipe_max_size(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_douintvec(table, write, buffer, lenp, ppos, - do_proc_dopipe_max_size_conv, NULL); -} - -static void validate_coredump_safety(void) -{ -#ifdef CONFIG_COREDUMP - if (suid_dumpable == SUID_DUMP_ROOT && - core_pattern[0] != '/' && core_pattern[0] != '|') { - printk(KERN_WARNING -"Unsafe core_pattern used with fs.suid_dumpable=2.\n" -"Pipe handler or fully qualified core dump path required.\n" -"Set kernel.core_pattern before fs.suid_dumpable.\n" - ); - } -#endif -} - -static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!error) - validate_coredump_safety(); - return error; -} - -#ifdef CONFIG_COREDUMP -static int proc_dostring_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int error = proc_dostring(table, write, buffer, lenp, ppos); - if (!error) - validate_coredump_safety(); - return error; -} -#endif - -#ifdef CONFIG_MAGIC_SYSRQ -static int sysrq_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int tmp, ret; - - tmp = sysrq_mask(); - - ret = __do_proc_dointvec(&tmp, table, write, buffer, - lenp, ppos, NULL, NULL); - if (ret || !write) - return ret; - - if (write) - sysrq_toggle_support(tmp); - - return 0; -} -#endif - -static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) -{ - unsigned long *i, *min, *max; - int vleft, first = 1, err = 0; - size_t left; - char *kbuf = NULL, *p; - - if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { - *lenp = 0; - return 0; - } - - i = (unsigned long *) data; - min = (unsigned long *) table->extra1; - max = (unsigned long *) table->extra2; - vleft = table->maxlen / sizeof(unsigned long); - left = *lenp; - - if (write) { - if (proc_first_pos_non_zero_ignore(ppos, table)) - goto out; - - if (left > PAGE_SIZE - 1) - left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - } - - for (; left && vleft--; i++, first = 0) { - unsigned long val; - - if (write) { - bool neg; - - left -= proc_skip_spaces(&p); - if (!left) - break; - - err = proc_get_long(&p, &left, &val, &neg, - proc_wspace_sep, - sizeof(proc_wspace_sep), NULL); - if (err) - break; - if (neg) - continue; - val = convmul * val / convdiv; - if ((min && val < *min) || (max && val > *max)) { - err = -EINVAL; - break; - } - *i = val; - } else { - val = convdiv * (*i) / convmul; - if (!first) { - err = proc_put_char(&buffer, &left, '\t'); - if (err) - break; - } - err = proc_put_long(&buffer, &left, val, false); - if (err) - break; - } - } - - if (!write && !first && left && !err) - err = proc_put_char(&buffer, &left, '\n'); - if (write && !err) - left -= proc_skip_spaces(&p); - if (write) { - kfree(kbuf); - if (first) - return err ? : -EINVAL; - } - *lenp -= left; -out: - *ppos += *lenp; - return err; -} - -static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) -{ - return __do_proc_doulongvec_minmax(table->data, table, write, - buffer, lenp, ppos, convmul, convdiv); -} - -/** - * proc_doulongvec_minmax - read a vector of long integers with min/max values - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long - * values from/to the user buffer, treated as an ASCII string. - * - * This routine will ensure the values are within the range specified by - * table->extra1 (min) and table->extra2 (max). - * - * Returns 0 on success. - */ -int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); -} - -/** - * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long - * values from/to the user buffer, treated as an ASCII string. The values - * are treated as milliseconds, and converted to jiffies when they are stored. - * - * This routine will ensure the values are within the range specified by - * table->extra1 (min) and table->extra2 (max). - * - * Returns 0 on success. - */ -int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return do_proc_doulongvec_minmax(table, write, buffer, - lenp, ppos, HZ, 1000l); -} - - -static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - if (*lvalp > INT_MAX / HZ) - return 1; - *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); - } else { - int val = *valp; - unsigned long lval; - if (val < 0) { - *negp = true; - lval = -(unsigned long)val; - } else { - *negp = false; - lval = (unsigned long)val; - } - *lvalp = lval / HZ; - } - return 0; -} - -static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ) - return 1; - *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); - } else { - int val = *valp; - unsigned long lval; - if (val < 0) { - *negp = true; - lval = -(unsigned long)val; - } else { - *negp = false; - lval = (unsigned long)val; - } - *lvalp = jiffies_to_clock_t(lval); - } - return 0; -} - -static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp); - - if (jif > INT_MAX) - return 1; - *valp = (int)jif; - } else { - int val = *valp; - unsigned long lval; - if (val < 0) { - *negp = true; - lval = -(unsigned long)val; - } else { - *negp = false; - lval = (unsigned long)val; - } - *lvalp = jiffies_to_msecs(lval); - } - return 0; -} - -/** - * proc_dointvec_jiffies - read a vector of integers as seconds - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in seconds, and are converted into - * jiffies. - * - * Returns 0 on success. - */ -int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_dointvec(table,write,buffer,lenp,ppos, - do_proc_dointvec_jiffies_conv,NULL); -} - -/** - * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: pointer to the file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in 1/USER_HZ seconds, and - * are converted into jiffies. - * - * Returns 0 on success. - */ -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_dointvec(table,write,buffer,lenp,ppos, - do_proc_dointvec_userhz_jiffies_conv,NULL); -} - -/** - * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * @ppos: the current position in the file - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in 1/1000 seconds, and - * are converted into jiffies. - * - * Returns 0 on success. - */ -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_dointvec(table, write, buffer, lenp, ppos, - do_proc_dointvec_ms_jiffies_conv, NULL); -} - -static int proc_do_cad_pid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct pid *new_pid; - pid_t tmp; - int r; - - tmp = pid_vnr(cad_pid); - - r = __do_proc_dointvec(&tmp, table, write, buffer, - lenp, ppos, NULL, NULL); - if (r || !write) - return r; - - new_pid = find_get_pid(tmp); - if (!new_pid) - return -ESRCH; - - put_pid(xchg(&cad_pid, new_pid)); - return 0; -} - -/** - * proc_do_large_bitmap - read/write from/to a large bitmap - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * The bitmap is stored at table->data and the bitmap length (in bits) - * in table->maxlen. - * - * We use a range comma separated format (e.g. 1,3-4,10-10) so that - * large bitmaps may be represented in a compact manner. Writing into - * the file will clear the bitmap then update it with the given input. - * - * Returns 0 on success. - */ -int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int err = 0; - bool first = 1; - size_t left = *lenp; - unsigned long bitmap_len = table->maxlen; - unsigned long *bitmap = *(unsigned long **) table->data; - unsigned long *tmp_bitmap = NULL; - char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; - - if (!bitmap || !bitmap_len || !left || (*ppos && !write)) { - *lenp = 0; - return 0; - } - - if (write) { - char *kbuf, *p; - size_t skipped = 0; - - if (left > PAGE_SIZE - 1) { - left = PAGE_SIZE - 1; - /* How much of the buffer we'll skip this pass */ - skipped = *lenp - left; - } - - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL); - if (!tmp_bitmap) { - kfree(kbuf); - return -ENOMEM; - } - proc_skip_char(&p, &left, '\n'); - while (!err && left) { - unsigned long val_a, val_b; - bool neg; - size_t saved_left; - - /* In case we stop parsing mid-number, we can reset */ - saved_left = left; - err = proc_get_long(&p, &left, &val_a, &neg, tr_a, - sizeof(tr_a), &c); - /* - * If we consumed the entirety of a truncated buffer or - * only one char is left (may be a "-"), then stop here, - * reset, & come back for more. - */ - if ((left <= 1) && skipped) { - left = saved_left; - break; - } - - if (err) - break; - if (val_a >= bitmap_len || neg) { - err = -EINVAL; - break; - } - - val_b = val_a; - if (left) { - p++; - left--; - } - - if (c == '-') { - err = proc_get_long(&p, &left, &val_b, - &neg, tr_b, sizeof(tr_b), - &c); - /* - * If we consumed all of a truncated buffer or - * then stop here, reset, & come back for more. - */ - if (!left && skipped) { - left = saved_left; - break; - } - - if (err) - break; - if (val_b >= bitmap_len || neg || - val_a > val_b) { - err = -EINVAL; - break; - } - if (left) { - p++; - left--; - } - } - - bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1); - first = 0; - proc_skip_char(&p, &left, '\n'); - } - kfree(kbuf); - left += skipped; - } else { - unsigned long bit_a, bit_b = 0; - - while (left) { - bit_a = find_next_bit(bitmap, bitmap_len, bit_b); - if (bit_a >= bitmap_len) - break; - bit_b = find_next_zero_bit(bitmap, bitmap_len, - bit_a + 1) - 1; - - if (!first) { - err = proc_put_char(&buffer, &left, ','); - if (err) - break; - } - err = proc_put_long(&buffer, &left, bit_a, false); - if (err) - break; - if (bit_a != bit_b) { - err = proc_put_char(&buffer, &left, '-'); - if (err) - break; - err = proc_put_long(&buffer, &left, bit_b, false); - if (err) - break; - } - - first = 0; bit_b++; - } - if (!err) - err = proc_put_char(&buffer, &left, '\n'); - } - - if (!err) { - if (write) { - if (*ppos) - bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); - else - bitmap_copy(bitmap, tmp_bitmap, bitmap_len); - } - *lenp -= left; - *ppos += *lenp; - } - - bitmap_free(tmp_bitmap); - return err; -} - -#else /* CONFIG_PROC_SYSCTL */ - -int proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -#endif /* CONFIG_PROC_SYSCTL */ - -#if defined(CONFIG_SYSCTL) -int proc_do_static_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - struct static_key *key = (struct static_key *)table->data; - static DEFINE_MUTEX(static_key_mutex); - int val, ret; - struct ctl_table tmp = { - .data = &val, - .maxlen = sizeof(val), - .mode = table->mode, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }; - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - mutex_lock(&static_key_mutex); - val = static_key_enabled(key); - ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); - if (write && !ret) { - if (val) - static_key_enable(key); - else - static_key_disable(key); - } - mutex_unlock(&static_key_mutex); - return ret; -} -#endif /* * No sense putting this after each symbol definition, twice, * exception granted :-) diff --git a/kernel/task_work.c b/kernel/task_work.c index 825f28259a19..5c0848ca1287 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -25,9 +25,10 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * 0 if succeeds or -ESRCH. */ int -task_work_add(struct task_struct *task, struct callback_head *work, bool notify) +task_work_add(struct task_struct *task, struct callback_head *work, int notify) { struct callback_head *head; + unsigned long flags; do { head = READ_ONCE(task->task_works); @@ -36,8 +37,19 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify) work->next = head; } while (cmpxchg(&task->task_works, head, work) != head); - if (notify) + switch (notify) { + case TWA_RESUME: set_notify_resume(task); + break; + case TWA_SIGNAL: + if (lock_task_sighand(task, &flags)) { + task->jobctl |= JOBCTL_TASK_WORK; + signal_wake_up(task, 0); + unlock_task_sighand(task, &flags); + } + break; + } + return 0; } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 7cb09c4cf21c..02441ead3c3b 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -928,14 +928,12 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) clocksource_arch_init(cs); -#ifdef CONFIG_GENERIC_VDSO_CLOCK_MODE if (cs->vdso_clock_mode < 0 || cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) { pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n", cs->name, cs->vdso_clock_mode); cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE; } -#endif /* Initialize mult/shift and max_idle_ns */ __clocksource_update_freq_scale(cs, scale, freq); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 53bce347cd50..afc65e6be33e 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -280,24 +280,24 @@ static void timens_put(struct ns_common *ns) put_time_ns(to_time_ns(ns)); } -static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) +void timens_commit(struct task_struct *tsk, struct time_namespace *ns) { + timens_set_vvar_page(tsk, ns); + vdso_join_timens(tsk, ns); +} + +static int timens_install(struct nsset *nsset, struct ns_common *new) +{ + struct nsproxy *nsproxy = nsset->nsproxy; struct time_namespace *ns = to_time_ns(new); - int err; if (!current_is_single_threaded()) return -EUSERS; if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; - timens_set_vvar_page(current, ns); - - err = vdso_join_timens(current, ns); - if (err) - return err; - get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; @@ -312,22 +312,17 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) { struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; struct time_namespace *ns = to_time_ns(nsc); - int err; /* create_new_namespaces() already incremented the ref counter */ if (nsproxy->time_ns == nsproxy->time_ns_for_children) return 0; - timens_set_vvar_page(tsk, ns); - - err = vdso_join_timens(tsk, ns); - if (err) - return err; - get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; + timens_commit(tsk, ns); + return 0; } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 2fd3b3fa68bf..165117996ea0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -47,85 +47,65 @@ void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) /* * Functions for validating access to tasks. */ -static struct task_struct *lookup_task(const pid_t pid, bool thread, - bool gettime) +static struct pid *pid_for_clock(const clockid_t clock, bool gettime) { - struct task_struct *p; + const bool thread = !!CPUCLOCK_PERTHREAD(clock); + const pid_t upid = CPUCLOCK_PID(clock); + struct pid *pid; + + if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX) + return NULL; /* * If the encoded PID is 0, then the timer is targeted at current * or the process to which current belongs. */ - if (!pid) - return thread ? current : current->group_leader; + if (upid == 0) + return thread ? task_pid(current) : task_tgid(current); - p = find_task_by_vpid(pid); - if (!p) - return p; - - if (thread) - return same_thread_group(p, current) ? p : NULL; + pid = find_vpid(upid); + if (!pid) + return NULL; - if (gettime) { - /* - * For clock_gettime(PROCESS) the task does not need to be - * the actual group leader. tsk->sighand gives - * access to the group's clock. - * - * Timers need the group leader because they take a - * reference on it and store the task pointer until the - * timer is destroyed. - */ - return (p == current || thread_group_leader(p)) ? p : NULL; + if (thread) { + struct task_struct *tsk = pid_task(pid, PIDTYPE_PID); + return (tsk && same_thread_group(tsk, current)) ? pid : NULL; } /* - * For processes require that p is group leader. + * For clock_gettime(PROCESS) allow finding the process by + * with the pid of the current task. The code needs the tgid + * of the process so that pid_task(pid, PIDTYPE_TGID) can be + * used to find the process. */ - return has_group_leader_pid(p) ? p : NULL; + if (gettime && (pid == task_pid(current))) + return task_tgid(current); + + /* + * For processes require that pid identifies a process. + */ + return pid_has_task(pid, PIDTYPE_TGID) ? pid : NULL; } -static struct task_struct *__get_task_for_clock(const clockid_t clock, - bool getref, bool gettime) +static inline int validate_clock_permissions(const clockid_t clock) { - const bool thread = !!CPUCLOCK_PERTHREAD(clock); - const pid_t pid = CPUCLOCK_PID(clock); - struct task_struct *p; - - if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX) - return NULL; + int ret; rcu_read_lock(); - p = lookup_task(pid, thread, gettime); - if (p && getref) - get_task_struct(p); + ret = pid_for_clock(clock, false) ? 0 : -EINVAL; rcu_read_unlock(); - return p; -} - -static inline struct task_struct *get_task_for_clock(const clockid_t clock) -{ - return __get_task_for_clock(clock, true, false); -} -static inline struct task_struct *get_task_for_clock_get(const clockid_t clock) -{ - return __get_task_for_clock(clock, true, true); -} - -static inline int validate_clock_permissions(const clockid_t clock) -{ - return __get_task_for_clock(clock, false, false) ? 0 : -EINVAL; + return ret; } -static inline enum pid_type cpu_timer_pid_type(struct k_itimer *timer) +static inline enum pid_type clock_pid_type(const clockid_t clock) { - return CPUCLOCK_PERTHREAD(timer->it_clock) ? PIDTYPE_PID : PIDTYPE_TGID; + return CPUCLOCK_PERTHREAD(clock) ? PIDTYPE_PID : PIDTYPE_TGID; } static inline struct task_struct *cpu_timer_task_rcu(struct k_itimer *timer) { - return pid_task(timer->it.cpu.pid, cpu_timer_pid_type(timer)); + return pid_task(timer->it.cpu.pid, clock_pid_type(timer->it_clock)); } /* @@ -373,15 +353,18 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) struct task_struct *tsk; u64 t; - tsk = get_task_for_clock_get(clock); - if (!tsk) + rcu_read_lock(); + tsk = pid_task(pid_for_clock(clock, true), clock_pid_type(clock)); + if (!tsk) { + rcu_read_unlock(); return -EINVAL; + } if (CPUCLOCK_PERTHREAD(clock)) t = cpu_clock_sample(clkid, tsk); else t = cpu_clock_sample_group(clkid, tsk, false); - put_task_struct(tsk); + rcu_read_unlock(); *tp = ns_to_timespec64(t); return 0; @@ -394,19 +377,19 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) */ static int posix_cpu_timer_create(struct k_itimer *new_timer) { - struct task_struct *p = get_task_for_clock(new_timer->it_clock); + struct pid *pid; - if (!p) + rcu_read_lock(); + pid = pid_for_clock(new_timer->it_clock, false); + if (!pid) { + rcu_read_unlock(); return -EINVAL; + } new_timer->kclock = &clock_posix_cpu; timerqueue_init(&new_timer->it.cpu.node); - new_timer->it.cpu.pid = get_task_pid(p, cpu_timer_pid_type(new_timer)); - /* - * get_task_for_clock() took a reference on @p. Drop it as the timer - * holds a reference on the pid of @p. - */ - put_task_struct(p); + new_timer->it.cpu.pid = get_pid(pid); + rcu_read_unlock(); return 0; } diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index fa3f800d7d76..0deaf4b79fb4 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -20,31 +20,6 @@ #include "timekeeping.h" /** - * struct clock_read_data - data required to read from sched_clock() - * - * @epoch_ns: sched_clock() value at last update - * @epoch_cyc: Clock cycle value at last update. - * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit - * clocks. - * @read_sched_clock: Current clock source (or dummy source when suspended). - * @mult: Multipler for scaled math conversion. - * @shift: Shift value for scaled math conversion. - * - * Care must be taken when updating this structure; it is read by - * some very hot code paths. It occupies <=40 bytes and, when combined - * with the seqcount used to synchronize access, comfortably fits into - * a 64 byte cache line. - */ -struct clock_read_data { - u64 epoch_ns; - u64 epoch_cyc; - u64 sched_clock_mask; - u64 (*read_sched_clock)(void); - u32 mult; - u32 shift; -}; - -/** * struct clock_data - all data needed for sched_clock() (including * registration of a new clock source) * @@ -93,6 +68,17 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) return (cyc * mult) >> shift; } +struct clock_read_data *sched_clock_read_begin(unsigned int *seq) +{ + *seq = raw_read_seqcount_latch(&cd.seq); + return cd.read_data + (*seq & 1); +} + +int sched_clock_read_retry(unsigned int seq) +{ + return read_seqcount_retry(&cd.seq, seq); +} + unsigned long long notrace sched_clock(void) { u64 cyc, res; @@ -100,13 +86,12 @@ unsigned long long notrace sched_clock(void) struct clock_read_data *rd; do { - seq = raw_read_seqcount(&cd.seq); - rd = cd.read_data + (seq & 1); + rd = sched_clock_read_begin(&seq); cyc = (rd->read_sched_clock() - rd->epoch_cyc) & rd->sched_clock_mask; res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); - } while (read_seqcount_retry(&cd.seq, seq)); + } while (sched_clock_read_retry(seq)); return res; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3e2dc9b8858c..f0199a4ba1ad 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -351,16 +351,24 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* - * Set a per-task tick dependency. Posix CPU timers need this in order to elapse - * per task timers. + * Set a per-task tick dependency. RCU need this. Also posix CPU timers + * in order to elapse per task timers. */ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) { - /* - * We could optimize this with just kicking the target running the task - * if that noise matters for nohz full users. - */ - tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit); + if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) { + if (tsk == current) { + preempt_disable(); + tick_nohz_full_kick(); + preempt_enable(); + } else { + /* + * Some future tick_nohz_full_kick_task() + * should optimize this. + */ + tick_nohz_full_kick_all(); + } + } } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9ebaab13339d..63a632f9896c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -953,7 +953,7 @@ EXPORT_SYMBOL_GPL(ktime_get_real_seconds); * but without the sequence counter protect. This internal function * is called just when timekeeping lock is already held. */ -time64_t __ktime_get_real_seconds(void) +noinstr time64_t __ktime_get_real_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; @@ -2193,7 +2193,7 @@ EXPORT_SYMBOL(ktime_get_coarse_ts64); void do_timer(unsigned long ticks) { jiffies_64 += ticks; - calc_global_load(ticks); + calc_global_load(); } /** diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a5221abb4594..ae5029f984a8 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -43,6 +43,7 @@ #include <linux/sched/debug.h> #include <linux/slab.h> #include <linux/compat.h> +#include <linux/random.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -156,7 +157,8 @@ EXPORT_SYMBOL(jiffies_64); /* * The time start value for each level to select the bucket at enqueue - * time. + * time. We start from the last possible delta of the previous level + * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()). */ #define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) @@ -203,8 +205,8 @@ struct timer_base { unsigned long clk; unsigned long next_expiry; unsigned int cpu; + bool next_expiry_recalc; bool is_idle; - bool must_forward_clk; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; @@ -249,8 +251,7 @@ void timers_update_nohz(void) } int timer_migration_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -488,71 +489,61 @@ static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) * Helper function to calculate the array index for a given expiry * time. */ -static inline unsigned calc_index(unsigned expires, unsigned lvl) +static inline unsigned calc_index(unsigned long expires, unsigned lvl, + unsigned long *bucket_expiry) { + + /* + * The timer wheel has to guarantee that a timer does not fire + * early. Early expiry can happen due to: + * - Timer is armed at the edge of a tick + * - Truncation of the expiry time in the outer wheel levels + * + * Round up with level granularity to prevent this. + */ expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); + *bucket_expiry = expires << LVL_SHIFT(lvl); return LVL_OFFS(lvl) + (expires & LVL_MASK); } -static int calc_wheel_index(unsigned long expires, unsigned long clk) +static int calc_wheel_index(unsigned long expires, unsigned long clk, + unsigned long *bucket_expiry) { unsigned long delta = expires - clk; unsigned int idx; if (delta < LVL_START(1)) { - idx = calc_index(expires, 0); + idx = calc_index(expires, 0, bucket_expiry); } else if (delta < LVL_START(2)) { - idx = calc_index(expires, 1); + idx = calc_index(expires, 1, bucket_expiry); } else if (delta < LVL_START(3)) { - idx = calc_index(expires, 2); + idx = calc_index(expires, 2, bucket_expiry); } else if (delta < LVL_START(4)) { - idx = calc_index(expires, 3); + idx = calc_index(expires, 3, bucket_expiry); } else if (delta < LVL_START(5)) { - idx = calc_index(expires, 4); + idx = calc_index(expires, 4, bucket_expiry); } else if (delta < LVL_START(6)) { - idx = calc_index(expires, 5); + idx = calc_index(expires, 5, bucket_expiry); } else if (delta < LVL_START(7)) { - idx = calc_index(expires, 6); + idx = calc_index(expires, 6, bucket_expiry); } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { - idx = calc_index(expires, 7); + idx = calc_index(expires, 7, bucket_expiry); } else if ((long) delta < 0) { idx = clk & LVL_MASK; + *bucket_expiry = clk; } else { /* * Force expire obscene large timeouts to expire at the * capacity limit of the wheel. */ - if (expires >= WHEEL_TIMEOUT_CUTOFF) - expires = WHEEL_TIMEOUT_MAX; + if (delta >= WHEEL_TIMEOUT_CUTOFF) + expires = clk + WHEEL_TIMEOUT_MAX; - idx = calc_index(expires, LVL_DEPTH - 1); + idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry); } return idx; } -/* - * Enqueue the timer into the hash bucket, mark it pending in - * the bitmap and store the index in the timer flags. - */ -static void enqueue_timer(struct timer_base *base, struct timer_list *timer, - unsigned int idx) -{ - hlist_add_head(&timer->entry, base->vectors + idx); - __set_bit(idx, base->pending_map); - timer_set_idx(timer, idx); - - trace_timer_start(timer, timer->expires, timer->flags); -} - -static void -__internal_add_timer(struct timer_base *base, struct timer_list *timer) -{ - unsigned int idx; - - idx = calc_wheel_index(timer->expires, base->clk); - enqueue_timer(base, timer, idx); -} - static void trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) { @@ -574,26 +565,48 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) * timer is not deferrable. If the other CPU is on the way to idle * then it can't set base->is_idle as we hold the base lock: */ - if (!base->is_idle) - return; + if (base->is_idle) + wake_up_nohz_cpu(base->cpu); +} - /* Check whether this is the new first expiring timer: */ - if (time_after_eq(timer->expires, base->next_expiry)) - return; +/* + * Enqueue the timer into the hash bucket, mark it pending in + * the bitmap, store the index in the timer flags then wake up + * the target CPU if needed. + */ +static void enqueue_timer(struct timer_base *base, struct timer_list *timer, + unsigned int idx, unsigned long bucket_expiry) +{ + + hlist_add_head(&timer->entry, base->vectors + idx); + __set_bit(idx, base->pending_map); + timer_set_idx(timer, idx); + + trace_timer_start(timer, timer->expires, timer->flags); /* - * Set the next expiry time and kick the CPU so it can reevaluate the - * wheel: + * Check whether this is the new first expiring timer. The + * effective expiry time of the timer is required here + * (bucket_expiry) instead of timer->expires. */ - base->next_expiry = timer->expires; - wake_up_nohz_cpu(base->cpu); + if (time_before(bucket_expiry, base->next_expiry)) { + /* + * Set the next expiry time and kick the CPU so it + * can reevaluate the wheel: + */ + base->next_expiry = bucket_expiry; + base->next_expiry_recalc = false; + trigger_dyntick_cpu(base, timer); + } } -static void -internal_add_timer(struct timer_base *base, struct timer_list *timer) +static void internal_add_timer(struct timer_base *base, struct timer_list *timer) { - __internal_add_timer(base, timer); - trigger_dyntick_cpu(base, timer); + unsigned long bucket_expiry; + unsigned int idx; + + idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry); + enqueue_timer(base, timer, idx, bucket_expiry); } #ifdef CONFIG_DEBUG_OBJECTS_TIMERS @@ -826,8 +839,10 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base, if (!timer_pending(timer)) return 0; - if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) + if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) { __clear_bit(idx, base->pending_map); + base->next_expiry_recalc = true; + } detach_timer(timer, clear_pending); return 1; @@ -877,31 +892,27 @@ get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { -#ifdef CONFIG_NO_HZ_COMMON - unsigned long jnow; + unsigned long jnow = READ_ONCE(jiffies); /* - * We only forward the base when we are idle or have just come out of - * idle (must_forward_clk logic), and have a delta between base clock - * and jiffies. In the common case, run_timers will take care of it. + * No need to forward if we are close enough below jiffies. + * Also while executing timers, base->clk is 1 offset ahead + * of jiffies to avoid endless requeuing to current jffies. */ - if (likely(!base->must_forward_clk)) - return; - - jnow = READ_ONCE(jiffies); - base->must_forward_clk = base->is_idle; - if ((long)(jnow - base->clk) < 2) + if ((long)(jnow - base->clk) < 1) return; /* * If the next expiry value is > jiffies, then we fast forward to * jiffies otherwise we forward to the next expiry value. */ - if (time_after(base->next_expiry, jnow)) + if (time_after(base->next_expiry, jnow)) { base->clk = jnow; - else + } else { + if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk))) + return; base->clk = base->next_expiry; -#endif + } } @@ -949,9 +960,9 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, static inline int __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) { + unsigned long clk = 0, flags, bucket_expiry; struct timer_base *base, *new_base; unsigned int idx = UINT_MAX; - unsigned long clk = 0, flags; int ret = 0; BUG_ON(!timer->function); @@ -990,7 +1001,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option } clk = base->clk; - idx = calc_wheel_index(expires, clk); + idx = calc_wheel_index(expires, clk, &bucket_expiry); /* * Retrieve and compare the array index of the pending @@ -1043,16 +1054,13 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option /* * If 'idx' was calculated above and the base time did not advance * between calculating 'idx' and possibly switching the base, only - * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise - * we need to (re)calculate the wheel index via - * internal_add_timer(). + * enqueue_timer() is required. Otherwise we need to (re)calculate + * the wheel index via internal_add_timer(). */ - if (idx != UINT_MAX && clk == base->clk) { - enqueue_timer(base, timer, idx); - trigger_dyntick_cpu(base, timer); - } else { + if (idx != UINT_MAX && clk == base->clk) + enqueue_timer(base, timer, idx, bucket_expiry); + else internal_add_timer(base, timer); - } out_unlock: raw_spin_unlock_irqrestore(&base->lock, flags); @@ -1455,10 +1463,10 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) } } -static int __collect_expired_timers(struct timer_base *base, - struct hlist_head *heads) +static int collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) { - unsigned long clk = base->clk; + unsigned long clk = base->clk = base->next_expiry; struct hlist_head *vec; int i, levels = 0; unsigned int idx; @@ -1480,7 +1488,6 @@ static int __collect_expired_timers(struct timer_base *base, return levels; } -#ifdef CONFIG_NO_HZ_COMMON /* * Find the next pending bucket of a level. Search from level start (@offset) * + @clk upwards and if nothing there, search from start of the level @@ -1513,6 +1520,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) clk = base->clk; for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { int pos = next_pending_bucket(base, offset, clk & LVL_MASK); + unsigned long lvl_clk = clk & LVL_CLK_MASK; if (pos >= 0) { unsigned long tmp = clk + (unsigned long) pos; @@ -1520,6 +1528,13 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) tmp <<= LVL_SHIFT(lvl); if (time_before(tmp, next)) next = tmp; + + /* + * If the next expiration happens before we reach + * the next level, no need to check further. + */ + if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK)) + break; } /* * Clock for the next level. If the current level clock lower @@ -1557,13 +1572,17 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) * So the simple check whether the lower bits of the current * level are 0 or not is sufficient for all cases. */ - adj = clk & LVL_CLK_MASK ? 1 : 0; + adj = lvl_clk ? 1 : 0; clk >>= LVL_CLK_SHIFT; clk += adj; } + + base->next_expiry_recalc = false; + return next; } +#ifdef CONFIG_NO_HZ_COMMON /* * Check, if the next hrtimer event is before the next timer wheel * event: @@ -1620,9 +1639,11 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) return expires; raw_spin_lock(&base->lock); - nextevt = __next_timer_interrupt(base); + if (base->next_expiry_recalc) + base->next_expiry = __next_timer_interrupt(base); + nextevt = base->next_expiry; is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); - base->next_expiry = nextevt; + /* * We have a fresh next event. Check whether we can forward the * base. We can only do that when @basej is past base->clk @@ -1648,10 +1669,8 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) * logic is only maintained for the BASE_STD base, deferrable * timers may still see large granularity skew (by design). */ - if ((expires - basem) > TICK_NSEC) { - base->must_forward_clk = true; + if ((expires - basem) > TICK_NSEC) base->is_idle = true; - } } raw_spin_unlock(&base->lock); @@ -1675,42 +1694,6 @@ void timer_clear_idle(void) */ base->is_idle = false; } - -static int collect_expired_timers(struct timer_base *base, - struct hlist_head *heads) -{ - unsigned long now = READ_ONCE(jiffies); - - /* - * NOHZ optimization. After a long idle sleep we need to forward the - * base to current jiffies. Avoid a loop by searching the bitfield for - * the next expiring timer. - */ - if ((long)(now - base->clk) > 2) { - unsigned long next = __next_timer_interrupt(base); - - /* - * If the next timer is ahead of time forward to current - * jiffies, otherwise forward to the next expiry time: - */ - if (time_after(next, now)) { - /* - * The call site will increment base->clk and then - * terminate the expiry loop immediately. - */ - base->clk = now; - return 0; - } - base->clk = next; - } - return __collect_expired_timers(base, heads); -} -#else -static inline int collect_expired_timers(struct timer_base *base, - struct hlist_head *heads) -{ - return __collect_expired_timers(base, heads); -} #endif /* @@ -1732,6 +1715,13 @@ void update_process_times(int user_tick) scheduler_tick(); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) run_posix_cpu_timers(); + + /* The current CPU might make use of net randoms without receiving IRQs + * to renew them often enough. Let's update the net_rand_state from a + * non-constant value that's not affine to the number of calls to make + * sure it's updated when there's some activity (we don't care in idle). + */ + this_cpu_add(net_rand_state.s1, rol32(jiffies, 24) + user_tick); } /** @@ -1743,32 +1733,23 @@ static inline void __run_timers(struct timer_base *base) struct hlist_head heads[LVL_DEPTH]; int levels; - if (!time_after_eq(jiffies, base->clk)) + if (time_before(jiffies, base->next_expiry)) return; timer_base_lock_expiry(base); raw_spin_lock_irq(&base->lock); - /* - * timer_base::must_forward_clk must be cleared before running - * timers so that any timer functions that call mod_timer() will - * not try to forward the base. Idle tracking / clock forwarding - * logic is only used with BASE_STD timers. - * - * The must_forward_clk flag is cleared unconditionally also for - * the deferrable base. The deferrable base is not affected by idle - * tracking and never forwarded, so clearing the flag is a NOOP. - * - * The fact that the deferrable base is never forwarded can cause - * large variations in granularity for deferrable timers, but they - * can be deferred for long periods due to idle anyway. - */ - base->must_forward_clk = false; - - while (time_after_eq(jiffies, base->clk)) { - + while (time_after_eq(jiffies, base->clk) && + time_after_eq(jiffies, base->next_expiry)) { levels = collect_expired_timers(base, heads); + /* + * The only possible reason for not finding any expired + * timer at this clk is that all matching timers have been + * dequeued. + */ + WARN_ON_ONCE(!levels && !base->next_expiry_recalc); base->clk++; + base->next_expiry = __next_timer_interrupt(base); while (levels--) expire_timers(base, heads + levels); @@ -1798,12 +1779,12 @@ void run_local_timers(void) hrtimer_run_queues(); /* Raise the softirq only if required. */ - if (time_before(jiffies, base->clk)) { + if (time_before(jiffies, base->next_expiry)) { if (!IS_ENABLED(CONFIG_NO_HZ_COMMON)) return; /* CPU is awake, so check the deferrable base. */ base++; - if (time_before(jiffies, base->clk)) + if (time_before(jiffies, base->next_expiry)) return; } raise_softirq(TIMER_SOFTIRQ); @@ -1968,7 +1949,6 @@ int timers_prepare_cpu(unsigned int cpu) base->clk = jiffies; base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; base->is_idle = false; - base->must_forward_clk = true; } return 0; } @@ -2021,6 +2001,7 @@ static void __init init_timer_cpu(int cpu) base->cpu = cpu; raw_spin_lock_init(&base->lock); base->clk = jiffies; + base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; timer_base_init_expiry_lock(base); } } diff --git a/kernel/torture.c b/kernel/torture.c index a1a41484ff6d..1061492f14bd 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -45,6 +45,9 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); static bool disable_onoff_at_boot; module_param(disable_onoff_at_boot, bool, 0444); +static bool ftrace_dump_at_shutdown; +module_param(ftrace_dump_at_shutdown, bool, 0444); + static char *torture_type; static int verbose; @@ -527,7 +530,8 @@ static int torture_shutdown(void *arg) torture_shutdown_hook(); else VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping."); - rcu_ftrace_dump(DUMP_ALL); + if (ftrace_dump_at_shutdown) + rcu_ftrace_dump(DUMP_ALL); kernel_power_off(); /* Shut down the system. */ return 0; } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 743647005f64..a4020c0b4508 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -10,11 +10,6 @@ config USER_STACKTRACE_SUPPORT config NOP_TRACER bool -config HAVE_FTRACE_NMI_ENTER - bool - help - See Documentation/trace/ftrace-design.rst - config HAVE_FUNCTION_TRACER bool help @@ -72,11 +67,6 @@ config RING_BUFFER select TRACE_CLOCK select IRQ_WORK -config FTRACE_NMI_ENTER - bool - depends on HAVE_FTRACE_NMI_ENTER - default y - config EVENT_TRACING select CONTEXT_SWITCH_TRACER select GLOB @@ -158,6 +148,7 @@ config FUNCTION_TRACER select CONTEXT_SWITCH_TRACER select GLOB select TASKS_RCU if PREEMPTION + select TASKS_RUDE_RCU help Enable the kernel to trace every kernel function. This is done by using a compiler feature to insert a small, 5-byte No-Operation @@ -258,15 +249,6 @@ config TRACE_PREEMPT_TOGGLE Enables hooks which will be called when preemption is first disabled, and last enabled. -config PREEMPTIRQ_EVENTS - bool "Enable trace events for preempt and irq disable/enable" - select TRACE_IRQFLAGS - select TRACE_PREEMPT_TOGGLE if PREEMPTION - select GENERIC_TRACER - default n - help - Enable tracing of disable and enable events for preemption and irqs. - config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n @@ -623,12 +605,30 @@ config TRACING_MAP generally used outside of that context, and is normally selected by tracers that use it. +config SYNTH_EVENTS + bool "Synthetic trace events" + select TRACING + select DYNAMIC_EVENTS + default n + help + Synthetic events are user-defined trace events that can be + used to combine data from other trace events or in fact any + data source. Synthetic events can be generated indirectly + via the trace() action of histogram triggers or directly + by way of an in-kernel API. + + See Documentation/trace/events.rst or + Documentation/trace/histogram.rst for details and examples. + + If in doubt, say N. + config HIST_TRIGGERS bool "Histogram triggers" depends on ARCH_HAVE_NMI_SAFE_CMPXCHG select TRACING_MAP select TRACING select DYNAMIC_EVENTS + select SYNTH_EVENTS default n help Hist triggers allow one or more arbitrary trace event fields @@ -824,7 +824,7 @@ config PREEMPTIRQ_DELAY_TEST config SYNTH_EVENT_GEN_TEST tristate "Test module for in-kernel synthetic event generation" - depends on HIST_TRIGGERS + depends on SYNTH_EVENTS help This option creates a test module to check the base functionality of in-kernel synthetic event definition and @@ -847,6 +847,29 @@ config KPROBE_EVENT_GEN_TEST If unsure, say N. +config HIST_TRIGGERS_DEBUG + bool "Hist trigger debug support" + depends on HIST_TRIGGERS + help + Add "hist_debug" file for each event, which when read will + dump out a bunch of internal details about the hist triggers + defined on that event. + + The hist_debug file serves a couple of purposes: + + - Helps developers verify that nothing is broken. + + - Provides educational information to support the details + of the hist trigger internals as described by + Documentation/trace/histogram-design.rst. + + The hist_debug output only covers the data structures + related to the histogram definitions themselves and doesn't + display the internals of map buckets or variable values of + running histograms. + + If unsure, say N. + endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index f9dcd19165fa..aeba5ee7325a 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -6,6 +6,9 @@ ifdef CONFIG_FUNCTION_TRACER ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) +# Avoid recursion due to instrumentation. +KCSAN_SANITIZE := n + ifdef CONFIG_FTRACE_SELFTEST # selftest needs instrumentation CFLAGS_trace_selftest_dynamic.o = $(CC_FLAGS_FTRACE) @@ -28,6 +31,8 @@ ifdef CONFIG_GCOV_PROFILE_FTRACE GCOV_PROFILE := y endif +CFLAGS_bpf_trace.o := -I$(src) + CFLAGS_trace_benchmark.o := -I$(src) CFLAGS_trace_events_filter.o := -I$(src) @@ -72,6 +77,7 @@ endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o +obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index ca39dc3230cb..7ba62d68885a 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -3,6 +3,9 @@ * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk> * */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/blktrace_api.h> @@ -170,10 +173,10 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) blkcg = NULL; #ifdef CONFIG_BLK_CGROUP - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, + trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, blkcg ? cgroup_id(blkcg->css.cgroup) : 1); #else - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, 0); + trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, 0); #endif local_irq_restore(flags); } @@ -344,7 +347,8 @@ static int __blk_trace_remove(struct request_queue *q) { struct blk_trace *bt; - bt = xchg(&q->blk_trace, NULL); + bt = rcu_replace_pointer(q->blk_trace, NULL, + lockdep_is_held(&q->debugfs_mutex)); if (!bt) return -EINVAL; @@ -358,9 +362,9 @@ int blk_trace_remove(struct request_queue *q) { int ret; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); ret = __blk_trace_remove(q); - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); return ret; } @@ -479,12 +483,11 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct dentry *dir = NULL; int ret; + lockdep_assert_held(&q->debugfs_mutex); + if (!buts->buf_size || !buts->buf_nr) return -EINVAL; - if (!blk_debugfs_root) - return -ENOENT; - strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; @@ -494,6 +497,17 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, */ strreplace(buts->name, '/', '_'); + /* + * bdev can be NULL, as with scsi-generic, this is a helpful as + * we can be. + */ + if (rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->debugfs_mutex))) { + pr_warn("Concurrent blktraces are not allowed on %s\n", + buts->name); + return -EBUSY; + } + bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) return -ENOMEM; @@ -507,12 +521,29 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->msg_data) goto err; - ret = -ENOENT; - - dir = debugfs_lookup(buts->name, blk_debugfs_root); - if (!dir) + /* + * When tracing the whole disk reuse the existing debugfs directory + * created by the block layer on init. For partitions block devices, + * and scsi-generic block devices we create a temporary new debugfs + * directory that will be removed once the trace ends. + */ + if (bdev && bdev == bdev->bd_contains) + dir = q->debugfs_dir; + else bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); + /* + * As blktrace relies on debugfs for its interface the debugfs directory + * is required, contrary to the usual mantra of not checking for debugfs + * files or directories. + */ + if (IS_ERR_OR_NULL(dir)) { + pr_warn("debugfs_dir not present for %s so skipping\n", + buts->name); + ret = -ENOENT; + goto err; + } + bt->dev = dev; atomic_set(&bt->dropped, 0); INIT_LIST_HEAD(&bt->running_list); @@ -543,16 +574,11 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, bt->pid = buts->pid; bt->trace_state = Blktrace_setup; - ret = -EBUSY; - if (cmpxchg(&q->blk_trace, NULL, bt)) - goto err; - + rcu_assign_pointer(q->blk_trace, bt); get_probe_ref(); ret = 0; err: - if (dir && !bt->dir) - dput(dir); if (ret) blk_trace_free(bt); return ret; @@ -585,9 +611,9 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, { int ret; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); ret = __blk_trace_setup(q, name, dev, bdev, arg); - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); return ret; } @@ -633,7 +659,7 @@ static int __blk_trace_startstop(struct request_queue *q, int start) struct blk_trace *bt; bt = rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); if (bt == NULL) return -EINVAL; @@ -673,9 +699,9 @@ int blk_trace_startstop(struct request_queue *q, int start) { int ret; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); ret = __blk_trace_startstop(q, start); - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); return ret; } @@ -704,7 +730,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) if (!q) return -ENXIO; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); switch (cmd) { case BLKTRACESETUP: @@ -731,7 +757,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) break; } - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); return ret; } @@ -742,14 +768,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) **/ void blk_trace_shutdown(struct request_queue *q) { - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); if (rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex))) { + lockdep_is_held(&q->debugfs_mutex))) { __blk_trace_startstop(q, 0); __blk_trace_remove(q); } - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); } #ifdef CONFIG_BLK_CGROUP @@ -834,6 +860,13 @@ static void blk_add_trace_rq_issue(void *ignore, blk_trace_request_get_cgid(q, rq)); } +static void blk_add_trace_rq_merge(void *ignore, + struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE, + blk_trace_request_get_cgid(q, rq)); +} + static void blk_add_trace_rq_requeue(void *ignore, struct request_queue *q, struct request *rq) @@ -885,10 +918,10 @@ static void blk_add_trace_bio_bounce(void *ignore, } static void blk_add_trace_bio_complete(void *ignore, - struct request_queue *q, struct bio *bio, - int error) + struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, + blk_status_to_errno(bio->bi_status)); } static void blk_add_trace_bio_backmerge(void *ignore, @@ -995,8 +1028,10 @@ static void blk_add_trace_split(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, - BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), - &rpdu, blk_trace_bio_get_cgid(q, bio)); + BLK_TA_SPLIT, + blk_status_to_errno(bio->bi_status), + sizeof(rpdu), &rpdu, + blk_trace_bio_get_cgid(q, bio)); } rcu_read_unlock(); } @@ -1033,7 +1068,8 @@ static void blk_add_trace_bio_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, + bio_op(bio), bio->bi_opf, BLK_TA_REMAP, + blk_status_to_errno(bio->bi_status), sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); rcu_read_unlock(); } @@ -1115,6 +1151,8 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); WARN_ON(ret); + ret = register_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); + WARN_ON(ret); ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); WARN_ON(ret); ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); @@ -1161,6 +1199,7 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); + unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); @@ -1253,21 +1292,10 @@ static inline __u16 t_error(const struct trace_entry *ent) static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg) { - const __u64 *val = pdu_start(ent, has_cg); + const __be64 *val = pdu_start(ent, has_cg); return be64_to_cpu(*val); } -static void get_pdu_remap(const struct trace_entry *ent, - struct blk_io_trace_remap *r, bool has_cg) -{ - const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg); - __u64 sector_from = __r->sector_from; - - r->device_from = be32_to_cpu(__r->device_from); - r->device_to = be32_to_cpu(__r->device_to); - r->sector_from = be64_to_cpu(sector_from); -} - typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act, bool has_cg); @@ -1407,13 +1435,13 @@ static void blk_log_with_error(struct trace_seq *s, static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { - struct blk_io_trace_remap r = { .device_from = 0, }; + const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg); - get_pdu_remap(ent, &r, has_cg); trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", t_sector(ent), t_sec(ent), - MAJOR(r.device_from), MINOR(r.device_from), - (unsigned long long)r.sector_from); + MAJOR(be32_to_cpu(__r->device_from)), + MINOR(be32_to_cpu(__r->device_from)), + be64_to_cpu(__r->sector_from)); } static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) @@ -1637,7 +1665,8 @@ static int blk_trace_remove_queue(struct request_queue *q) { struct blk_trace *bt; - bt = xchg(&q->blk_trace, NULL); + bt = rcu_replace_pointer(q->blk_trace, NULL, + lockdep_is_held(&q->debugfs_mutex)); if (bt == NULL) return -EINVAL; @@ -1669,10 +1698,7 @@ static int blk_trace_setup_queue(struct request_queue *q, blk_trace_setup_lba(bt, bdev); - ret = -EBUSY; - if (cmpxchg(&q->blk_trace, NULL, bt)) - goto free_bt; - + rcu_assign_pointer(q->blk_trace, bt); get_probe_ref(); return 0; @@ -1815,10 +1841,10 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); bt = rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); if (attr == &dev_attr_enable) { ret = sprintf(buf, "%u\n", !!bt); goto out_unlock_bdev; @@ -1836,7 +1862,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, ret = sprintf(buf, "%llu\n", bt->end_lba); out_unlock_bdev: - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); out_bdput: bdput(bdev); out: @@ -1879,10 +1905,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); bt = rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); if (attr == &dev_attr_enable) { if (!!value == !!bt) { ret = 0; @@ -1899,7 +1925,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (bt == NULL) { ret = blk_trace_setup_queue(q, bdev); bt = rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); } if (ret == 0) { @@ -1914,7 +1940,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } out_unlock_bdev: - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); out_bdput: bdput(bdev); out: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a010edc37ee0..cb91ef902cc4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -11,14 +11,19 @@ #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/kprobes.h> +#include <linux/spinlock.h> #include <linux/syscalls.h> #include <linux/error-injection.h> +#include <linux/btf_ids.h> #include <asm/tlb.h> #include "trace_probe.h" #include "trace.h" +#define CREATE_TRACE_POINTS +#include "bpf_trace.h" + #define bpf_event_rcu_dereference(p) \ rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex)) @@ -136,18 +141,24 @@ static const struct bpf_func_proto bpf_override_return_proto = { }; #endif -BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, - const void __user *, unsafe_ptr) +static __always_inline int +bpf_probe_read_user_common(void *dst, u32 size, const void __user *unsafe_ptr) { - int ret = probe_user_read(dst, unsafe_ptr, size); + int ret; + ret = copy_from_user_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); - return ret; } -static const struct bpf_func_proto bpf_probe_read_user_proto = { +BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, + const void __user *, unsafe_ptr) +{ + return bpf_probe_read_user_common(dst, size, unsafe_ptr); +} + +const struct bpf_func_proto bpf_probe_read_user_proto = { .func = bpf_probe_read_user, .gpl_only = true, .ret_type = RET_INTEGER, @@ -156,18 +167,25 @@ static const struct bpf_func_proto bpf_probe_read_user_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size, - const void __user *, unsafe_ptr) +static __always_inline int +bpf_probe_read_user_str_common(void *dst, u32 size, + const void __user *unsafe_ptr) { - int ret = strncpy_from_unsafe_user(dst, unsafe_ptr, size); + int ret; + ret = strncpy_from_user_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); - return ret; } -static const struct bpf_func_proto bpf_probe_read_user_str_proto = { +BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size, + const void __user *, unsafe_ptr) +{ + return bpf_probe_read_user_str_common(dst, size, unsafe_ptr); +} + +const struct bpf_func_proto bpf_probe_read_user_str_proto = { .func = bpf_probe_read_user_str, .gpl_only = true, .ret_type = RET_INTEGER, @@ -177,28 +195,28 @@ static const struct bpf_func_proto bpf_probe_read_user_str_proto = { }; static __always_inline int -bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr, - const bool compat) +bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr) { int ret = security_locked_down(LOCKDOWN_BPF_READ); if (unlikely(ret < 0)) - goto out; - ret = compat ? probe_kernel_read(dst, unsafe_ptr, size) : - probe_kernel_read_strict(dst, unsafe_ptr, size); + goto fail; + ret = copy_from_kernel_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) -out: - memset(dst, 0, size); + goto fail; + return ret; +fail: + memset(dst, 0, size); return ret; } BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size, const void *, unsafe_ptr) { - return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, false); + return bpf_probe_read_kernel_common(dst, size, unsafe_ptr); } -static const struct bpf_func_proto bpf_probe_read_kernel_proto = { +const struct bpf_func_proto bpf_probe_read_kernel_proto = { .func = bpf_probe_read_kernel, .gpl_only = true, .ret_type = RET_INTEGER, @@ -207,53 +225,40 @@ static const struct bpf_func_proto bpf_probe_read_kernel_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size, - const void *, unsafe_ptr) -{ - return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, true); -} - -static const struct bpf_func_proto bpf_probe_read_compat_proto = { - .func = bpf_probe_read_compat, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_UNINIT_MEM, - .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_ANYTHING, -}; - static __always_inline int -bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr, - const bool compat) +bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr) { int ret = security_locked_down(LOCKDOWN_BPF_READ); if (unlikely(ret < 0)) - goto out; + goto fail; + /* - * The strncpy_from_unsafe_*() call will likely not fill the entire - * buffer, but that's okay in this circumstance as we're probing + * The strncpy_from_kernel_nofault() call will likely not fill the + * entire buffer, but that's okay in this circumstance as we're probing * arbitrary memory anyway similar to bpf_probe_read_*() and might * as well probe the stack. Thus, memory is explicitly cleared * only in error case, so that improper users ignoring return * code altogether don't copy garbage; otherwise length of string * is returned that can be used for bpf_perf_event_output() et al. */ - ret = compat ? strncpy_from_unsafe(dst, unsafe_ptr, size) : - strncpy_from_unsafe_strict(dst, unsafe_ptr, size); + ret = strncpy_from_kernel_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) -out: - memset(dst, 0, size); + goto fail; + + return ret; +fail: + memset(dst, 0, size); return ret; } BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size, const void *, unsafe_ptr) { - return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, false); + return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr); } -static const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { +const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { .func = bpf_probe_read_kernel_str, .gpl_only = true, .ret_type = RET_INTEGER, @@ -262,10 +267,34 @@ static const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { .arg3_type = ARG_ANYTHING, }; +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE +BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + if ((unsigned long)unsafe_ptr < TASK_SIZE) { + return bpf_probe_read_user_common(dst, size, + (__force void __user *)unsafe_ptr); + } + return bpf_probe_read_kernel_common(dst, size, unsafe_ptr); +} + +static const struct bpf_func_proto bpf_probe_read_compat_proto = { + .func = bpf_probe_read_compat, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + BPF_CALL_3(bpf_probe_read_compat_str, void *, dst, u32, size, const void *, unsafe_ptr) { - return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, true); + if ((unsigned long)unsafe_ptr < TASK_SIZE) { + return bpf_probe_read_user_str_common(dst, size, + (__force void __user *)unsafe_ptr); + } + return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr); } static const struct bpf_func_proto bpf_probe_read_compat_str_proto = { @@ -276,6 +305,7 @@ static const struct bpf_func_proto bpf_probe_read_compat_str_proto = { .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; +#endif /* CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE */ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src, u32, size) @@ -301,7 +331,7 @@ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src, if (unlikely(!nmi_uaccess_okay())) return -EPERM; - return probe_user_write(unsafe_ptr, src, size); + return copy_to_user_nofault(unsafe_ptr, src, size); } static const struct bpf_func_proto bpf_probe_write_user_proto = { @@ -315,15 +345,67 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { static const struct bpf_func_proto *bpf_get_probe_write_proto(void) { + if (!capable(CAP_SYS_ADMIN)) + return NULL; + pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!", current->comm, task_pid_nr(current)); return &bpf_probe_write_user_proto; } +static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, + size_t bufsz) +{ + void __user *user_ptr = (__force void __user *)unsafe_ptr; + + buf[0] = 0; + + switch (fmt_ptype) { + case 's': +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)unsafe_ptr < TASK_SIZE) { + strncpy_from_user_nofault(buf, user_ptr, bufsz); + break; + } + fallthrough; +#endif + case 'k': + strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz); + break; + case 'u': + strncpy_from_user_nofault(buf, user_ptr, bufsz); + break; + } +} + +static DEFINE_RAW_SPINLOCK(trace_printk_lock); + +#define BPF_TRACE_PRINTK_SIZE 1024 + +static inline __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...) +{ + static char buf[BPF_TRACE_PRINTK_SIZE]; + unsigned long flags; + va_list ap; + int ret; + + raw_spin_lock_irqsave(&trace_printk_lock, flags); + va_start(ap, fmt); + ret = vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + /* vsnprintf() will not append null for zero-length strings */ + if (ret == 0) + buf[0] = '\0'; + trace_bpf_trace_printk(buf); + raw_spin_unlock_irqrestore(&trace_printk_lock, flags); + + return ret; +} + /* * Only limited trace_printk() conversion specifiers allowed: - * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pks %pus %s + * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pB %pks %pus %s */ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, u64, arg2, u64, arg3) @@ -367,6 +449,11 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, goto fmt_str; } + if (fmt[i + 1] == 'B') { + i++; + goto fmt_next; + } + /* disallow any further format extensions */ if (fmt[i + 1] != 0 && !isspace(fmt[i + 1]) && @@ -403,24 +490,8 @@ fmt_str: break; } - buf[0] = 0; - switch (fmt_ptype) { - case 's': -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - strncpy_from_unsafe(buf, unsafe_ptr, - sizeof(buf)); - break; -#endif - case 'k': - strncpy_from_unsafe_strict(buf, unsafe_ptr, - sizeof(buf)); - break; - case 'u': - strncpy_from_unsafe_user(buf, - (__force void __user *)unsafe_ptr, - sizeof(buf)); - break; - } + bpf_trace_copy_string(buf, unsafe_ptr, fmt_ptype, + sizeof(buf)); goto fmt_next; } @@ -441,8 +512,7 @@ fmt_next: */ #define __BPF_TP_EMIT() __BPF_ARG3_TP() #define __BPF_TP(...) \ - __trace_printk(0 /* Fake ip */, \ - fmt, ##__VA_ARGS__) + bpf_do_trace_printk(fmt, ##__VA_ARGS__) #define __BPF_ARG1_TP(...) \ ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \ @@ -479,14 +549,233 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { const struct bpf_func_proto *bpf_get_trace_printk_proto(void) { /* - * this program might be calling bpf_trace_printk, - * so allocate per-cpu printk buffers + * This program might be calling bpf_trace_printk, + * so enable the associated bpf_trace/bpf_trace_printk event. + * Repeat this each time as it is possible a user has + * disabled bpf_trace_printk events. By loading a program + * calling bpf_trace_printk() however the user has expressed + * the intent to see such events. */ - trace_printk_init_buffers(); + if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1)) + pr_warn_ratelimited("could not enable bpf_trace_printk events"); return &bpf_trace_printk_proto; } +#define MAX_SEQ_PRINTF_VARARGS 12 +#define MAX_SEQ_PRINTF_MAX_MEMCPY 6 +#define MAX_SEQ_PRINTF_STR_LEN 128 + +struct bpf_seq_printf_buf { + char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN]; +}; +static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf); +static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used); + +BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, + const void *, data, u32, data_len) +{ + int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0; + int i, buf_used, copy_size, num_args; + u64 params[MAX_SEQ_PRINTF_VARARGS]; + struct bpf_seq_printf_buf *bufs; + const u64 *args = data; + + buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used); + if (WARN_ON_ONCE(buf_used > 1)) { + err = -EBUSY; + goto out; + } + + bufs = this_cpu_ptr(&bpf_seq_printf_buf); + + /* + * bpf_check()->check_func_arg()->check_stack_boundary() + * guarantees that fmt points to bpf program stack, + * fmt_size bytes of it were initialized and fmt_size > 0 + */ + if (fmt[--fmt_size] != 0) + goto out; + + if (data_len & 7) + goto out; + + for (i = 0; i < fmt_size; i++) { + if (fmt[i] == '%') { + if (fmt[i + 1] == '%') + i++; + else if (!data || !data_len) + goto out; + } + } + + num_args = data_len / 8; + + /* check format string for allowed specifiers */ + for (i = 0; i < fmt_size; i++) { + /* only printable ascii for now. */ + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { + err = -EINVAL; + goto out; + } + + if (fmt[i] != '%') + continue; + + if (fmt[i + 1] == '%') { + i++; + continue; + } + + if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) { + err = -E2BIG; + goto out; + } + + if (fmt_cnt >= num_args) { + err = -EINVAL; + goto out; + } + + /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ + i++; + + /* skip optional "[0 +-][num]" width formating field */ + while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || + fmt[i] == ' ') + i++; + if (fmt[i] >= '1' && fmt[i] <= '9') { + i++; + while (fmt[i] >= '0' && fmt[i] <= '9') + i++; + } + + if (fmt[i] == 's') { + void *unsafe_ptr; + + /* try our best to copy */ + if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { + err = -E2BIG; + goto out; + } + + unsafe_ptr = (void *)(long)args[fmt_cnt]; + err = strncpy_from_kernel_nofault(bufs->buf[memcpy_cnt], + unsafe_ptr, MAX_SEQ_PRINTF_STR_LEN); + if (err < 0) + bufs->buf[memcpy_cnt][0] = '\0'; + params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; + + fmt_cnt++; + memcpy_cnt++; + continue; + } + + if (fmt[i] == 'p') { + if (fmt[i + 1] == 0 || + fmt[i + 1] == 'K' || + fmt[i + 1] == 'x' || + fmt[i + 1] == 'B') { + /* just kernel pointers */ + params[fmt_cnt] = args[fmt_cnt]; + fmt_cnt++; + continue; + } + + /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ + if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') { + err = -EINVAL; + goto out; + } + if (fmt[i + 2] != '4' && fmt[i + 2] != '6') { + err = -EINVAL; + goto out; + } + + if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { + err = -E2BIG; + goto out; + } + + + copy_size = (fmt[i + 2] == '4') ? 4 : 16; + + err = copy_from_kernel_nofault(bufs->buf[memcpy_cnt], + (void *) (long) args[fmt_cnt], + copy_size); + if (err < 0) + memset(bufs->buf[memcpy_cnt], 0, copy_size); + params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; + + i += 2; + fmt_cnt++; + memcpy_cnt++; + continue; + } + + if (fmt[i] == 'l') { + i++; + if (fmt[i] == 'l') + i++; + } + + if (fmt[i] != 'i' && fmt[i] != 'd' && + fmt[i] != 'u' && fmt[i] != 'x' && + fmt[i] != 'X') { + err = -EINVAL; + goto out; + } + + params[fmt_cnt] = args[fmt_cnt]; + fmt_cnt++; + } + + /* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give + * all of them to seq_printf(). + */ + seq_printf(m, fmt, params[0], params[1], params[2], params[3], + params[4], params[5], params[6], params[7], params[8], + params[9], params[10], params[11]); + + err = seq_has_overflowed(m) ? -EOVERFLOW : 0; +out: + this_cpu_dec(bpf_seq_printf_buf_used); + return err; +} + +BTF_ID_LIST(bpf_seq_printf_btf_ids) +BTF_ID(struct, seq_file) + +static const struct bpf_func_proto bpf_seq_printf_proto = { + .func = bpf_seq_printf, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_seq_printf_btf_ids, +}; + +BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) +{ + return seq_write(m, data, len) ? -EOVERFLOW : 0; +} + +BTF_ID_LIST(bpf_seq_write_btf_ids) +BTF_ID(struct, seq_file) + +static const struct bpf_func_proto bpf_seq_write_proto = { + .func = bpf_seq_write, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_seq_write_btf_ids, +}; + static __always_inline int get_map_perf_counter(struct bpf_map *map, u64 flags, u64 *value, u64 *enabled, u64 *running) @@ -698,7 +987,7 @@ BPF_CALL_0(bpf_get_current_task) return (long) current; } -static const struct bpf_func_proto bpf_get_current_task_proto = { +const struct bpf_func_proto bpf_get_current_task_proto = { .func = bpf_get_current_task, .gpl_only = true, .ret_type = RET_INTEGER, @@ -827,6 +1116,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_peek_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: @@ -877,6 +1168,20 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_read_value_proto; case BPF_FUNC_get_ns_current_pid_tgid: return &bpf_get_ns_current_pid_tgid_proto; + case BPF_FUNC_ringbuf_output: + return &bpf_ringbuf_output_proto; + case BPF_FUNC_ringbuf_reserve: + return &bpf_ringbuf_reserve_proto; + case BPF_FUNC_ringbuf_submit: + return &bpf_ringbuf_submit_proto; + case BPF_FUNC_ringbuf_discard: + return &bpf_ringbuf_discard_proto; + case BPF_FUNC_ringbuf_query: + return &bpf_ringbuf_query_proto; + case BPF_FUNC_jiffies64: + return &bpf_jiffies64_proto; + case BPF_FUNC_get_task_stack: + return &bpf_get_task_stack_proto; default: return NULL; } @@ -1106,9 +1411,9 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: - return &bpf_get_stackid_proto_tp; + return &bpf_get_stackid_proto_pe; case BPF_FUNC_get_stack: - return &bpf_get_stack_proto_tp; + return &bpf_get_stack_proto_pe; case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; case BPF_FUNC_read_branch_records: @@ -1246,7 +1551,7 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } -static const struct bpf_func_proto * +const struct bpf_func_proto * tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { @@ -1255,7 +1560,25 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_output_proto; case BPF_FUNC_xdp_output: return &bpf_xdp_output_proto; + case BPF_FUNC_skc_to_tcp6_sock: + return &bpf_skc_to_tcp6_sock_proto; + case BPF_FUNC_skc_to_tcp_sock: + return &bpf_skc_to_tcp_sock_proto; + case BPF_FUNC_skc_to_tcp_timewait_sock: + return &bpf_skc_to_tcp_timewait_sock_proto; + case BPF_FUNC_skc_to_tcp_request_sock: + return &bpf_skc_to_tcp_request_sock_proto; + case BPF_FUNC_skc_to_udp6_sock: + return &bpf_skc_to_udp6_sock_proto; #endif + case BPF_FUNC_seq_printf: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_printf_proto : + NULL; + case BPF_FUNC_seq_write: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_write_proto : + NULL; default: return raw_tp_prog_func_proto(func_id, prog); } @@ -1500,7 +1823,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) u32 *ids, prog_cnt, ids_len; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EPERM; if (event->attr.type != PERF_TYPE_TRACEPOINT) return -EINVAL; diff --git a/kernel/trace/bpf_trace.h b/kernel/trace/bpf_trace.h new file mode 100644 index 000000000000..9acbc11ac7bb --- /dev/null +++ b/kernel/trace/bpf_trace.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bpf_trace + +#if !defined(_TRACE_BPF_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) + +#define _TRACE_BPF_TRACE_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(bpf_trace_printk, + + TP_PROTO(const char *bpf_string), + + TP_ARGS(bpf_string), + + TP_STRUCT__entry( + __string(bpf_string, bpf_string) + ), + + TP_fast_assign( + __assign_str(bpf_string, bpf_string); + ), + + TP_printk("%s", __get_str(bpf_string)) +); + +#endif /* _TRACE_BPF_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE bpf_trace + +#include <trace/define_trace.h> diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bd030b1b9514..72064541bef2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -160,17 +160,6 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, op->saved_func(ip, parent_ip, op, regs); } -static void ftrace_sync(struct work_struct *work) -{ - /* - * This function is just a stub to implement a hard force - * of synchronize_rcu(). This requires synchronizing - * tasks even in userspace and idle. - * - * Yes, function tracing is rude. - */ -} - static void ftrace_sync_ipi(void *data) { /* Probably not needed, but do it anyway */ @@ -256,7 +245,7 @@ static void update_ftrace_function(void) * Make sure all CPUs see this. Yes this is slow, but static * tracing is slow and nasty to have enabled. */ - schedule_on_each_cpu(ftrace_sync); + synchronize_rcu_tasks_rude(); /* Now all cpus are using the list ops. */ function_trace_op = set_function_trace_op; /* Make sure the function_trace_op is visible on all CPUs */ @@ -2027,16 +2016,16 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) { unsigned long ip = rec ? rec->ip : 0; + pr_info("------------[ ftrace bug ]------------\n"); + switch (failed) { case -EFAULT: - FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on modifying "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); break; case -EINVAL: - FTRACE_WARN_ON_ONCE(1); pr_info("ftrace failed to modify "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); print_ip_ins(" actual: ", (unsigned char *)ip); pr_cont("\n"); if (ftrace_expected) { @@ -2045,14 +2034,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) } break; case -EPERM: - FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on writing "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); break; default: - FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on unknown error "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); } print_bug_type(); if (rec) { @@ -2077,6 +2064,8 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) ip = ftrace_get_addr_curr(rec); pr_cont("\n expected tramp: %lx\n", ip); } + + FTRACE_WARN_ON_ONCE(1); } static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) @@ -2271,7 +2260,7 @@ ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, if (hash_contains_ip(ip, op->func_hash)) return op; - } + } return NULL; } @@ -2775,6 +2764,50 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops) { } +/* List of trace_ops that have allocated trampolines */ +static LIST_HEAD(ftrace_ops_trampoline_list); + +static void ftrace_add_trampoline_to_kallsyms(struct ftrace_ops *ops) +{ + lockdep_assert_held(&ftrace_lock); + list_add_rcu(&ops->list, &ftrace_ops_trampoline_list); +} + +static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops) +{ + lockdep_assert_held(&ftrace_lock); + list_del_rcu(&ops->list); +} + +/* + * "__builtin__ftrace" is used as a module name in /proc/kallsyms for symbols + * for pages allocated for ftrace purposes, even though "__builtin__ftrace" is + * not a module. + */ +#define FTRACE_TRAMPOLINE_MOD "__builtin__ftrace" +#define FTRACE_TRAMPOLINE_SYM "ftrace_trampoline" + +static void ftrace_trampoline_free(struct ftrace_ops *ops) +{ + if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) && + ops->trampoline) { + /* + * Record the text poke event before the ksymbol unregister + * event. + */ + perf_event_text_poke((void *)ops->trampoline, + (void *)ops->trampoline, + ops->trampoline_size, NULL, 0); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + ops->trampoline, ops->trampoline_size, + true, FTRACE_TRAMPOLINE_SYM); + /* Remove from kallsyms after the perf events */ + ftrace_remove_trampoline_from_kallsyms(ops); + } + + arch_ftrace_trampoline_free(ops); +} + static void ftrace_startup_enable(int command) { if (saved_ftrace_func != ftrace_trace_function) { @@ -2932,7 +2965,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) * infrastructure to do the synchronization, thus we must do it * ourselves. */ - schedule_on_each_cpu(ftrace_sync); + synchronize_rcu_tasks_rude(); /* * When the kernel is preeptive, tasks can be preempted @@ -2945,7 +2978,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) synchronize_rcu_tasks(); free_ops: - arch_ftrace_trampoline_free(ops); + ftrace_trampoline_free(ops); } return 0; @@ -3610,7 +3643,7 @@ static int t_show(struct seq_file *m, void *v) if (direct) seq_printf(m, "\n\tdirect-->%pS", (void *)direct); } - } + } seq_putc(m, '\n'); @@ -5888,7 +5921,7 @@ ftrace_graph_release(struct inode *inode, struct file *file) * infrastructure to do the synchronization, thus we must do it * ourselves. */ - schedule_on_each_cpu(ftrace_sync); + synchronize_rcu_tasks_rude(); free_ftrace_hash(old_hash); } @@ -6189,6 +6222,27 @@ struct ftrace_mod_map { unsigned int num_funcs; }; +static int ftrace_get_trampoline_kallsym(unsigned int symnum, + unsigned long *value, char *type, + char *name, char *module_name, + int *exported) +{ + struct ftrace_ops *op; + + list_for_each_entry_rcu(op, &ftrace_ops_trampoline_list, list) { + if (!op->trampoline || symnum--) + continue; + *value = op->trampoline; + *type = 't'; + strlcpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN); + strlcpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN); + *exported = 0; + return 0; + } + + return -ERANGE; +} + #ifdef CONFIG_MODULES #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) @@ -6525,6 +6579,7 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, { struct ftrace_mod_map *mod_map; struct ftrace_mod_func *mod_func; + int ret; preempt_disable(); list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) { @@ -6551,8 +6606,10 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, WARN_ON(1); break; } + ret = ftrace_get_trampoline_kallsym(symnum, value, type, name, + module_name, exported); preempt_enable(); - return -ERANGE; + return ret; } #else @@ -6564,6 +6621,18 @@ allocate_ftrace_mod_map(struct module *mod, { return NULL; } +int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, char *module_name, + int *exported) +{ + int ret; + + preempt_disable(); + ret = ftrace_get_trampoline_kallsym(symnum, value, type, name, + module_name, exported); + preempt_enable(); + return ret; +} #endif /* CONFIG_MODULES */ struct ftrace_init_func { @@ -6744,7 +6813,24 @@ void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops) static void ftrace_update_trampoline(struct ftrace_ops *ops) { + unsigned long trampoline = ops->trampoline; + arch_ftrace_update_trampoline(ops); + if (ops->trampoline && ops->trampoline != trampoline && + (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) { + /* Add to kallsyms before the perf events */ + ftrace_add_trampoline_to_kallsyms(ops); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + ops->trampoline, ops->trampoline_size, false, + FTRACE_TRAMPOLINE_SYM); + /* + * Record the perf text poke event after the ksymbol register + * event. + */ + perf_event_text_poke((void *)ops->trampoline, NULL, 0, + (void *)ops->trampoline, + ops->trampoline_size); + } } void ftrace_init_trace_array(struct trace_array *tr) @@ -7162,6 +7248,10 @@ static int pid_open(struct inode *inode, struct file *file, int type) case TRACE_NO_PIDS: seq_ops = &ftrace_no_pid_sops; break; + default: + trace_array_put(tr); + WARN_ON_ONCE(1); + return -EINVAL; } ret = seq_open(file, seq_ops); @@ -7240,6 +7330,10 @@ pid_write(struct file *filp, const char __user *ubuf, other_pids = rcu_dereference_protected(tr->function_pids, lockdep_is_held(&ftrace_lock)); break; + default: + ret = -EINVAL; + WARN_ON_ONCE(1); + goto out; } ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b8e1ca48be50..f15471ce969e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -577,7 +577,7 @@ static void rb_wake_up_waiters(struct irq_work *work) */ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) { - struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer); + struct ring_buffer_per_cpu *cpu_buffer; DEFINE_WAIT(wait); struct rb_irq_work *work; int ret = 0; @@ -2427,7 +2427,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(info->add_timestamp)) { bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer); - event = rb_add_time_stamp(event, info->delta, abs); + event = rb_add_time_stamp(event, abs ? info->delta : delta, abs); length -= RB_LEN_TIME_EXTEND; delta = 0; } diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 8df0aa810950..78e576575b79 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -45,8 +45,8 @@ MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); static int producer_nice = MAX_NICE; static int consumer_nice = MAX_NICE; -static int producer_fifo = -1; -static int consumer_fifo = -1; +static int producer_fifo; +static int consumer_fifo; module_param(producer_nice, int, 0644); MODULE_PARM_DESC(producer_nice, "nice prio for producer"); @@ -55,10 +55,10 @@ module_param(consumer_nice, int, 0644); MODULE_PARM_DESC(consumer_nice, "nice prio for consumer"); module_param(producer_fifo, int, 0644); -MODULE_PARM_DESC(producer_fifo, "fifo prio for producer"); +MODULE_PARM_DESC(producer_fifo, "use fifo for producer: 0 - disabled, 1 - low prio, 2 - fifo"); module_param(consumer_fifo, int, 0644); -MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer"); +MODULE_PARM_DESC(consumer_fifo, "use fifo for consumer: 0 - disabled, 1 - low prio, 2 - fifo"); static int read_events; @@ -303,22 +303,22 @@ static void ring_buffer_producer(void) trace_printk("ERROR!\n"); if (!disable_reader) { - if (consumer_fifo < 0) + if (consumer_fifo) + trace_printk("Running Consumer at SCHED_FIFO %s\n", + consumer_fifo == 1 ? "low" : "high"); + else trace_printk("Running Consumer at nice: %d\n", consumer_nice); - else - trace_printk("Running Consumer at SCHED_FIFO %d\n", - consumer_fifo); } - if (producer_fifo < 0) + if (producer_fifo) + trace_printk("Running Producer at SCHED_FIFO %s\n", + producer_fifo == 1 ? "low" : "high"); + else trace_printk("Running Producer at nice: %d\n", producer_nice); - else - trace_printk("Running Producer at SCHED_FIFO %d\n", - producer_fifo); /* Let the user know that the test is running at low priority */ - if (producer_fifo < 0 && consumer_fifo < 0 && + if (!producer_fifo && !consumer_fifo && producer_nice == MAX_NICE && consumer_nice == MAX_NICE) trace_printk("WARNING!!! This test is running at lowest priority.\n"); @@ -455,21 +455,19 @@ static int __init ring_buffer_benchmark_init(void) * Run them as low-prio background tasks by default: */ if (!disable_reader) { - if (consumer_fifo >= 0) { - struct sched_param param = { - .sched_priority = consumer_fifo - }; - sched_setscheduler(consumer, SCHED_FIFO, ¶m); - } else + if (consumer_fifo >= 2) + sched_set_fifo(consumer); + else if (consumer_fifo == 1) + sched_set_fifo_low(consumer); + else set_user_nice(consumer, consumer_nice); } - if (producer_fifo >= 0) { - struct sched_param param = { - .sched_priority = producer_fifo - }; - sched_setscheduler(producer, SCHED_FIFO, ¶m); - } else + if (producer_fifo >= 2) + sched_set_fifo(producer); + else if (producer_fifo == 1) + sched_set_fifo_low(producer); + else set_user_nice(producer, producer_nice); return 0; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 29615f15a820..848f67a5f16d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1299,8 +1299,11 @@ EXPORT_SYMBOL_GPL(tracing_off); void disable_trace_on_warning(void) { - if (__disable_trace_on_warning) + if (__disable_trace_on_warning) { + trace_array_printk_buf(global_trace.array_buffer.buffer, _THIS_IP_, + "Disabling tracing due to warning\n"); tracing_off(); + } } /** @@ -2662,7 +2665,7 @@ static void output_printk(struct trace_event_buffer *fbuffer) } int tracepoint_printk_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int save_tracepoint_printk; @@ -3567,7 +3570,6 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) void tracing_iter_reset(struct trace_iterator *iter, int cpu) { - struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter; unsigned long entries = 0; u64 ts; @@ -3585,7 +3587,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) * that a reset never took place on a cpu. This is evident * by the timestamp being before the start of the buffer. */ - while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { + while (ring_buffer_iter_peek(buf_iter, &ts)) { if (ts >= iter->array_buffer->time_start) break; entries++; @@ -6305,13 +6307,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, __free_page(spd->pages[idx]); } -static const struct pipe_buf_operations tracing_pipe_buf_ops = { - .confirm = generic_pipe_buf_confirm, - .release = generic_pipe_buf_release, - .steal = generic_pipe_buf_steal, - .get = generic_pipe_buf_get, -}; - static size_t tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) { @@ -6373,7 +6368,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .partial = partial_def, .nr_pages = 0, /* This gets updated below. */ .nr_pages_max = PIPE_DEF_BUFFERS, - .ops = &tracing_pipe_buf_ops, + .ops = &default_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; ssize_t ret; @@ -7582,9 +7577,7 @@ static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, /* Pipe buffer operations for a buffer. */ static const struct pipe_buf_operations buffer_pipe_buf_ops = { - .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, - .steal = generic_pipe_buf_nosteal, .get = buffer_pipe_buf_get, }; @@ -8527,18 +8520,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) allocate_snapshot = false; #endif - /* - * Because of some magic with the way alloc_percpu() works on - * x86_64, we need to synchronize the pgd of all the tables, - * otherwise the trace events that happen in x86_64 page fault - * handlers can't cope with accessing the chance that a - * alloc_percpu()'d memory might be touched in the page fault trace - * event. Oh, and we need to audit all other alloc_percpu() and vmalloc() - * calls in tracing, because something might get triggered within a - * page fault trace event! - */ - vmalloc_sync_mappings(); - return 0; } @@ -8964,9 +8945,7 @@ struct dentry *tracing_init_dentry(void) if (tr->dir) return NULL; - if (WARN_ON(!tracefs_initialized()) || - (IS_ENABLED(CONFIG_DEBUG_FS) && - WARN_ON(!debugfs_initialized()))) + if (WARN_ON(!tracefs_initialized())) return ERR_PTR(-ENODEV); /* diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4eb1d004d5f2..13db4000af3f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -61,6 +61,9 @@ enum trace_type { #undef __field_desc #define __field_desc(type, container, item) +#undef __field_packed +#define __field_packed(type, container, item) + #undef __array #define __array(type, item, size) type item[size]; @@ -1661,6 +1664,7 @@ extern struct list_head ftrace_events; extern const struct file_operations event_trigger_fops; extern const struct file_operations event_hist_fops; +extern const struct file_operations event_hist_debug_fops; extern const struct file_operations event_inject_fops; #ifdef CONFIG_HIST_TRIGGERS diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 9de29bb45a27..fa0fc08c6ef8 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -101,12 +101,16 @@ trace_boot_add_kprobe_event(struct xbc_node *node, const char *event) kprobe_event_cmd_init(&cmd, buf, MAX_BUF_LEN); ret = kprobe_event_gen_cmd_start(&cmd, event, val); - if (ret) + if (ret) { + pr_err("Failed to generate probe: %s\n", buf); break; + } ret = kprobe_event_gen_cmd_end(&cmd); - if (ret) + if (ret) { pr_err("Failed to add probe: %s\n", buf); + break; + } } return ret; @@ -120,7 +124,7 @@ trace_boot_add_kprobe_event(struct xbc_node *node, const char *event) } #endif -#ifdef CONFIG_HIST_TRIGGERS +#ifdef CONFIG_SYNTH_EVENTS static int __init trace_boot_add_synth_event(struct xbc_node *node, const char *event) { diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index a523da0dae0a..18c4a58aff79 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -78,8 +78,8 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, F_STRUCT( __field_struct( struct ftrace_graph_ent, graph_ent ) - __field_desc( unsigned long, graph_ent, func ) - __field_desc( int, graph_ent, depth ) + __field_packed( unsigned long, graph_ent, func ) + __field_packed( int, graph_ent, depth ) ), F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth) @@ -92,11 +92,11 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, F_STRUCT( __field_struct( struct ftrace_graph_ret, ret ) - __field_desc( unsigned long, ret, func ) - __field_desc( unsigned long, ret, overrun ) - __field_desc( unsigned long long, ret, calltime) - __field_desc( unsigned long long, ret, rettime ) - __field_desc( int, ret, depth ) + __field_packed( unsigned long, ret, func ) + __field_packed( unsigned long, ret, overrun ) + __field_packed( unsigned long long, ret, calltime) + __field_packed( unsigned long long, ret, rettime ) + __field_packed( int, ret, depth ) ), F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d", diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 242f59e7f17d..f6f55682d3e2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2209,6 +2209,10 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) trace_create_file("hist", 0444, file->dir, file, &event_hist_fops); #endif +#ifdef CONFIG_HIST_TRIGGERS_DEBUG + trace_create_file("hist_debug", 0444, file->dir, file, + &event_hist_debug_fops); +#endif trace_create_file("format", 0444, file->dir, call, &ftrace_event_format_fops); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index fcab11cc6833..0b933546142e 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -19,13 +19,7 @@ #include <trace/events/mmflags.h> #include "tracing_map.h" -#include "trace.h" -#include "trace_dynevent.h" - -#define SYNTH_SYSTEM "synthetic" -#define SYNTH_FIELDS_MAX 32 - -#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ +#include "trace_synth.h" #define ERRORS \ C(NONE, "No error"), \ @@ -380,69 +374,6 @@ struct hist_trigger_data { unsigned int n_save_var_str; }; -static int create_synth_event(int argc, const char **argv); -static int synth_event_show(struct seq_file *m, struct dyn_event *ev); -static int synth_event_release(struct dyn_event *ev); -static bool synth_event_is_busy(struct dyn_event *ev); -static bool synth_event_match(const char *system, const char *event, - int argc, const char **argv, struct dyn_event *ev); - -static struct dyn_event_operations synth_event_ops = { - .create = create_synth_event, - .show = synth_event_show, - .is_busy = synth_event_is_busy, - .free = synth_event_release, - .match = synth_event_match, -}; - -struct synth_field { - char *type; - char *name; - size_t size; - unsigned int offset; - bool is_signed; - bool is_string; -}; - -struct synth_event { - struct dyn_event devent; - int ref; - char *name; - struct synth_field **fields; - unsigned int n_fields; - unsigned int n_u64; - struct trace_event_class class; - struct trace_event_call call; - struct tracepoint *tp; - struct module *mod; -}; - -static bool is_synth_event(struct dyn_event *ev) -{ - return ev->ops == &synth_event_ops; -} - -static struct synth_event *to_synth_event(struct dyn_event *ev) -{ - return container_of(ev, struct synth_event, devent); -} - -static bool synth_event_is_busy(struct dyn_event *ev) -{ - struct synth_event *event = to_synth_event(ev); - - return event->ref != 0; -} - -static bool synth_event_match(const char *system, const char *event, - int argc, const char **argv, struct dyn_event *ev) -{ - struct synth_event *sev = to_synth_event(ev); - - return strcmp(sev->name, event) == 0 && - (!system || strcmp(system, SYNTH_SYSTEM) == 0); -} - struct action_data; typedef void (*action_fn_t) (struct hist_trigger_data *hist_data, @@ -589,6 +520,7 @@ static struct track_data *track_data_alloc(unsigned int key_len, track_data_free(data); return ERR_PTR(-ENOMEM); } + data->elt.private_data = elt_data; elt_data->comm = kzalloc(TASK_COMM_LEN, GFP_KERNEL); @@ -621,7 +553,6 @@ static void last_cmd_set(struct trace_event_file *file, char *str) if (file) { call = file->event_call; - system = call->class->system; if (system) { name = trace_event_name(call); @@ -646,510 +577,6 @@ static void hist_err_clear(void) last_cmd_loc[0] = '\0'; } -struct synth_trace_event { - struct trace_entry ent; - u64 fields[]; -}; - -static int synth_event_define_fields(struct trace_event_call *call) -{ - struct synth_trace_event trace; - int offset = offsetof(typeof(trace), fields); - struct synth_event *event = call->data; - unsigned int i, size, n_u64; - char *name, *type; - bool is_signed; - int ret = 0; - - for (i = 0, n_u64 = 0; i < event->n_fields; i++) { - size = event->fields[i]->size; - is_signed = event->fields[i]->is_signed; - type = event->fields[i]->type; - name = event->fields[i]->name; - ret = trace_define_field(call, type, name, offset, size, - is_signed, FILTER_OTHER); - if (ret) - break; - - event->fields[i]->offset = n_u64; - - if (event->fields[i]->is_string) { - offset += STR_VAR_LEN_MAX; - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); - } else { - offset += sizeof(u64); - n_u64++; - } - } - - event->n_u64 = n_u64; - - return ret; -} - -static bool synth_field_signed(char *type) -{ - if (str_has_prefix(type, "u")) - return false; - if (strcmp(type, "gfp_t") == 0) - return false; - - return true; -} - -static int synth_field_is_string(char *type) -{ - if (strstr(type, "char[") != NULL) - return true; - - return false; -} - -static int synth_field_string_size(char *type) -{ - char buf[4], *end, *start; - unsigned int len; - int size, err; - - start = strstr(type, "char["); - if (start == NULL) - return -EINVAL; - start += sizeof("char[") - 1; - - end = strchr(type, ']'); - if (!end || end < start) - return -EINVAL; - - len = end - start; - if (len > 3) - return -EINVAL; - - strncpy(buf, start, len); - buf[len] = '\0'; - - err = kstrtouint(buf, 0, &size); - if (err) - return err; - - if (size > STR_VAR_LEN_MAX) - return -EINVAL; - - return size; -} - -static int synth_field_size(char *type) -{ - int size = 0; - - if (strcmp(type, "s64") == 0) - size = sizeof(s64); - else if (strcmp(type, "u64") == 0) - size = sizeof(u64); - else if (strcmp(type, "s32") == 0) - size = sizeof(s32); - else if (strcmp(type, "u32") == 0) - size = sizeof(u32); - else if (strcmp(type, "s16") == 0) - size = sizeof(s16); - else if (strcmp(type, "u16") == 0) - size = sizeof(u16); - else if (strcmp(type, "s8") == 0) - size = sizeof(s8); - else if (strcmp(type, "u8") == 0) - size = sizeof(u8); - else if (strcmp(type, "char") == 0) - size = sizeof(char); - else if (strcmp(type, "unsigned char") == 0) - size = sizeof(unsigned char); - else if (strcmp(type, "int") == 0) - size = sizeof(int); - else if (strcmp(type, "unsigned int") == 0) - size = sizeof(unsigned int); - else if (strcmp(type, "long") == 0) - size = sizeof(long); - else if (strcmp(type, "unsigned long") == 0) - size = sizeof(unsigned long); - else if (strcmp(type, "pid_t") == 0) - size = sizeof(pid_t); - else if (strcmp(type, "gfp_t") == 0) - size = sizeof(gfp_t); - else if (synth_field_is_string(type)) - size = synth_field_string_size(type); - - return size; -} - -static const char *synth_field_fmt(char *type) -{ - const char *fmt = "%llu"; - - if (strcmp(type, "s64") == 0) - fmt = "%lld"; - else if (strcmp(type, "u64") == 0) - fmt = "%llu"; - else if (strcmp(type, "s32") == 0) - fmt = "%d"; - else if (strcmp(type, "u32") == 0) - fmt = "%u"; - else if (strcmp(type, "s16") == 0) - fmt = "%d"; - else if (strcmp(type, "u16") == 0) - fmt = "%u"; - else if (strcmp(type, "s8") == 0) - fmt = "%d"; - else if (strcmp(type, "u8") == 0) - fmt = "%u"; - else if (strcmp(type, "char") == 0) - fmt = "%d"; - else if (strcmp(type, "unsigned char") == 0) - fmt = "%u"; - else if (strcmp(type, "int") == 0) - fmt = "%d"; - else if (strcmp(type, "unsigned int") == 0) - fmt = "%u"; - else if (strcmp(type, "long") == 0) - fmt = "%ld"; - else if (strcmp(type, "unsigned long") == 0) - fmt = "%lu"; - else if (strcmp(type, "pid_t") == 0) - fmt = "%d"; - else if (strcmp(type, "gfp_t") == 0) - fmt = "%x"; - else if (synth_field_is_string(type)) - fmt = "%s"; - - return fmt; -} - -static void print_synth_event_num_val(struct trace_seq *s, - char *print_fmt, char *name, - int size, u64 val, char *space) -{ - switch (size) { - case 1: - trace_seq_printf(s, print_fmt, name, (u8)val, space); - break; - - case 2: - trace_seq_printf(s, print_fmt, name, (u16)val, space); - break; - - case 4: - trace_seq_printf(s, print_fmt, name, (u32)val, space); - break; - - default: - trace_seq_printf(s, print_fmt, name, val, space); - break; - } -} - -static enum print_line_t print_synth_event(struct trace_iterator *iter, - int flags, - struct trace_event *event) -{ - struct trace_array *tr = iter->tr; - struct trace_seq *s = &iter->seq; - struct synth_trace_event *entry; - struct synth_event *se; - unsigned int i, n_u64; - char print_fmt[32]; - const char *fmt; - - entry = (struct synth_trace_event *)iter->ent; - se = container_of(event, struct synth_event, call.event); - - trace_seq_printf(s, "%s: ", se->name); - - for (i = 0, n_u64 = 0; i < se->n_fields; i++) { - if (trace_seq_has_overflowed(s)) - goto end; - - fmt = synth_field_fmt(se->fields[i]->type); - - /* parameter types */ - if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) - trace_seq_printf(s, "%s ", fmt); - - snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt); - - /* parameter values */ - if (se->fields[i]->is_string) { - trace_seq_printf(s, print_fmt, se->fields[i]->name, - (char *)&entry->fields[n_u64], - i == se->n_fields - 1 ? "" : " "); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); - } else { - struct trace_print_flags __flags[] = { - __def_gfpflag_names, {-1, NULL} }; - char *space = (i == se->n_fields - 1 ? "" : " "); - - print_synth_event_num_val(s, print_fmt, - se->fields[i]->name, - se->fields[i]->size, - entry->fields[n_u64], - space); - - if (strcmp(se->fields[i]->type, "gfp_t") == 0) { - trace_seq_puts(s, " ("); - trace_print_flags_seq(s, "|", - entry->fields[n_u64], - __flags); - trace_seq_putc(s, ')'); - } - n_u64++; - } - } -end: - trace_seq_putc(s, '\n'); - - return trace_handle_return(s); -} - -static struct trace_event_functions synth_event_funcs = { - .trace = print_synth_event -}; - -static notrace void trace_event_raw_event_synth(void *__data, - u64 *var_ref_vals, - unsigned int *var_ref_idx) -{ - struct trace_event_file *trace_file = __data; - struct synth_trace_event *entry; - struct trace_event_buffer fbuffer; - struct trace_buffer *buffer; - struct synth_event *event; - unsigned int i, n_u64, val_idx; - int fields_size = 0; - - event = trace_file->event_call->data; - - if (trace_trigger_soft_disabled(trace_file)) - return; - - fields_size = event->n_u64 * sizeof(u64); - - /* - * Avoid ring buffer recursion detection, as this event - * is being performed within another event. - */ - buffer = trace_file->tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); - - entry = trace_event_buffer_reserve(&fbuffer, trace_file, - sizeof(*entry) + fields_size); - if (!entry) - goto out; - - for (i = 0, n_u64 = 0; i < event->n_fields; i++) { - val_idx = var_ref_idx[i]; - if (event->fields[i]->is_string) { - char *str_val = (char *)(long)var_ref_vals[val_idx]; - char *str_field = (char *)&entry->fields[n_u64]; - - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); - } else { - struct synth_field *field = event->fields[i]; - u64 val = var_ref_vals[val_idx]; - - switch (field->size) { - case 1: - *(u8 *)&entry->fields[n_u64] = (u8)val; - break; - - case 2: - *(u16 *)&entry->fields[n_u64] = (u16)val; - break; - - case 4: - *(u32 *)&entry->fields[n_u64] = (u32)val; - break; - - default: - entry->fields[n_u64] = val; - break; - } - n_u64++; - } - } - - trace_event_buffer_commit(&fbuffer); -out: - ring_buffer_nest_end(buffer); -} - -static void free_synth_event_print_fmt(struct trace_event_call *call) -{ - if (call) { - kfree(call->print_fmt); - call->print_fmt = NULL; - } -} - -static int __set_synth_event_print_fmt(struct synth_event *event, - char *buf, int len) -{ - const char *fmt; - int pos = 0; - int i; - - /* When len=0, we just calculate the needed length */ -#define LEN_OR_ZERO (len ? len - pos : 0) - - pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); - for (i = 0; i < event->n_fields; i++) { - fmt = synth_field_fmt(event->fields[i]->type); - pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s", - event->fields[i]->name, fmt, - i == event->n_fields - 1 ? "" : ", "); - } - pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); - - for (i = 0; i < event->n_fields; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, - ", REC->%s", event->fields[i]->name); - } - -#undef LEN_OR_ZERO - - /* return the length of print_fmt */ - return pos; -} - -static int set_synth_event_print_fmt(struct trace_event_call *call) -{ - struct synth_event *event = call->data; - char *print_fmt; - int len; - - /* First: called with 0 length to calculate the needed length */ - len = __set_synth_event_print_fmt(event, NULL, 0); - - print_fmt = kmalloc(len + 1, GFP_KERNEL); - if (!print_fmt) - return -ENOMEM; - - /* Second: actually write the @print_fmt */ - __set_synth_event_print_fmt(event, print_fmt, len + 1); - call->print_fmt = print_fmt; - - return 0; -} - -static void free_synth_field(struct synth_field *field) -{ - kfree(field->type); - kfree(field->name); - kfree(field); -} - -static struct synth_field *parse_synth_field(int argc, const char **argv, - int *consumed) -{ - struct synth_field *field; - const char *prefix = NULL, *field_type = argv[0], *field_name, *array; - int len, ret = 0; - - if (field_type[0] == ';') - field_type++; - - if (!strcmp(field_type, "unsigned")) { - if (argc < 3) - return ERR_PTR(-EINVAL); - prefix = "unsigned "; - field_type = argv[1]; - field_name = argv[2]; - *consumed = 3; - } else { - field_name = argv[1]; - *consumed = 2; - } - - field = kzalloc(sizeof(*field), GFP_KERNEL); - if (!field) - return ERR_PTR(-ENOMEM); - - len = strlen(field_name); - array = strchr(field_name, '['); - if (array) - len -= strlen(array); - else if (field_name[len - 1] == ';') - len--; - - field->name = kmemdup_nul(field_name, len, GFP_KERNEL); - if (!field->name) { - ret = -ENOMEM; - goto free; - } - - if (field_type[0] == ';') - field_type++; - len = strlen(field_type) + 1; - if (array) - len += strlen(array); - if (prefix) - len += strlen(prefix); - - field->type = kzalloc(len, GFP_KERNEL); - if (!field->type) { - ret = -ENOMEM; - goto free; - } - if (prefix) - strcat(field->type, prefix); - strcat(field->type, field_type); - if (array) { - strcat(field->type, array); - if (field->type[len - 1] == ';') - field->type[len - 1] = '\0'; - } - - field->size = synth_field_size(field->type); - if (!field->size) { - ret = -EINVAL; - goto free; - } - - if (synth_field_is_string(field->type)) - field->is_string = true; - - field->is_signed = synth_field_signed(field->type); - - out: - return field; - free: - free_synth_field(field); - field = ERR_PTR(ret); - goto out; -} - -static void free_synth_tracepoint(struct tracepoint *tp) -{ - if (!tp) - return; - - kfree(tp->name); - kfree(tp); -} - -static struct tracepoint *alloc_synth_tracepoint(char *name) -{ - struct tracepoint *tp; - - tp = kzalloc(sizeof(*tp), GFP_KERNEL); - if (!tp) - return ERR_PTR(-ENOMEM); - - tp->name = kstrdup(name, GFP_KERNEL); - if (!tp->name) { - kfree(tp); - return ERR_PTR(-ENOMEM); - } - - return tp; -} - typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals, unsigned int *var_ref_idx); @@ -1177,145 +604,6 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals, } } -static struct synth_event *find_synth_event(const char *name) -{ - struct dyn_event *pos; - struct synth_event *event; - - for_each_dyn_event(pos) { - if (!is_synth_event(pos)) - continue; - event = to_synth_event(pos); - if (strcmp(event->name, name) == 0) - return event; - } - - return NULL; -} - -static struct trace_event_fields synth_event_fields_array[] = { - { .type = TRACE_FUNCTION_TYPE, - .define_fields = synth_event_define_fields }, - {} -}; - -static int register_synth_event(struct synth_event *event) -{ - struct trace_event_call *call = &event->call; - int ret = 0; - - event->call.class = &event->class; - event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL); - if (!event->class.system) { - ret = -ENOMEM; - goto out; - } - - event->tp = alloc_synth_tracepoint(event->name); - if (IS_ERR(event->tp)) { - ret = PTR_ERR(event->tp); - event->tp = NULL; - goto out; - } - - INIT_LIST_HEAD(&call->class->fields); - call->event.funcs = &synth_event_funcs; - call->class->fields_array = synth_event_fields_array; - - ret = register_trace_event(&call->event); - if (!ret) { - ret = -ENODEV; - goto out; - } - call->flags = TRACE_EVENT_FL_TRACEPOINT; - call->class->reg = trace_event_reg; - call->class->probe = trace_event_raw_event_synth; - call->data = event; - call->tp = event->tp; - - ret = trace_add_event_call(call); - if (ret) { - pr_warn("Failed to register synthetic event: %s\n", - trace_event_name(call)); - goto err; - } - - ret = set_synth_event_print_fmt(call); - if (ret < 0) { - trace_remove_event_call(call); - goto err; - } - out: - return ret; - err: - unregister_trace_event(&call->event); - goto out; -} - -static int unregister_synth_event(struct synth_event *event) -{ - struct trace_event_call *call = &event->call; - int ret; - - ret = trace_remove_event_call(call); - - return ret; -} - -static void free_synth_event(struct synth_event *event) -{ - unsigned int i; - - if (!event) - return; - - for (i = 0; i < event->n_fields; i++) - free_synth_field(event->fields[i]); - - kfree(event->fields); - kfree(event->name); - kfree(event->class.system); - free_synth_tracepoint(event->tp); - free_synth_event_print_fmt(&event->call); - kfree(event); -} - -static struct synth_event *alloc_synth_event(const char *name, int n_fields, - struct synth_field **fields) -{ - struct synth_event *event; - unsigned int i; - - event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) { - event = ERR_PTR(-ENOMEM); - goto out; - } - - event->name = kstrdup(name, GFP_KERNEL); - if (!event->name) { - kfree(event); - event = ERR_PTR(-ENOMEM); - goto out; - } - - event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL); - if (!event->fields) { - free_synth_event(event); - event = ERR_PTR(-ENOMEM); - goto out; - } - - dyn_event_init(&event->devent, &synth_event_ops); - - for (i = 0; i < n_fields; i++) - event->fields[i] = fields[i]; - - event->n_fields = n_fields; - out: - return event; -} - static void action_trace(struct hist_trigger_data *hist_data, struct tracing_map_elt *elt, void *rec, struct ring_buffer_event *rbe, void *key, @@ -1331,1056 +619,6 @@ struct hist_var_data { struct hist_trigger_data *hist_data; }; -static int synth_event_check_arg_fn(void *data) -{ - struct dynevent_arg_pair *arg_pair = data; - int size; - - size = synth_field_size((char *)arg_pair->lhs); - - return size ? 0 : -EINVAL; -} - -/** - * synth_event_add_field - Add a new field to a synthetic event cmd - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @type: The type of the new field to add - * @name: The name of the new field to add - * - * Add a new field to a synthetic event cmd object. Field ordering is in - * the same order the fields are added. - * - * See synth_field_size() for available types. If field_name contains - * [n] the field is considered to be an array. - * - * Return: 0 if successful, error otherwise. - */ -int synth_event_add_field(struct dynevent_cmd *cmd, const char *type, - const char *name) -{ - struct dynevent_arg_pair arg_pair; - int ret; - - if (cmd->type != DYNEVENT_TYPE_SYNTH) - return -EINVAL; - - if (!type || !name) - return -EINVAL; - - dynevent_arg_pair_init(&arg_pair, 0, ';'); - - arg_pair.lhs = type; - arg_pair.rhs = name; - - ret = dynevent_arg_pair_add(cmd, &arg_pair, synth_event_check_arg_fn); - if (ret) - return ret; - - if (++cmd->n_fields > SYNTH_FIELDS_MAX) - ret = -EINVAL; - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_add_field); - -/** - * synth_event_add_field_str - Add a new field to a synthetic event cmd - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @type_name: The type and name of the new field to add, as a single string - * - * Add a new field to a synthetic event cmd object, as a single - * string. The @type_name string is expected to be of the form 'type - * name', which will be appended by ';'. No sanity checking is done - - * what's passed in is assumed to already be well-formed. Field - * ordering is in the same order the fields are added. - * - * See synth_field_size() for available types. If field_name contains - * [n] the field is considered to be an array. - * - * Return: 0 if successful, error otherwise. - */ -int synth_event_add_field_str(struct dynevent_cmd *cmd, const char *type_name) -{ - struct dynevent_arg arg; - int ret; - - if (cmd->type != DYNEVENT_TYPE_SYNTH) - return -EINVAL; - - if (!type_name) - return -EINVAL; - - dynevent_arg_init(&arg, ';'); - - arg.str = type_name; - - ret = dynevent_arg_add(cmd, &arg, NULL); - if (ret) - return ret; - - if (++cmd->n_fields > SYNTH_FIELDS_MAX) - ret = -EINVAL; - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_add_field_str); - -/** - * synth_event_add_fields - Add multiple fields to a synthetic event cmd - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @fields: An array of type/name field descriptions - * @n_fields: The number of field descriptions contained in the fields array - * - * Add a new set of fields to a synthetic event cmd object. The event - * fields that will be defined for the event should be passed in as an - * array of struct synth_field_desc, and the number of elements in the - * array passed in as n_fields. Field ordering will retain the - * ordering given in the fields array. - * - * See synth_field_size() for available types. If field_name contains - * [n] the field is considered to be an array. - * - * Return: 0 if successful, error otherwise. - */ -int synth_event_add_fields(struct dynevent_cmd *cmd, - struct synth_field_desc *fields, - unsigned int n_fields) -{ - unsigned int i; - int ret = 0; - - for (i = 0; i < n_fields; i++) { - if (fields[i].type == NULL || fields[i].name == NULL) { - ret = -EINVAL; - break; - } - - ret = synth_event_add_field(cmd, fields[i].type, fields[i].name); - if (ret) - break; - } - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_add_fields); - -/** - * __synth_event_gen_cmd_start - Start a synthetic event command from arg list - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @name: The name of the synthetic event - * @mod: The module creating the event, NULL if not created from a module - * @args: Variable number of arg (pairs), one pair for each field - * - * NOTE: Users normally won't want to call this function directly, but - * rather use the synth_event_gen_cmd_start() wrapper, which - * automatically adds a NULL to the end of the arg list. If this - * function is used directly, make sure the last arg in the variable - * arg list is NULL. - * - * Generate a synthetic event command to be executed by - * synth_event_gen_cmd_end(). This function can be used to generate - * the complete command or only the first part of it; in the latter - * case, synth_event_add_field(), synth_event_add_field_str(), or - * synth_event_add_fields() can be used to add more fields following - * this. - * - * There should be an even number variable args, each pair consisting - * of a type followed by a field name. - * - * See synth_field_size() for available types. If field_name contains - * [n] the field is considered to be an array. - * - * Return: 0 if successful, error otherwise. - */ -int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd, const char *name, - struct module *mod, ...) -{ - struct dynevent_arg arg; - va_list args; - int ret; - - cmd->event_name = name; - cmd->private_data = mod; - - if (cmd->type != DYNEVENT_TYPE_SYNTH) - return -EINVAL; - - dynevent_arg_init(&arg, 0); - arg.str = name; - ret = dynevent_arg_add(cmd, &arg, NULL); - if (ret) - return ret; - - va_start(args, mod); - for (;;) { - const char *type, *name; - - type = va_arg(args, const char *); - if (!type) - break; - name = va_arg(args, const char *); - if (!name) - break; - - if (++cmd->n_fields > SYNTH_FIELDS_MAX) { - ret = -EINVAL; - break; - } - - ret = synth_event_add_field(cmd, type, name); - if (ret) - break; - } - va_end(args); - - return ret; -} -EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start); - -/** - * synth_event_gen_cmd_array_start - Start synthetic event command from an array - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @name: The name of the synthetic event - * @fields: An array of type/name field descriptions - * @n_fields: The number of field descriptions contained in the fields array - * - * Generate a synthetic event command to be executed by - * synth_event_gen_cmd_end(). This function can be used to generate - * the complete command or only the first part of it; in the latter - * case, synth_event_add_field(), synth_event_add_field_str(), or - * synth_event_add_fields() can be used to add more fields following - * this. - * - * The event fields that will be defined for the event should be - * passed in as an array of struct synth_field_desc, and the number of - * elements in the array passed in as n_fields. Field ordering will - * retain the ordering given in the fields array. - * - * See synth_field_size() for available types. If field_name contains - * [n] the field is considered to be an array. - * - * Return: 0 if successful, error otherwise. - */ -int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name, - struct module *mod, - struct synth_field_desc *fields, - unsigned int n_fields) -{ - struct dynevent_arg arg; - unsigned int i; - int ret = 0; - - cmd->event_name = name; - cmd->private_data = mod; - - if (cmd->type != DYNEVENT_TYPE_SYNTH) - return -EINVAL; - - if (n_fields > SYNTH_FIELDS_MAX) - return -EINVAL; - - dynevent_arg_init(&arg, 0); - arg.str = name; - ret = dynevent_arg_add(cmd, &arg, NULL); - if (ret) - return ret; - - for (i = 0; i < n_fields; i++) { - if (fields[i].type == NULL || fields[i].name == NULL) - return -EINVAL; - - ret = synth_event_add_field(cmd, fields[i].type, fields[i].name); - if (ret) - break; - } - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start); - -static int __create_synth_event(int argc, const char *name, const char **argv) -{ - struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; - struct synth_event *event = NULL; - int i, consumed = 0, n_fields = 0, ret = 0; - - /* - * Argument syntax: - * - Add synthetic event: <event_name> field[;field] ... - * - Remove synthetic event: !<event_name> field[;field] ... - * where 'field' = type field_name - */ - - if (name[0] == '\0' || argc < 1) - return -EINVAL; - - mutex_lock(&event_mutex); - - event = find_synth_event(name); - if (event) { - ret = -EEXIST; - goto out; - } - - for (i = 0; i < argc - 1; i++) { - if (strcmp(argv[i], ";") == 0) - continue; - if (n_fields == SYNTH_FIELDS_MAX) { - ret = -EINVAL; - goto err; - } - - field = parse_synth_field(argc - i, &argv[i], &consumed); - if (IS_ERR(field)) { - ret = PTR_ERR(field); - goto err; - } - fields[n_fields++] = field; - i += consumed - 1; - } - - if (i < argc && strcmp(argv[i], ";") != 0) { - ret = -EINVAL; - goto err; - } - - event = alloc_synth_event(name, n_fields, fields); - if (IS_ERR(event)) { - ret = PTR_ERR(event); - event = NULL; - goto err; - } - ret = register_synth_event(event); - if (!ret) - dyn_event_add(&event->devent); - else - free_synth_event(event); - out: - mutex_unlock(&event_mutex); - - return ret; - err: - for (i = 0; i < n_fields; i++) - free_synth_field(fields[i]); - - goto out; -} - -/** - * synth_event_create - Create a new synthetic event - * @name: The name of the new sythetic event - * @fields: An array of type/name field descriptions - * @n_fields: The number of field descriptions contained in the fields array - * @mod: The module creating the event, NULL if not created from a module - * - * Create a new synthetic event with the given name under the - * trace/events/synthetic/ directory. The event fields that will be - * defined for the event should be passed in as an array of struct - * synth_field_desc, and the number elements in the array passed in as - * n_fields. Field ordering will retain the ordering given in the - * fields array. - * - * If the new synthetic event is being created from a module, the mod - * param must be non-NULL. This will ensure that the trace buffer - * won't contain unreadable events. - * - * The new synth event should be deleted using synth_event_delete() - * function. The new synthetic event can be generated from modules or - * other kernel code using trace_synth_event() and related functions. - * - * Return: 0 if successful, error otherwise. - */ -int synth_event_create(const char *name, struct synth_field_desc *fields, - unsigned int n_fields, struct module *mod) -{ - struct dynevent_cmd cmd; - char *buf; - int ret; - - buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN); - - ret = synth_event_gen_cmd_array_start(&cmd, name, mod, - fields, n_fields); - if (ret) - goto out; - - ret = synth_event_gen_cmd_end(&cmd); - out: - kfree(buf); - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_create); - -static int destroy_synth_event(struct synth_event *se) -{ - int ret; - - if (se->ref) - ret = -EBUSY; - else { - ret = unregister_synth_event(se); - if (!ret) { - dyn_event_remove(&se->devent); - free_synth_event(se); - } - } - - return ret; -} - -/** - * synth_event_delete - Delete a synthetic event - * @event_name: The name of the new sythetic event - * - * Delete a synthetic event that was created with synth_event_create(). - * - * Return: 0 if successful, error otherwise. - */ -int synth_event_delete(const char *event_name) -{ - struct synth_event *se = NULL; - struct module *mod = NULL; - int ret = -ENOENT; - - mutex_lock(&event_mutex); - se = find_synth_event(event_name); - if (se) { - mod = se->mod; - ret = destroy_synth_event(se); - } - mutex_unlock(&event_mutex); - - if (mod) { - mutex_lock(&trace_types_lock); - /* - * It is safest to reset the ring buffer if the module - * being unloaded registered any events that were - * used. The only worry is if a new module gets - * loaded, and takes on the same id as the events of - * this module. When printing out the buffer, traced - * events left over from this module may be passed to - * the new module events and unexpected results may - * occur. - */ - tracing_reset_all_online_cpus(); - mutex_unlock(&trace_types_lock); - } - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_delete); - -static int create_or_delete_synth_event(int argc, char **argv) -{ - const char *name = argv[0]; - int ret; - - /* trace_run_command() ensures argc != 0 */ - if (name[0] == '!') { - ret = synth_event_delete(name + 1); - return ret; - } - - ret = __create_synth_event(argc - 1, name, (const char **)argv + 1); - return ret == -ECANCELED ? -EINVAL : ret; -} - -static int synth_event_run_command(struct dynevent_cmd *cmd) -{ - struct synth_event *se; - int ret; - - ret = trace_run_command(cmd->seq.buffer, create_or_delete_synth_event); - if (ret) - return ret; - - se = find_synth_event(cmd->event_name); - if (WARN_ON(!se)) - return -ENOENT; - - se->mod = cmd->private_data; - - return ret; -} - -/** - * synth_event_cmd_init - Initialize a synthetic event command object - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @buf: A pointer to the buffer used to build the command - * @maxlen: The length of the buffer passed in @buf - * - * Initialize a synthetic event command object. Use this before - * calling any of the other dyenvent_cmd functions. - */ -void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen) -{ - dynevent_cmd_init(cmd, buf, maxlen, DYNEVENT_TYPE_SYNTH, - synth_event_run_command); -} -EXPORT_SYMBOL_GPL(synth_event_cmd_init); - -static inline int -__synth_event_trace_start(struct trace_event_file *file, - struct synth_event_trace_state *trace_state) -{ - int entry_size, fields_size = 0; - int ret = 0; - - memset(trace_state, '\0', sizeof(*trace_state)); - - /* - * Normal event tracing doesn't get called at all unless the - * ENABLED bit is set (which attaches the probe thus allowing - * this code to be called, etc). Because this is called - * directly by the user, we don't have that but we still need - * to honor not logging when disabled. For the the iterated - * trace case, we save the enabed state upon start and just - * ignore the following data calls. - */ - if (!(file->flags & EVENT_FILE_FL_ENABLED) || - trace_trigger_soft_disabled(file)) { - trace_state->disabled = true; - ret = -ENOENT; - goto out; - } - - trace_state->event = file->event_call->data; - - fields_size = trace_state->event->n_u64 * sizeof(u64); - - /* - * Avoid ring buffer recursion detection, as this event - * is being performed within another event. - */ - trace_state->buffer = file->tr->array_buffer.buffer; - ring_buffer_nest_start(trace_state->buffer); - - entry_size = sizeof(*trace_state->entry) + fields_size; - trace_state->entry = trace_event_buffer_reserve(&trace_state->fbuffer, - file, - entry_size); - if (!trace_state->entry) { - ring_buffer_nest_end(trace_state->buffer); - ret = -EINVAL; - } -out: - return ret; -} - -static inline void -__synth_event_trace_end(struct synth_event_trace_state *trace_state) -{ - trace_event_buffer_commit(&trace_state->fbuffer); - - ring_buffer_nest_end(trace_state->buffer); -} - -/** - * synth_event_trace - Trace a synthetic event - * @file: The trace_event_file representing the synthetic event - * @n_vals: The number of values in vals - * @args: Variable number of args containing the event values - * - * Trace a synthetic event using the values passed in the variable - * argument list. - * - * The argument list should be a list 'n_vals' u64 values. The number - * of vals must match the number of field in the synthetic event, and - * must be in the same order as the synthetic event fields. - * - * All vals should be cast to u64, and string vals are just pointers - * to strings, cast to u64. Strings will be copied into space - * reserved in the event for the string, using these pointers. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) -{ - struct synth_event_trace_state state; - unsigned int i, n_u64; - va_list args; - int ret; - - ret = __synth_event_trace_start(file, &state); - if (ret) { - if (ret == -ENOENT) - ret = 0; /* just disabled, not really an error */ - return ret; - } - - if (n_vals != state.event->n_fields) { - ret = -EINVAL; - goto out; - } - - va_start(args, n_vals); - for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { - u64 val; - - val = va_arg(args, u64); - - if (state.event->fields[i]->is_string) { - char *str_val = (char *)(long)val; - char *str_field = (char *)&state.entry->fields[n_u64]; - - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); - } else { - struct synth_field *field = state.event->fields[i]; - - switch (field->size) { - case 1: - *(u8 *)&state.entry->fields[n_u64] = (u8)val; - break; - - case 2: - *(u16 *)&state.entry->fields[n_u64] = (u16)val; - break; - - case 4: - *(u32 *)&state.entry->fields[n_u64] = (u32)val; - break; - - default: - state.entry->fields[n_u64] = val; - break; - } - n_u64++; - } - } - va_end(args); -out: - __synth_event_trace_end(&state); - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_trace); - -/** - * synth_event_trace_array - Trace a synthetic event from an array - * @file: The trace_event_file representing the synthetic event - * @vals: Array of values - * @n_vals: The number of values in vals - * - * Trace a synthetic event using the values passed in as 'vals'. - * - * The 'vals' array is just an array of 'n_vals' u64. The number of - * vals must match the number of field in the synthetic event, and - * must be in the same order as the synthetic event fields. - * - * All vals should be cast to u64, and string vals are just pointers - * to strings, cast to u64. Strings will be copied into space - * reserved in the event for the string, using these pointers. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_trace_array(struct trace_event_file *file, u64 *vals, - unsigned int n_vals) -{ - struct synth_event_trace_state state; - unsigned int i, n_u64; - int ret; - - ret = __synth_event_trace_start(file, &state); - if (ret) { - if (ret == -ENOENT) - ret = 0; /* just disabled, not really an error */ - return ret; - } - - if (n_vals != state.event->n_fields) { - ret = -EINVAL; - goto out; - } - - for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { - if (state.event->fields[i]->is_string) { - char *str_val = (char *)(long)vals[i]; - char *str_field = (char *)&state.entry->fields[n_u64]; - - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); - } else { - struct synth_field *field = state.event->fields[i]; - u64 val = vals[i]; - - switch (field->size) { - case 1: - *(u8 *)&state.entry->fields[n_u64] = (u8)val; - break; - - case 2: - *(u16 *)&state.entry->fields[n_u64] = (u16)val; - break; - - case 4: - *(u32 *)&state.entry->fields[n_u64] = (u32)val; - break; - - default: - state.entry->fields[n_u64] = val; - break; - } - n_u64++; - } - } -out: - __synth_event_trace_end(&state); - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_trace_array); - -/** - * synth_event_trace_start - Start piecewise synthetic event trace - * @file: The trace_event_file representing the synthetic event - * @trace_state: A pointer to object tracking the piecewise trace state - * - * Start the trace of a synthetic event field-by-field rather than all - * at once. - * - * This function 'opens' an event trace, which means space is reserved - * for the event in the trace buffer, after which the event's - * individual field values can be set through either - * synth_event_add_next_val() or synth_event_add_val(). - * - * A pointer to a trace_state object is passed in, which will keep - * track of the current event trace state until the event trace is - * closed (and the event finally traced) using - * synth_event_trace_end(). - * - * Note that synth_event_trace_end() must be called after all values - * have been added for each event trace, regardless of whether adding - * all field values succeeded or not. - * - * Note also that for a given event trace, all fields must be added - * using either synth_event_add_next_val() or synth_event_add_val() - * but not both together or interleaved. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_trace_start(struct trace_event_file *file, - struct synth_event_trace_state *trace_state) -{ - int ret; - - if (!trace_state) - return -EINVAL; - - ret = __synth_event_trace_start(file, trace_state); - if (ret == -ENOENT) - ret = 0; /* just disabled, not really an error */ - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_trace_start); - -static int __synth_event_add_val(const char *field_name, u64 val, - struct synth_event_trace_state *trace_state) -{ - struct synth_field *field = NULL; - struct synth_trace_event *entry; - struct synth_event *event; - int i, ret = 0; - - if (!trace_state) { - ret = -EINVAL; - goto out; - } - - /* can't mix add_next_synth_val() with add_synth_val() */ - if (field_name) { - if (trace_state->add_next) { - ret = -EINVAL; - goto out; - } - trace_state->add_name = true; - } else { - if (trace_state->add_name) { - ret = -EINVAL; - goto out; - } - trace_state->add_next = true; - } - - if (trace_state->disabled) - goto out; - - event = trace_state->event; - if (trace_state->add_name) { - for (i = 0; i < event->n_fields; i++) { - field = event->fields[i]; - if (strcmp(field->name, field_name) == 0) - break; - } - if (!field) { - ret = -EINVAL; - goto out; - } - } else { - if (trace_state->cur_field >= event->n_fields) { - ret = -EINVAL; - goto out; - } - field = event->fields[trace_state->cur_field++]; - } - - entry = trace_state->entry; - if (field->is_string) { - char *str_val = (char *)(long)val; - char *str_field; - - if (!str_val) { - ret = -EINVAL; - goto out; - } - - str_field = (char *)&entry->fields[field->offset]; - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - } else { - switch (field->size) { - case 1: - *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val; - break; - - case 2: - *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val; - break; - - case 4: - *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val; - break; - - default: - trace_state->entry->fields[field->offset] = val; - break; - } - } - out: - return ret; -} - -/** - * synth_event_add_next_val - Add the next field's value to an open synth trace - * @val: The value to set the next field to - * @trace_state: A pointer to object tracking the piecewise trace state - * - * Set the value of the next field in an event that's been opened by - * synth_event_trace_start(). - * - * The val param should be the value cast to u64. If the value points - * to a string, the val param should be a char * cast to u64. - * - * This function assumes all the fields in an event are to be set one - * after another - successive calls to this function are made, one for - * each field, in the order of the fields in the event, until all - * fields have been set. If you'd rather set each field individually - * without regard to ordering, synth_event_add_val() can be used - * instead. - * - * Note however that synth_event_add_next_val() and - * synth_event_add_val() can't be intermixed for a given event trace - - * one or the other but not both can be used at the same time. - * - * Note also that synth_event_trace_end() must be called after all - * values have been added for each event trace, regardless of whether - * adding all field values succeeded or not. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_add_next_val(u64 val, - struct synth_event_trace_state *trace_state) -{ - return __synth_event_add_val(NULL, val, trace_state); -} -EXPORT_SYMBOL_GPL(synth_event_add_next_val); - -/** - * synth_event_add_val - Add a named field's value to an open synth trace - * @field_name: The name of the synthetic event field value to set - * @val: The value to set the next field to - * @trace_state: A pointer to object tracking the piecewise trace state - * - * Set the value of the named field in an event that's been opened by - * synth_event_trace_start(). - * - * The val param should be the value cast to u64. If the value points - * to a string, the val param should be a char * cast to u64. - * - * This function looks up the field name, and if found, sets the field - * to the specified value. This lookup makes this function more - * expensive than synth_event_add_next_val(), so use that or the - * none-piecewise synth_event_trace() instead if efficiency is more - * important. - * - * Note however that synth_event_add_next_val() and - * synth_event_add_val() can't be intermixed for a given event trace - - * one or the other but not both can be used at the same time. - * - * Note also that synth_event_trace_end() must be called after all - * values have been added for each event trace, regardless of whether - * adding all field values succeeded or not. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_add_val(const char *field_name, u64 val, - struct synth_event_trace_state *trace_state) -{ - return __synth_event_add_val(field_name, val, trace_state); -} -EXPORT_SYMBOL_GPL(synth_event_add_val); - -/** - * synth_event_trace_end - End piecewise synthetic event trace - * @trace_state: A pointer to object tracking the piecewise trace state - * - * End the trace of a synthetic event opened by - * synth_event_trace__start(). - * - * This function 'closes' an event trace, which basically means that - * it commits the reserved event and cleans up other loose ends. - * - * A pointer to a trace_state object is passed in, which will keep - * track of the current event trace state opened with - * synth_event_trace_start(). - * - * Note that this function must be called after all values have been - * added for each event trace, regardless of whether adding all field - * values succeeded or not. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_trace_end(struct synth_event_trace_state *trace_state) -{ - if (!trace_state) - return -EINVAL; - - __synth_event_trace_end(trace_state); - - return 0; -} -EXPORT_SYMBOL_GPL(synth_event_trace_end); - -static int create_synth_event(int argc, const char **argv) -{ - const char *name = argv[0]; - int len; - - if (name[0] != 's' || name[1] != ':') - return -ECANCELED; - name += 2; - - /* This interface accepts group name prefix */ - if (strchr(name, '/')) { - len = str_has_prefix(name, SYNTH_SYSTEM "/"); - if (len == 0) - return -EINVAL; - name += len; - } - return __create_synth_event(argc - 1, name, argv + 1); -} - -static int synth_event_release(struct dyn_event *ev) -{ - struct synth_event *event = to_synth_event(ev); - int ret; - - if (event->ref) - return -EBUSY; - - ret = unregister_synth_event(event); - if (ret) - return ret; - - dyn_event_remove(ev); - free_synth_event(event); - return 0; -} - -static int __synth_event_show(struct seq_file *m, struct synth_event *event) -{ - struct synth_field *field; - unsigned int i; - - seq_printf(m, "%s\t", event->name); - - for (i = 0; i < event->n_fields; i++) { - field = event->fields[i]; - - /* parameter values */ - seq_printf(m, "%s %s%s", field->type, field->name, - i == event->n_fields - 1 ? "" : "; "); - } - - seq_putc(m, '\n'); - - return 0; -} - -static int synth_event_show(struct seq_file *m, struct dyn_event *ev) -{ - struct synth_event *event = to_synth_event(ev); - - seq_printf(m, "s:%s/", event->class.system); - - return __synth_event_show(m, event); -} - -static int synth_events_seq_show(struct seq_file *m, void *v) -{ - struct dyn_event *ev = v; - - if (!is_synth_event(ev)) - return 0; - - return __synth_event_show(m, to_synth_event(ev)); -} - -static const struct seq_operations synth_events_seq_op = { - .start = dyn_event_seq_start, - .next = dyn_event_seq_next, - .stop = dyn_event_seq_stop, - .show = synth_events_seq_show, -}; - -static int synth_events_open(struct inode *inode, struct file *file) -{ - int ret; - - ret = security_locked_down(LOCKDOWN_TRACEFS); - if (ret) - return ret; - - if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - ret = dyn_events_release_all(&synth_event_ops); - if (ret < 0) - return ret; - } - - return seq_open(file, &synth_events_seq_op); -} - -static ssize_t synth_events_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *ppos) -{ - return trace_parse_run_command(file, buffer, count, ppos, - create_or_delete_synth_event); -} - -static const struct file_operations synth_events_fops = { - .open = synth_events_open, - .write = synth_events_write, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static u64 hist_field_timestamp(struct hist_field *hist_field, struct tracing_map_elt *elt, struct ring_buffer_event *rbe, @@ -6491,6 +4729,279 @@ const struct file_operations event_hist_fops = { .release = single_release, }; +#ifdef CONFIG_HIST_TRIGGERS_DEBUG +static void hist_field_debug_show_flags(struct seq_file *m, + unsigned long flags) +{ + seq_puts(m, " flags:\n"); + + if (flags & HIST_FIELD_FL_KEY) + seq_puts(m, " HIST_FIELD_FL_KEY\n"); + else if (flags & HIST_FIELD_FL_HITCOUNT) + seq_puts(m, " VAL: HIST_FIELD_FL_HITCOUNT\n"); + else if (flags & HIST_FIELD_FL_VAR) + seq_puts(m, " HIST_FIELD_FL_VAR\n"); + else if (flags & HIST_FIELD_FL_VAR_REF) + seq_puts(m, " HIST_FIELD_FL_VAR_REF\n"); + else + seq_puts(m, " VAL: normal u64 value\n"); + + if (flags & HIST_FIELD_FL_ALIAS) + seq_puts(m, " HIST_FIELD_FL_ALIAS\n"); +} + +static int hist_field_debug_show(struct seq_file *m, + struct hist_field *field, unsigned long flags) +{ + if ((field->flags & flags) != flags) { + seq_printf(m, "ERROR: bad flags - %lx\n", flags); + return -EINVAL; + } + + hist_field_debug_show_flags(m, field->flags); + if (field->field) + seq_printf(m, " ftrace_event_field name: %s\n", + field->field->name); + + if (field->flags & HIST_FIELD_FL_VAR) { + seq_printf(m, " var.name: %s\n", field->var.name); + seq_printf(m, " var.idx (into tracing_map_elt.vars[]): %u\n", + field->var.idx); + } + + if (field->flags & HIST_FIELD_FL_ALIAS) + seq_printf(m, " var_ref_idx (into hist_data->var_refs[]): %u\n", + field->var_ref_idx); + + if (field->flags & HIST_FIELD_FL_VAR_REF) { + seq_printf(m, " name: %s\n", field->name); + seq_printf(m, " var.idx (into tracing_map_elt.vars[]): %u\n", + field->var.idx); + seq_printf(m, " var.hist_data: %p\n", field->var.hist_data); + seq_printf(m, " var_ref_idx (into hist_data->var_refs[]): %u\n", + field->var_ref_idx); + if (field->system) + seq_printf(m, " system: %s\n", field->system); + if (field->event_name) + seq_printf(m, " event_name: %s\n", field->event_name); + } + + seq_printf(m, " type: %s\n", field->type); + seq_printf(m, " size: %u\n", field->size); + seq_printf(m, " is_signed: %u\n", field->is_signed); + + return 0; +} + +static int field_var_debug_show(struct seq_file *m, + struct field_var *field_var, unsigned int i, + bool save_vars) +{ + const char *vars_name = save_vars ? "save_vars" : "field_vars"; + struct hist_field *field; + int ret = 0; + + seq_printf(m, "\n hist_data->%s[%d]:\n", vars_name, i); + + field = field_var->var; + + seq_printf(m, "\n %s[%d].var:\n", vars_name, i); + + hist_field_debug_show_flags(m, field->flags); + seq_printf(m, " var.name: %s\n", field->var.name); + seq_printf(m, " var.idx (into tracing_map_elt.vars[]): %u\n", + field->var.idx); + + field = field_var->val; + + seq_printf(m, "\n %s[%d].val:\n", vars_name, i); + if (field->field) + seq_printf(m, " ftrace_event_field name: %s\n", + field->field->name); + else { + ret = -EINVAL; + goto out; + } + + seq_printf(m, " type: %s\n", field->type); + seq_printf(m, " size: %u\n", field->size); + seq_printf(m, " is_signed: %u\n", field->is_signed); +out: + return ret; +} + +static int hist_action_debug_show(struct seq_file *m, + struct action_data *data, int i) +{ + int ret = 0; + + if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) { + seq_printf(m, "\n hist_data->actions[%d].track_data.var_ref:\n", i); + ret = hist_field_debug_show(m, data->track_data.var_ref, + HIST_FIELD_FL_VAR_REF); + if (ret) + goto out; + + seq_printf(m, "\n hist_data->actions[%d].track_data.track_var:\n", i); + ret = hist_field_debug_show(m, data->track_data.track_var, + HIST_FIELD_FL_VAR); + if (ret) + goto out; + } + + if (data->handler == HANDLER_ONMATCH) { + seq_printf(m, "\n hist_data->actions[%d].match_data.event_system: %s\n", + i, data->match_data.event_system); + seq_printf(m, " hist_data->actions[%d].match_data.event: %s\n", + i, data->match_data.event); + } +out: + return ret; +} + +static int hist_actions_debug_show(struct seq_file *m, + struct hist_trigger_data *hist_data) +{ + int i, ret = 0; + + if (hist_data->n_actions) + seq_puts(m, "\n action tracking variables (for onmax()/onchange()/onmatch()):\n"); + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *action = hist_data->actions[i]; + + ret = hist_action_debug_show(m, action, i); + if (ret) + goto out; + } + + if (hist_data->n_save_vars) + seq_puts(m, "\n save action variables (save() params):\n"); + + for (i = 0; i < hist_data->n_save_vars; i++) { + ret = field_var_debug_show(m, hist_data->save_vars[i], i, true); + if (ret) + goto out; + } +out: + return ret; +} + +static void hist_trigger_debug_show(struct seq_file *m, + struct event_trigger_data *data, int n) +{ + struct hist_trigger_data *hist_data; + int i, ret; + + if (n > 0) + seq_puts(m, "\n\n"); + + seq_puts(m, "# event histogram\n#\n# trigger info: "); + data->ops->print(m, data->ops, data); + seq_puts(m, "#\n\n"); + + hist_data = data->private_data; + + seq_printf(m, "hist_data: %p\n\n", hist_data); + seq_printf(m, " n_vals: %u\n", hist_data->n_vals); + seq_printf(m, " n_keys: %u\n", hist_data->n_keys); + seq_printf(m, " n_fields: %u\n", hist_data->n_fields); + + seq_puts(m, "\n val fields:\n\n"); + + seq_puts(m, " hist_data->fields[0]:\n"); + ret = hist_field_debug_show(m, hist_data->fields[0], + HIST_FIELD_FL_HITCOUNT); + if (ret) + return; + + for (i = 1; i < hist_data->n_vals; i++) { + seq_printf(m, "\n hist_data->fields[%d]:\n", i); + ret = hist_field_debug_show(m, hist_data->fields[i], 0); + if (ret) + return; + } + + seq_puts(m, "\n key fields:\n"); + + for (i = hist_data->n_vals; i < hist_data->n_fields; i++) { + seq_printf(m, "\n hist_data->fields[%d]:\n", i); + ret = hist_field_debug_show(m, hist_data->fields[i], + HIST_FIELD_FL_KEY); + if (ret) + return; + } + + if (hist_data->n_var_refs) + seq_puts(m, "\n variable reference fields:\n"); + + for (i = 0; i < hist_data->n_var_refs; i++) { + seq_printf(m, "\n hist_data->var_refs[%d]:\n", i); + ret = hist_field_debug_show(m, hist_data->var_refs[i], + HIST_FIELD_FL_VAR_REF); + if (ret) + return; + } + + if (hist_data->n_field_vars) + seq_puts(m, "\n field variables:\n"); + + for (i = 0; i < hist_data->n_field_vars; i++) { + ret = field_var_debug_show(m, hist_data->field_vars[i], i, false); + if (ret) + return; + } + + ret = hist_actions_debug_show(m, hist_data); + if (ret) + return; +} + +static int hist_debug_show(struct seq_file *m, void *v) +{ + struct event_trigger_data *data; + struct trace_event_file *event_file; + int n = 0, ret = 0; + + mutex_lock(&event_mutex); + + event_file = event_file_data(m->private); + if (unlikely(!event_file)) { + ret = -ENODEV; + goto out_unlock; + } + + list_for_each_entry(data, &event_file->triggers, list) { + if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) + hist_trigger_debug_show(m, data, n++); + } + + out_unlock: + mutex_unlock(&event_mutex); + + return ret; +} + +static int event_hist_debug_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + return single_open(file, hist_debug_show, file); +} + +const struct file_operations event_hist_debug_fops = { + .open = event_hist_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) { const char *field_name = hist_field_name(hist_field, 0); @@ -7393,37 +5904,3 @@ __init int register_trigger_hist_enable_disable_cmds(void) return ret; } - -static __init int trace_events_hist_init(void) -{ - struct dentry *entry = NULL; - struct dentry *d_tracer; - int err = 0; - - err = dyn_event_register(&synth_event_ops); - if (err) { - pr_warn("Could not register synth_event_ops\n"); - return err; - } - - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) { - err = PTR_ERR(d_tracer); - goto err; - } - - entry = tracefs_create_file("synthetic_events", 0644, d_tracer, - NULL, &synth_events_fops); - if (!entry) { - err = -ENODEV; - goto err; - } - - return err; - err: - pr_warn("Could not create tracefs 'synthetic_events' entry\n"); - - return err; -} - -fs_initcall(trace_events_hist_init); diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c new file mode 100644 index 000000000000..c6cca0d1d584 --- /dev/null +++ b/kernel/trace/trace_events_synth.c @@ -0,0 +1,1789 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * trace_events_synth - synthetic trace events + * + * Copyright (C) 2015, 2020 Tom Zanussi <tom.zanussi@linux.intel.com> + */ + +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/security.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/stacktrace.h> +#include <linux/rculist.h> +#include <linux/tracefs.h> + +/* for gfp flag names */ +#include <linux/trace_events.h> +#include <trace/events/mmflags.h> + +#include "trace_synth.h" + +static int create_synth_event(int argc, const char **argv); +static int synth_event_show(struct seq_file *m, struct dyn_event *ev); +static int synth_event_release(struct dyn_event *ev); +static bool synth_event_is_busy(struct dyn_event *ev); +static bool synth_event_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev); + +static struct dyn_event_operations synth_event_ops = { + .create = create_synth_event, + .show = synth_event_show, + .is_busy = synth_event_is_busy, + .free = synth_event_release, + .match = synth_event_match, +}; + +static bool is_synth_event(struct dyn_event *ev) +{ + return ev->ops == &synth_event_ops; +} + +static struct synth_event *to_synth_event(struct dyn_event *ev) +{ + return container_of(ev, struct synth_event, devent); +} + +static bool synth_event_is_busy(struct dyn_event *ev) +{ + struct synth_event *event = to_synth_event(ev); + + return event->ref != 0; +} + +static bool synth_event_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev) +{ + struct synth_event *sev = to_synth_event(ev); + + return strcmp(sev->name, event) == 0 && + (!system || strcmp(system, SYNTH_SYSTEM) == 0); +} + +struct synth_trace_event { + struct trace_entry ent; + u64 fields[]; +}; + +static int synth_event_define_fields(struct trace_event_call *call) +{ + struct synth_trace_event trace; + int offset = offsetof(typeof(trace), fields); + struct synth_event *event = call->data; + unsigned int i, size, n_u64; + char *name, *type; + bool is_signed; + int ret = 0; + + for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + size = event->fields[i]->size; + is_signed = event->fields[i]->is_signed; + type = event->fields[i]->type; + name = event->fields[i]->name; + ret = trace_define_field(call, type, name, offset, size, + is_signed, FILTER_OTHER); + if (ret) + break; + + event->fields[i]->offset = n_u64; + + if (event->fields[i]->is_string) { + offset += STR_VAR_LEN_MAX; + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + offset += sizeof(u64); + n_u64++; + } + } + + event->n_u64 = n_u64; + + return ret; +} + +static bool synth_field_signed(char *type) +{ + if (str_has_prefix(type, "u")) + return false; + if (strcmp(type, "gfp_t") == 0) + return false; + + return true; +} + +static int synth_field_is_string(char *type) +{ + if (strstr(type, "char[") != NULL) + return true; + + return false; +} + +static int synth_field_string_size(char *type) +{ + char buf[4], *end, *start; + unsigned int len; + int size, err; + + start = strstr(type, "char["); + if (start == NULL) + return -EINVAL; + start += sizeof("char[") - 1; + + end = strchr(type, ']'); + if (!end || end < start) + return -EINVAL; + + len = end - start; + if (len > 3) + return -EINVAL; + + strncpy(buf, start, len); + buf[len] = '\0'; + + err = kstrtouint(buf, 0, &size); + if (err) + return err; + + if (size > STR_VAR_LEN_MAX) + return -EINVAL; + + return size; +} + +static int synth_field_size(char *type) +{ + int size = 0; + + if (strcmp(type, "s64") == 0) + size = sizeof(s64); + else if (strcmp(type, "u64") == 0) + size = sizeof(u64); + else if (strcmp(type, "s32") == 0) + size = sizeof(s32); + else if (strcmp(type, "u32") == 0) + size = sizeof(u32); + else if (strcmp(type, "s16") == 0) + size = sizeof(s16); + else if (strcmp(type, "u16") == 0) + size = sizeof(u16); + else if (strcmp(type, "s8") == 0) + size = sizeof(s8); + else if (strcmp(type, "u8") == 0) + size = sizeof(u8); + else if (strcmp(type, "char") == 0) + size = sizeof(char); + else if (strcmp(type, "unsigned char") == 0) + size = sizeof(unsigned char); + else if (strcmp(type, "int") == 0) + size = sizeof(int); + else if (strcmp(type, "unsigned int") == 0) + size = sizeof(unsigned int); + else if (strcmp(type, "long") == 0) + size = sizeof(long); + else if (strcmp(type, "unsigned long") == 0) + size = sizeof(unsigned long); + else if (strcmp(type, "pid_t") == 0) + size = sizeof(pid_t); + else if (strcmp(type, "gfp_t") == 0) + size = sizeof(gfp_t); + else if (synth_field_is_string(type)) + size = synth_field_string_size(type); + + return size; +} + +static const char *synth_field_fmt(char *type) +{ + const char *fmt = "%llu"; + + if (strcmp(type, "s64") == 0) + fmt = "%lld"; + else if (strcmp(type, "u64") == 0) + fmt = "%llu"; + else if (strcmp(type, "s32") == 0) + fmt = "%d"; + else if (strcmp(type, "u32") == 0) + fmt = "%u"; + else if (strcmp(type, "s16") == 0) + fmt = "%d"; + else if (strcmp(type, "u16") == 0) + fmt = "%u"; + else if (strcmp(type, "s8") == 0) + fmt = "%d"; + else if (strcmp(type, "u8") == 0) + fmt = "%u"; + else if (strcmp(type, "char") == 0) + fmt = "%d"; + else if (strcmp(type, "unsigned char") == 0) + fmt = "%u"; + else if (strcmp(type, "int") == 0) + fmt = "%d"; + else if (strcmp(type, "unsigned int") == 0) + fmt = "%u"; + else if (strcmp(type, "long") == 0) + fmt = "%ld"; + else if (strcmp(type, "unsigned long") == 0) + fmt = "%lu"; + else if (strcmp(type, "pid_t") == 0) + fmt = "%d"; + else if (strcmp(type, "gfp_t") == 0) + fmt = "%x"; + else if (synth_field_is_string(type)) + fmt = "%s"; + + return fmt; +} + +static void print_synth_event_num_val(struct trace_seq *s, + char *print_fmt, char *name, + int size, u64 val, char *space) +{ + switch (size) { + case 1: + trace_seq_printf(s, print_fmt, name, (u8)val, space); + break; + + case 2: + trace_seq_printf(s, print_fmt, name, (u16)val, space); + break; + + case 4: + trace_seq_printf(s, print_fmt, name, (u32)val, space); + break; + + default: + trace_seq_printf(s, print_fmt, name, val, space); + break; + } +} + +static enum print_line_t print_synth_event(struct trace_iterator *iter, + int flags, + struct trace_event *event) +{ + struct trace_array *tr = iter->tr; + struct trace_seq *s = &iter->seq; + struct synth_trace_event *entry; + struct synth_event *se; + unsigned int i, n_u64; + char print_fmt[32]; + const char *fmt; + + entry = (struct synth_trace_event *)iter->ent; + se = container_of(event, struct synth_event, call.event); + + trace_seq_printf(s, "%s: ", se->name); + + for (i = 0, n_u64 = 0; i < se->n_fields; i++) { + if (trace_seq_has_overflowed(s)) + goto end; + + fmt = synth_field_fmt(se->fields[i]->type); + + /* parameter types */ + if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) + trace_seq_printf(s, "%s ", fmt); + + snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt); + + /* parameter values */ + if (se->fields[i]->is_string) { + trace_seq_printf(s, print_fmt, se->fields[i]->name, + (char *)&entry->fields[n_u64], + i == se->n_fields - 1 ? "" : " "); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + struct trace_print_flags __flags[] = { + __def_gfpflag_names, {-1, NULL} }; + char *space = (i == se->n_fields - 1 ? "" : " "); + + print_synth_event_num_val(s, print_fmt, + se->fields[i]->name, + se->fields[i]->size, + entry->fields[n_u64], + space); + + if (strcmp(se->fields[i]->type, "gfp_t") == 0) { + trace_seq_puts(s, " ("); + trace_print_flags_seq(s, "|", + entry->fields[n_u64], + __flags); + trace_seq_putc(s, ')'); + } + n_u64++; + } + } +end: + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + +static struct trace_event_functions synth_event_funcs = { + .trace = print_synth_event +}; + +static notrace void trace_event_raw_event_synth(void *__data, + u64 *var_ref_vals, + unsigned int *var_ref_idx) +{ + struct trace_event_file *trace_file = __data; + struct synth_trace_event *entry; + struct trace_event_buffer fbuffer; + struct trace_buffer *buffer; + struct synth_event *event; + unsigned int i, n_u64, val_idx; + int fields_size = 0; + + event = trace_file->event_call->data; + + if (trace_trigger_soft_disabled(trace_file)) + return; + + fields_size = event->n_u64 * sizeof(u64); + + /* + * Avoid ring buffer recursion detection, as this event + * is being performed within another event. + */ + buffer = trace_file->tr->array_buffer.buffer; + ring_buffer_nest_start(buffer); + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + fields_size); + if (!entry) + goto out; + + for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + val_idx = var_ref_idx[i]; + if (event->fields[i]->is_string) { + char *str_val = (char *)(long)var_ref_vals[val_idx]; + char *str_field = (char *)&entry->fields[n_u64]; + + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + struct synth_field *field = event->fields[i]; + u64 val = var_ref_vals[val_idx]; + + switch (field->size) { + case 1: + *(u8 *)&entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&entry->fields[n_u64] = (u32)val; + break; + + default: + entry->fields[n_u64] = val; + break; + } + n_u64++; + } + } + + trace_event_buffer_commit(&fbuffer); +out: + ring_buffer_nest_end(buffer); +} + +static void free_synth_event_print_fmt(struct trace_event_call *call) +{ + if (call) { + kfree(call->print_fmt); + call->print_fmt = NULL; + } +} + +static int __set_synth_event_print_fmt(struct synth_event *event, + char *buf, int len) +{ + const char *fmt; + int pos = 0; + int i; + + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + for (i = 0; i < event->n_fields; i++) { + fmt = synth_field_fmt(event->fields[i]->type); + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s", + event->fields[i]->name, fmt, + i == event->n_fields - 1 ? "" : ", "); + } + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + + for (i = 0; i < event->n_fields; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", REC->%s", event->fields[i]->name); + } + +#undef LEN_OR_ZERO + + /* return the length of print_fmt */ + return pos; +} + +static int set_synth_event_print_fmt(struct trace_event_call *call) +{ + struct synth_event *event = call->data; + char *print_fmt; + int len; + + /* First: called with 0 length to calculate the needed length */ + len = __set_synth_event_print_fmt(event, NULL, 0); + + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_synth_event_print_fmt(event, print_fmt, len + 1); + call->print_fmt = print_fmt; + + return 0; +} + +static void free_synth_field(struct synth_field *field) +{ + kfree(field->type); + kfree(field->name); + kfree(field); +} + +static struct synth_field *parse_synth_field(int argc, const char **argv, + int *consumed) +{ + struct synth_field *field; + const char *prefix = NULL, *field_type = argv[0], *field_name, *array; + int len, ret = 0; + + if (field_type[0] == ';') + field_type++; + + if (!strcmp(field_type, "unsigned")) { + if (argc < 3) + return ERR_PTR(-EINVAL); + prefix = "unsigned "; + field_type = argv[1]; + field_name = argv[2]; + *consumed = 3; + } else { + field_name = argv[1]; + *consumed = 2; + } + + field = kzalloc(sizeof(*field), GFP_KERNEL); + if (!field) + return ERR_PTR(-ENOMEM); + + len = strlen(field_name); + array = strchr(field_name, '['); + if (array) + len -= strlen(array); + else if (field_name[len - 1] == ';') + len--; + + field->name = kmemdup_nul(field_name, len, GFP_KERNEL); + if (!field->name) { + ret = -ENOMEM; + goto free; + } + + if (field_type[0] == ';') + field_type++; + len = strlen(field_type) + 1; + if (array) + len += strlen(array); + if (prefix) + len += strlen(prefix); + + field->type = kzalloc(len, GFP_KERNEL); + if (!field->type) { + ret = -ENOMEM; + goto free; + } + if (prefix) + strcat(field->type, prefix); + strcat(field->type, field_type); + if (array) { + strcat(field->type, array); + if (field->type[len - 1] == ';') + field->type[len - 1] = '\0'; + } + + field->size = synth_field_size(field->type); + if (!field->size) { + ret = -EINVAL; + goto free; + } + + if (synth_field_is_string(field->type)) + field->is_string = true; + + field->is_signed = synth_field_signed(field->type); + + out: + return field; + free: + free_synth_field(field); + field = ERR_PTR(ret); + goto out; +} + +static void free_synth_tracepoint(struct tracepoint *tp) +{ + if (!tp) + return; + + kfree(tp->name); + kfree(tp); +} + +static struct tracepoint *alloc_synth_tracepoint(char *name) +{ + struct tracepoint *tp; + + tp = kzalloc(sizeof(*tp), GFP_KERNEL); + if (!tp) + return ERR_PTR(-ENOMEM); + + tp->name = kstrdup(name, GFP_KERNEL); + if (!tp->name) { + kfree(tp); + return ERR_PTR(-ENOMEM); + } + + return tp; +} + +struct synth_event *find_synth_event(const char *name) +{ + struct dyn_event *pos; + struct synth_event *event; + + for_each_dyn_event(pos) { + if (!is_synth_event(pos)) + continue; + event = to_synth_event(pos); + if (strcmp(event->name, name) == 0) + return event; + } + + return NULL; +} + +static struct trace_event_fields synth_event_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = synth_event_define_fields }, + {} +}; + +static int register_synth_event(struct synth_event *event) +{ + struct trace_event_call *call = &event->call; + int ret = 0; + + event->call.class = &event->class; + event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL); + if (!event->class.system) { + ret = -ENOMEM; + goto out; + } + + event->tp = alloc_synth_tracepoint(event->name); + if (IS_ERR(event->tp)) { + ret = PTR_ERR(event->tp); + event->tp = NULL; + goto out; + } + + INIT_LIST_HEAD(&call->class->fields); + call->event.funcs = &synth_event_funcs; + call->class->fields_array = synth_event_fields_array; + + ret = register_trace_event(&call->event); + if (!ret) { + ret = -ENODEV; + goto out; + } + call->flags = TRACE_EVENT_FL_TRACEPOINT; + call->class->reg = trace_event_reg; + call->class->probe = trace_event_raw_event_synth; + call->data = event; + call->tp = event->tp; + + ret = trace_add_event_call(call); + if (ret) { + pr_warn("Failed to register synthetic event: %s\n", + trace_event_name(call)); + goto err; + } + + ret = set_synth_event_print_fmt(call); + if (ret < 0) { + trace_remove_event_call(call); + goto err; + } + out: + return ret; + err: + unregister_trace_event(&call->event); + goto out; +} + +static int unregister_synth_event(struct synth_event *event) +{ + struct trace_event_call *call = &event->call; + int ret; + + ret = trace_remove_event_call(call); + + return ret; +} + +static void free_synth_event(struct synth_event *event) +{ + unsigned int i; + + if (!event) + return; + + for (i = 0; i < event->n_fields; i++) + free_synth_field(event->fields[i]); + + kfree(event->fields); + kfree(event->name); + kfree(event->class.system); + free_synth_tracepoint(event->tp); + free_synth_event_print_fmt(&event->call); + kfree(event); +} + +static struct synth_event *alloc_synth_event(const char *name, int n_fields, + struct synth_field **fields) +{ + struct synth_event *event; + unsigned int i; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) { + event = ERR_PTR(-ENOMEM); + goto out; + } + + event->name = kstrdup(name, GFP_KERNEL); + if (!event->name) { + kfree(event); + event = ERR_PTR(-ENOMEM); + goto out; + } + + event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL); + if (!event->fields) { + free_synth_event(event); + event = ERR_PTR(-ENOMEM); + goto out; + } + + dyn_event_init(&event->devent, &synth_event_ops); + + for (i = 0; i < n_fields; i++) + event->fields[i] = fields[i]; + + event->n_fields = n_fields; + out: + return event; +} + +static int synth_event_check_arg_fn(void *data) +{ + struct dynevent_arg_pair *arg_pair = data; + int size; + + size = synth_field_size((char *)arg_pair->lhs); + + return size ? 0 : -EINVAL; +} + +/** + * synth_event_add_field - Add a new field to a synthetic event cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @type: The type of the new field to add + * @name: The name of the new field to add + * + * Add a new field to a synthetic event cmd object. Field ordering is in + * the same order the fields are added. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_add_field(struct dynevent_cmd *cmd, const char *type, + const char *name) +{ + struct dynevent_arg_pair arg_pair; + int ret; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + if (!type || !name) + return -EINVAL; + + dynevent_arg_pair_init(&arg_pair, 0, ';'); + + arg_pair.lhs = type; + arg_pair.rhs = name; + + ret = dynevent_arg_pair_add(cmd, &arg_pair, synth_event_check_arg_fn); + if (ret) + return ret; + + if (++cmd->n_fields > SYNTH_FIELDS_MAX) + ret = -EINVAL; + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_add_field); + +/** + * synth_event_add_field_str - Add a new field to a synthetic event cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @type_name: The type and name of the new field to add, as a single string + * + * Add a new field to a synthetic event cmd object, as a single + * string. The @type_name string is expected to be of the form 'type + * name', which will be appended by ';'. No sanity checking is done - + * what's passed in is assumed to already be well-formed. Field + * ordering is in the same order the fields are added. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_add_field_str(struct dynevent_cmd *cmd, const char *type_name) +{ + struct dynevent_arg arg; + int ret; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + if (!type_name) + return -EINVAL; + + dynevent_arg_init(&arg, ';'); + + arg.str = type_name; + + ret = dynevent_arg_add(cmd, &arg, NULL); + if (ret) + return ret; + + if (++cmd->n_fields > SYNTH_FIELDS_MAX) + ret = -EINVAL; + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_add_field_str); + +/** + * synth_event_add_fields - Add multiple fields to a synthetic event cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @fields: An array of type/name field descriptions + * @n_fields: The number of field descriptions contained in the fields array + * + * Add a new set of fields to a synthetic event cmd object. The event + * fields that will be defined for the event should be passed in as an + * array of struct synth_field_desc, and the number of elements in the + * array passed in as n_fields. Field ordering will retain the + * ordering given in the fields array. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_add_fields(struct dynevent_cmd *cmd, + struct synth_field_desc *fields, + unsigned int n_fields) +{ + unsigned int i; + int ret = 0; + + for (i = 0; i < n_fields; i++) { + if (fields[i].type == NULL || fields[i].name == NULL) { + ret = -EINVAL; + break; + } + + ret = synth_event_add_field(cmd, fields[i].type, fields[i].name); + if (ret) + break; + } + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_add_fields); + +/** + * __synth_event_gen_cmd_start - Start a synthetic event command from arg list + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @name: The name of the synthetic event + * @mod: The module creating the event, NULL if not created from a module + * @args: Variable number of arg (pairs), one pair for each field + * + * NOTE: Users normally won't want to call this function directly, but + * rather use the synth_event_gen_cmd_start() wrapper, which + * automatically adds a NULL to the end of the arg list. If this + * function is used directly, make sure the last arg in the variable + * arg list is NULL. + * + * Generate a synthetic event command to be executed by + * synth_event_gen_cmd_end(). This function can be used to generate + * the complete command or only the first part of it; in the latter + * case, synth_event_add_field(), synth_event_add_field_str(), or + * synth_event_add_fields() can be used to add more fields following + * this. + * + * There should be an even number variable args, each pair consisting + * of a type followed by a field name. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd, const char *name, + struct module *mod, ...) +{ + struct dynevent_arg arg; + va_list args; + int ret; + + cmd->event_name = name; + cmd->private_data = mod; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + dynevent_arg_init(&arg, 0); + arg.str = name; + ret = dynevent_arg_add(cmd, &arg, NULL); + if (ret) + return ret; + + va_start(args, mod); + for (;;) { + const char *type, *name; + + type = va_arg(args, const char *); + if (!type) + break; + name = va_arg(args, const char *); + if (!name) + break; + + if (++cmd->n_fields > SYNTH_FIELDS_MAX) { + ret = -EINVAL; + break; + } + + ret = synth_event_add_field(cmd, type, name); + if (ret) + break; + } + va_end(args); + + return ret; +} +EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start); + +/** + * synth_event_gen_cmd_array_start - Start synthetic event command from an array + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @name: The name of the synthetic event + * @fields: An array of type/name field descriptions + * @n_fields: The number of field descriptions contained in the fields array + * + * Generate a synthetic event command to be executed by + * synth_event_gen_cmd_end(). This function can be used to generate + * the complete command or only the first part of it; in the latter + * case, synth_event_add_field(), synth_event_add_field_str(), or + * synth_event_add_fields() can be used to add more fields following + * this. + * + * The event fields that will be defined for the event should be + * passed in as an array of struct synth_field_desc, and the number of + * elements in the array passed in as n_fields. Field ordering will + * retain the ordering given in the fields array. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name, + struct module *mod, + struct synth_field_desc *fields, + unsigned int n_fields) +{ + struct dynevent_arg arg; + unsigned int i; + int ret = 0; + + cmd->event_name = name; + cmd->private_data = mod; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + if (n_fields > SYNTH_FIELDS_MAX) + return -EINVAL; + + dynevent_arg_init(&arg, 0); + arg.str = name; + ret = dynevent_arg_add(cmd, &arg, NULL); + if (ret) + return ret; + + for (i = 0; i < n_fields; i++) { + if (fields[i].type == NULL || fields[i].name == NULL) + return -EINVAL; + + ret = synth_event_add_field(cmd, fields[i].type, fields[i].name); + if (ret) + break; + } + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start); + +static int __create_synth_event(int argc, const char *name, const char **argv) +{ + struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; + struct synth_event *event = NULL; + int i, consumed = 0, n_fields = 0, ret = 0; + + /* + * Argument syntax: + * - Add synthetic event: <event_name> field[;field] ... + * - Remove synthetic event: !<event_name> field[;field] ... + * where 'field' = type field_name + */ + + if (name[0] == '\0' || argc < 1) + return -EINVAL; + + mutex_lock(&event_mutex); + + event = find_synth_event(name); + if (event) { + ret = -EEXIST; + goto out; + } + + for (i = 0; i < argc - 1; i++) { + if (strcmp(argv[i], ";") == 0) + continue; + if (n_fields == SYNTH_FIELDS_MAX) { + ret = -EINVAL; + goto err; + } + + field = parse_synth_field(argc - i, &argv[i], &consumed); + if (IS_ERR(field)) { + ret = PTR_ERR(field); + goto err; + } + fields[n_fields++] = field; + i += consumed - 1; + } + + if (i < argc && strcmp(argv[i], ";") != 0) { + ret = -EINVAL; + goto err; + } + + event = alloc_synth_event(name, n_fields, fields); + if (IS_ERR(event)) { + ret = PTR_ERR(event); + event = NULL; + goto err; + } + ret = register_synth_event(event); + if (!ret) + dyn_event_add(&event->devent); + else + free_synth_event(event); + out: + mutex_unlock(&event_mutex); + + return ret; + err: + for (i = 0; i < n_fields; i++) + free_synth_field(fields[i]); + + goto out; +} + +/** + * synth_event_create - Create a new synthetic event + * @name: The name of the new sythetic event + * @fields: An array of type/name field descriptions + * @n_fields: The number of field descriptions contained in the fields array + * @mod: The module creating the event, NULL if not created from a module + * + * Create a new synthetic event with the given name under the + * trace/events/synthetic/ directory. The event fields that will be + * defined for the event should be passed in as an array of struct + * synth_field_desc, and the number elements in the array passed in as + * n_fields. Field ordering will retain the ordering given in the + * fields array. + * + * If the new synthetic event is being created from a module, the mod + * param must be non-NULL. This will ensure that the trace buffer + * won't contain unreadable events. + * + * The new synth event should be deleted using synth_event_delete() + * function. The new synthetic event can be generated from modules or + * other kernel code using trace_synth_event() and related functions. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_create(const char *name, struct synth_field_desc *fields, + unsigned int n_fields, struct module *mod) +{ + struct dynevent_cmd cmd; + char *buf; + int ret; + + buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN); + + ret = synth_event_gen_cmd_array_start(&cmd, name, mod, + fields, n_fields); + if (ret) + goto out; + + ret = synth_event_gen_cmd_end(&cmd); + out: + kfree(buf); + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_create); + +static int destroy_synth_event(struct synth_event *se) +{ + int ret; + + if (se->ref) + ret = -EBUSY; + else { + ret = unregister_synth_event(se); + if (!ret) { + dyn_event_remove(&se->devent); + free_synth_event(se); + } + } + + return ret; +} + +/** + * synth_event_delete - Delete a synthetic event + * @event_name: The name of the new sythetic event + * + * Delete a synthetic event that was created with synth_event_create(). + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_delete(const char *event_name) +{ + struct synth_event *se = NULL; + struct module *mod = NULL; + int ret = -ENOENT; + + mutex_lock(&event_mutex); + se = find_synth_event(event_name); + if (se) { + mod = se->mod; + ret = destroy_synth_event(se); + } + mutex_unlock(&event_mutex); + + if (mod) { + mutex_lock(&trace_types_lock); + /* + * It is safest to reset the ring buffer if the module + * being unloaded registered any events that were + * used. The only worry is if a new module gets + * loaded, and takes on the same id as the events of + * this module. When printing out the buffer, traced + * events left over from this module may be passed to + * the new module events and unexpected results may + * occur. + */ + tracing_reset_all_online_cpus(); + mutex_unlock(&trace_types_lock); + } + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_delete); + +static int create_or_delete_synth_event(int argc, char **argv) +{ + const char *name = argv[0]; + int ret; + + /* trace_run_command() ensures argc != 0 */ + if (name[0] == '!') { + ret = synth_event_delete(name + 1); + return ret; + } + + ret = __create_synth_event(argc - 1, name, (const char **)argv + 1); + return ret == -ECANCELED ? -EINVAL : ret; +} + +static int synth_event_run_command(struct dynevent_cmd *cmd) +{ + struct synth_event *se; + int ret; + + ret = trace_run_command(cmd->seq.buffer, create_or_delete_synth_event); + if (ret) + return ret; + + se = find_synth_event(cmd->event_name); + if (WARN_ON(!se)) + return -ENOENT; + + se->mod = cmd->private_data; + + return ret; +} + +/** + * synth_event_cmd_init - Initialize a synthetic event command object + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @buf: A pointer to the buffer used to build the command + * @maxlen: The length of the buffer passed in @buf + * + * Initialize a synthetic event command object. Use this before + * calling any of the other dyenvent_cmd functions. + */ +void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen) +{ + dynevent_cmd_init(cmd, buf, maxlen, DYNEVENT_TYPE_SYNTH, + synth_event_run_command); +} +EXPORT_SYMBOL_GPL(synth_event_cmd_init); + +static inline int +__synth_event_trace_start(struct trace_event_file *file, + struct synth_event_trace_state *trace_state) +{ + int entry_size, fields_size = 0; + int ret = 0; + + memset(trace_state, '\0', sizeof(*trace_state)); + + /* + * Normal event tracing doesn't get called at all unless the + * ENABLED bit is set (which attaches the probe thus allowing + * this code to be called, etc). Because this is called + * directly by the user, we don't have that but we still need + * to honor not logging when disabled. For the the iterated + * trace case, we save the enabed state upon start and just + * ignore the following data calls. + */ + if (!(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) { + trace_state->disabled = true; + ret = -ENOENT; + goto out; + } + + trace_state->event = file->event_call->data; + + fields_size = trace_state->event->n_u64 * sizeof(u64); + + /* + * Avoid ring buffer recursion detection, as this event + * is being performed within another event. + */ + trace_state->buffer = file->tr->array_buffer.buffer; + ring_buffer_nest_start(trace_state->buffer); + + entry_size = sizeof(*trace_state->entry) + fields_size; + trace_state->entry = trace_event_buffer_reserve(&trace_state->fbuffer, + file, + entry_size); + if (!trace_state->entry) { + ring_buffer_nest_end(trace_state->buffer); + ret = -EINVAL; + } +out: + return ret; +} + +static inline void +__synth_event_trace_end(struct synth_event_trace_state *trace_state) +{ + trace_event_buffer_commit(&trace_state->fbuffer); + + ring_buffer_nest_end(trace_state->buffer); +} + +/** + * synth_event_trace - Trace a synthetic event + * @file: The trace_event_file representing the synthetic event + * @n_vals: The number of values in vals + * @args: Variable number of args containing the event values + * + * Trace a synthetic event using the values passed in the variable + * argument list. + * + * The argument list should be a list 'n_vals' u64 values. The number + * of vals must match the number of field in the synthetic event, and + * must be in the same order as the synthetic event fields. + * + * All vals should be cast to u64, and string vals are just pointers + * to strings, cast to u64. Strings will be copied into space + * reserved in the event for the string, using these pointers. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) +{ + struct synth_event_trace_state state; + unsigned int i, n_u64; + va_list args; + int ret; + + ret = __synth_event_trace_start(file, &state); + if (ret) { + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + return ret; + } + + if (n_vals != state.event->n_fields) { + ret = -EINVAL; + goto out; + } + + va_start(args, n_vals); + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { + u64 val; + + val = va_arg(args, u64); + + if (state.event->fields[i]->is_string) { + char *str_val = (char *)(long)val; + char *str_field = (char *)&state.entry->fields[n_u64]; + + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + struct synth_field *field = state.event->fields[i]; + + switch (field->size) { + case 1: + *(u8 *)&state.entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&state.entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&state.entry->fields[n_u64] = (u32)val; + break; + + default: + state.entry->fields[n_u64] = val; + break; + } + n_u64++; + } + } + va_end(args); +out: + __synth_event_trace_end(&state); + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_trace); + +/** + * synth_event_trace_array - Trace a synthetic event from an array + * @file: The trace_event_file representing the synthetic event + * @vals: Array of values + * @n_vals: The number of values in vals + * + * Trace a synthetic event using the values passed in as 'vals'. + * + * The 'vals' array is just an array of 'n_vals' u64. The number of + * vals must match the number of field in the synthetic event, and + * must be in the same order as the synthetic event fields. + * + * All vals should be cast to u64, and string vals are just pointers + * to strings, cast to u64. Strings will be copied into space + * reserved in the event for the string, using these pointers. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_trace_array(struct trace_event_file *file, u64 *vals, + unsigned int n_vals) +{ + struct synth_event_trace_state state; + unsigned int i, n_u64; + int ret; + + ret = __synth_event_trace_start(file, &state); + if (ret) { + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + return ret; + } + + if (n_vals != state.event->n_fields) { + ret = -EINVAL; + goto out; + } + + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { + if (state.event->fields[i]->is_string) { + char *str_val = (char *)(long)vals[i]; + char *str_field = (char *)&state.entry->fields[n_u64]; + + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + struct synth_field *field = state.event->fields[i]; + u64 val = vals[i]; + + switch (field->size) { + case 1: + *(u8 *)&state.entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&state.entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&state.entry->fields[n_u64] = (u32)val; + break; + + default: + state.entry->fields[n_u64] = val; + break; + } + n_u64++; + } + } +out: + __synth_event_trace_end(&state); + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_trace_array); + +/** + * synth_event_trace_start - Start piecewise synthetic event trace + * @file: The trace_event_file representing the synthetic event + * @trace_state: A pointer to object tracking the piecewise trace state + * + * Start the trace of a synthetic event field-by-field rather than all + * at once. + * + * This function 'opens' an event trace, which means space is reserved + * for the event in the trace buffer, after which the event's + * individual field values can be set through either + * synth_event_add_next_val() or synth_event_add_val(). + * + * A pointer to a trace_state object is passed in, which will keep + * track of the current event trace state until the event trace is + * closed (and the event finally traced) using + * synth_event_trace_end(). + * + * Note that synth_event_trace_end() must be called after all values + * have been added for each event trace, regardless of whether adding + * all field values succeeded or not. + * + * Note also that for a given event trace, all fields must be added + * using either synth_event_add_next_val() or synth_event_add_val() + * but not both together or interleaved. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_trace_start(struct trace_event_file *file, + struct synth_event_trace_state *trace_state) +{ + int ret; + + if (!trace_state) + return -EINVAL; + + ret = __synth_event_trace_start(file, trace_state); + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_trace_start); + +static int __synth_event_add_val(const char *field_name, u64 val, + struct synth_event_trace_state *trace_state) +{ + struct synth_field *field = NULL; + struct synth_trace_event *entry; + struct synth_event *event; + int i, ret = 0; + + if (!trace_state) { + ret = -EINVAL; + goto out; + } + + /* can't mix add_next_synth_val() with add_synth_val() */ + if (field_name) { + if (trace_state->add_next) { + ret = -EINVAL; + goto out; + } + trace_state->add_name = true; + } else { + if (trace_state->add_name) { + ret = -EINVAL; + goto out; + } + trace_state->add_next = true; + } + + if (trace_state->disabled) + goto out; + + event = trace_state->event; + if (trace_state->add_name) { + for (i = 0; i < event->n_fields; i++) { + field = event->fields[i]; + if (strcmp(field->name, field_name) == 0) + break; + } + if (!field) { + ret = -EINVAL; + goto out; + } + } else { + if (trace_state->cur_field >= event->n_fields) { + ret = -EINVAL; + goto out; + } + field = event->fields[trace_state->cur_field++]; + } + + entry = trace_state->entry; + if (field->is_string) { + char *str_val = (char *)(long)val; + char *str_field; + + if (!str_val) { + ret = -EINVAL; + goto out; + } + + str_field = (char *)&entry->fields[field->offset]; + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + } else { + switch (field->size) { + case 1: + *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val; + break; + + case 2: + *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val; + break; + + case 4: + *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val; + break; + + default: + trace_state->entry->fields[field->offset] = val; + break; + } + } + out: + return ret; +} + +/** + * synth_event_add_next_val - Add the next field's value to an open synth trace + * @val: The value to set the next field to + * @trace_state: A pointer to object tracking the piecewise trace state + * + * Set the value of the next field in an event that's been opened by + * synth_event_trace_start(). + * + * The val param should be the value cast to u64. If the value points + * to a string, the val param should be a char * cast to u64. + * + * This function assumes all the fields in an event are to be set one + * after another - successive calls to this function are made, one for + * each field, in the order of the fields in the event, until all + * fields have been set. If you'd rather set each field individually + * without regard to ordering, synth_event_add_val() can be used + * instead. + * + * Note however that synth_event_add_next_val() and + * synth_event_add_val() can't be intermixed for a given event trace - + * one or the other but not both can be used at the same time. + * + * Note also that synth_event_trace_end() must be called after all + * values have been added for each event trace, regardless of whether + * adding all field values succeeded or not. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_add_next_val(u64 val, + struct synth_event_trace_state *trace_state) +{ + return __synth_event_add_val(NULL, val, trace_state); +} +EXPORT_SYMBOL_GPL(synth_event_add_next_val); + +/** + * synth_event_add_val - Add a named field's value to an open synth trace + * @field_name: The name of the synthetic event field value to set + * @val: The value to set the next field to + * @trace_state: A pointer to object tracking the piecewise trace state + * + * Set the value of the named field in an event that's been opened by + * synth_event_trace_start(). + * + * The val param should be the value cast to u64. If the value points + * to a string, the val param should be a char * cast to u64. + * + * This function looks up the field name, and if found, sets the field + * to the specified value. This lookup makes this function more + * expensive than synth_event_add_next_val(), so use that or the + * none-piecewise synth_event_trace() instead if efficiency is more + * important. + * + * Note however that synth_event_add_next_val() and + * synth_event_add_val() can't be intermixed for a given event trace - + * one or the other but not both can be used at the same time. + * + * Note also that synth_event_trace_end() must be called after all + * values have been added for each event trace, regardless of whether + * adding all field values succeeded or not. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_add_val(const char *field_name, u64 val, + struct synth_event_trace_state *trace_state) +{ + return __synth_event_add_val(field_name, val, trace_state); +} +EXPORT_SYMBOL_GPL(synth_event_add_val); + +/** + * synth_event_trace_end - End piecewise synthetic event trace + * @trace_state: A pointer to object tracking the piecewise trace state + * + * End the trace of a synthetic event opened by + * synth_event_trace__start(). + * + * This function 'closes' an event trace, which basically means that + * it commits the reserved event and cleans up other loose ends. + * + * A pointer to a trace_state object is passed in, which will keep + * track of the current event trace state opened with + * synth_event_trace_start(). + * + * Note that this function must be called after all values have been + * added for each event trace, regardless of whether adding all field + * values succeeded or not. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_trace_end(struct synth_event_trace_state *trace_state) +{ + if (!trace_state) + return -EINVAL; + + __synth_event_trace_end(trace_state); + + return 0; +} +EXPORT_SYMBOL_GPL(synth_event_trace_end); + +static int create_synth_event(int argc, const char **argv) +{ + const char *name = argv[0]; + int len; + + if (name[0] != 's' || name[1] != ':') + return -ECANCELED; + name += 2; + + /* This interface accepts group name prefix */ + if (strchr(name, '/')) { + len = str_has_prefix(name, SYNTH_SYSTEM "/"); + if (len == 0) + return -EINVAL; + name += len; + } + return __create_synth_event(argc - 1, name, argv + 1); +} + +static int synth_event_release(struct dyn_event *ev) +{ + struct synth_event *event = to_synth_event(ev); + int ret; + + if (event->ref) + return -EBUSY; + + ret = unregister_synth_event(event); + if (ret) + return ret; + + dyn_event_remove(ev); + free_synth_event(event); + return 0; +} + +static int __synth_event_show(struct seq_file *m, struct synth_event *event) +{ + struct synth_field *field; + unsigned int i; + + seq_printf(m, "%s\t", event->name); + + for (i = 0; i < event->n_fields; i++) { + field = event->fields[i]; + + /* parameter values */ + seq_printf(m, "%s %s%s", field->type, field->name, + i == event->n_fields - 1 ? "" : "; "); + } + + seq_putc(m, '\n'); + + return 0; +} + +static int synth_event_show(struct seq_file *m, struct dyn_event *ev) +{ + struct synth_event *event = to_synth_event(ev); + + seq_printf(m, "s:%s/", event->class.system); + + return __synth_event_show(m, event); +} + +static int synth_events_seq_show(struct seq_file *m, void *v) +{ + struct dyn_event *ev = v; + + if (!is_synth_event(ev)) + return 0; + + return __synth_event_show(m, to_synth_event(ev)); +} + +static const struct seq_operations synth_events_seq_op = { + .start = dyn_event_seq_start, + .next = dyn_event_seq_next, + .stop = dyn_event_seq_stop, + .show = synth_events_seq_show, +}; + +static int synth_events_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ret = dyn_events_release_all(&synth_event_ops); + if (ret < 0) + return ret; + } + + return seq_open(file, &synth_events_seq_op); +} + +static ssize_t synth_events_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + return trace_parse_run_command(file, buffer, count, ppos, + create_or_delete_synth_event); +} + +static const struct file_operations synth_events_fops = { + .open = synth_events_open, + .write = synth_events_write, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static __init int trace_events_synth_init(void) +{ + struct dentry *entry = NULL; + struct dentry *d_tracer; + int err = 0; + + err = dyn_event_register(&synth_event_ops); + if (err) { + pr_warn("Could not register synth_event_ops\n"); + return err; + } + + d_tracer = tracing_init_dentry(); + if (IS_ERR(d_tracer)) { + err = PTR_ERR(d_tracer); + goto err; + } + + entry = tracefs_create_file("synthetic_events", 0644, d_tracer, + NULL, &synth_events_fops); + if (!entry) { + err = -ENODEV; + goto err; + } + + return err; + err: + pr_warn("Could not create tracefs 'synthetic_events' entry\n"); + + return err; +} + +fs_initcall(trace_events_synth_init); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 3a74736da363..f725802160c0 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -216,11 +216,17 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file) int trigger_process_regex(struct trace_event_file *file, char *buff) { - char *command, *next = buff; + char *command, *next; struct event_command *p; int ret = -EINVAL; + next = buff = skip_spaces(buff); command = strsep(&next, ": \t"); + if (next) { + next = skip_spaces(next); + if (!*next) + next = NULL; + } command = (command[0] != '!') ? command : command + 1; mutex_lock(&trigger_cmd_mutex); @@ -630,8 +636,14 @@ event_trigger_callback(struct event_command *cmd_ops, int ret; /* separate the trigger from the filter (t:n [if filter]) */ - if (param && isdigit(param[0])) + if (param && isdigit(param[0])) { trigger = strsep(¶m, " \t"); + if (param) { + param = skip_spaces(param); + if (!*param) + param = NULL; + } + } trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); @@ -1368,6 +1380,11 @@ int event_enable_trigger_func(struct event_command *cmd_ops, trigger = strsep(¶m, " \t"); if (!trigger) return -EINVAL; + if (param) { + param = skip_spaces(param); + if (!*param) + param = NULL; + } system = strsep(&trigger, ":"); if (!trigger) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 77ce5a3b6773..70d3d0a09053 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -45,6 +45,9 @@ static int ftrace_event_register(struct trace_event_call *call, #undef __field_desc #define __field_desc(type, container, item) type item; +#undef __field_packed +#define __field_packed(type, container, item) type item; + #undef __array #define __array(type, item, size) type item[size]; @@ -85,6 +88,13 @@ static void __always_unused ____ftrace_check_##name(void) \ .size = sizeof(_type), .align = __alignof__(_type), \ is_signed_type(_type), .filter_type = _filter_type }, + +#undef __field_ext_packed +#define __field_ext_packed(_type, _item, _filter_type) { \ + .type = #_type, .name = #_item, \ + .size = sizeof(_type), .align = 1, \ + is_signed_type(_type), .filter_type = _filter_type }, + #undef __field #define __field(_type, _item) __field_ext(_type, _item, FILTER_OTHER) @@ -94,6 +104,9 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef __field_desc #define __field_desc(_type, _container, _item) __field_ext(_type, _item, FILTER_OTHER) +#undef __field_packed +#define __field_packed(_type, _container, _item) __field_ext_packed(_type, _item, FILTER_OTHER) + #undef __array #define __array(_type, _item, _len) { \ .type = #_type"["__stringify(_len)"]", .name = #_item, \ @@ -129,6 +142,9 @@ static struct trace_event_fields ftrace_event_fields_##name[] = { \ #undef __field_desc #define __field_desc(type, container, item) +#undef __field_packed +#define __field_packed(type, container, item) + #undef __array #define __array(type, item, len) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 8a4c8d5c2c98..dd4dff71d89a 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -42,7 +42,7 @@ static int allocate_ftrace_ops(struct trace_array *tr) if (!ops) return -ENOMEM; - /* Currently only the non stack verision is supported */ + /* Currently only the non stack version is supported */ ops->func = function_trace_call; ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 35989383ae11..aefb6065b508 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1202,35 +1202,41 @@ static const struct file_operations kprobe_profile_ops = { /* Return the length of string -- including null terminal byte */ static nokprobe_inline int +fetch_store_strlen_user(unsigned long addr) +{ + const void __user *uaddr = (__force const void __user *)addr; + + return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); +} + +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int fetch_store_strlen(unsigned long addr) { int ret, len = 0; u8 c; +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if (addr < TASK_SIZE) + return fetch_store_strlen_user(addr); +#endif + do { - ret = probe_kernel_read(&c, (u8 *)addr + len, 1); + ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1); len++; } while (c && ret == 0 && len < MAX_STRING_SIZE); return (ret < 0) ? ret : len; } -/* Return the length of string -- including null terminal byte */ -static nokprobe_inline int -fetch_store_strlen_user(unsigned long addr) -{ - const void __user *uaddr = (__force const void __user *)addr; - - return strnlen_unsafe_user(uaddr, MAX_STRING_SIZE); -} - /* - * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max - * length and relative data location. + * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf + * with max length and relative data location. */ static nokprobe_inline int -fetch_store_string(unsigned long addr, void *dest, void *base) +fetch_store_string_user(unsigned long addr, void *dest, void *base) { + const void __user *uaddr = (__force const void __user *)addr; int maxlen = get_loc_len(*(u32 *)dest); void *__dest; long ret; @@ -1240,11 +1246,7 @@ fetch_store_string(unsigned long addr, void *dest, void *base) __dest = get_loc_data(dest, base); - /* - * Try to get string again, since the string can be changed while - * probing. - */ - ret = strncpy_from_unsafe(__dest, (void *)addr, maxlen); + ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); if (ret >= 0) *(u32 *)dest = make_data_loc(ret, __dest - base); @@ -1252,23 +1254,31 @@ fetch_store_string(unsigned long addr, void *dest, void *base) } /* - * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf - * with max length and relative data location. + * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max + * length and relative data location. */ static nokprobe_inline int -fetch_store_string_user(unsigned long addr, void *dest, void *base) +fetch_store_string(unsigned long addr, void *dest, void *base) { - const void __user *uaddr = (__force const void __user *)addr; int maxlen = get_loc_len(*(u32 *)dest); void *__dest; long ret; +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)addr < TASK_SIZE) + return fetch_store_string_user(addr, dest, base); +#endif + if (unlikely(!maxlen)) return -ENOMEM; __dest = get_loc_data(dest, base); - ret = strncpy_from_unsafe_user(__dest, uaddr, maxlen); + /* + * Try to get string again, since the string can be changed while + * probing. + */ + ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); if (ret >= 0) *(u32 *)dest = make_data_loc(ret, __dest - base); @@ -1276,17 +1286,21 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base) } static nokprobe_inline int -probe_mem_read(void *dest, void *src, size_t size) +probe_mem_read_user(void *dest, void *src, size_t size) { - return probe_kernel_read(dest, src, size); + const void __user *uaddr = (__force const void __user *)src; + + return copy_from_user_nofault(dest, uaddr, size); } static nokprobe_inline int -probe_mem_read_user(void *dest, void *src, size_t size) +probe_mem_read(void *dest, void *src, size_t size) { - const void __user *uaddr = (__force const void __user *)src; - - return probe_user_read(dest, uaddr, size); +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)src < TASK_SIZE) + return probe_mem_read_user(dest, src, size); +#endif + return copy_from_kernel_nofault(dest, src, size); } /* Note that we don't verify it, since the code does not come from user space */ @@ -1629,7 +1643,7 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, if (perf_type_tracepoint) tk = find_trace_kprobe(pevent, group); else - tk = event->tp_event->data; + tk = trace_kprobe_primary_from_call(event->tp_event); if (!tk) return -EINVAL; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 9a121e147102..73976de7f8cc 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -393,7 +393,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, if (mm) { const struct vm_area_struct *vma; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_vma(mm, ip); if (vma) { file = vma->vm_file; @@ -405,7 +405,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, trace_seq_printf(s, "[+0x%lx]", ip - vmstart); } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); } if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) trace_seq_printf(s, " <" IP_FMT ">", ip); diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 4d8e99fdbbbe..f10073e62603 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -19,6 +19,24 @@ /* Per-cpu variable to prevent redundant calls when IRQs already off */ static DEFINE_PER_CPU(int, tracing_irq_cpu); +/* + * Like trace_hardirqs_on() but without the lockdep invocation. This is + * used in the low level entry code where the ordering vs. RCU is important + * and lockdep uses a staged approach which splits the lockdep hardirq + * tracking into a RCU on and a RCU off section. + */ +void trace_hardirqs_on_prepare(void) +{ + if (this_cpu_read(tracing_irq_cpu)) { + if (!in_nmi()) + trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1); + tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); + this_cpu_write(tracing_irq_cpu, 0); + } +} +EXPORT_SYMBOL(trace_hardirqs_on_prepare); +NOKPROBE_SYMBOL(trace_hardirqs_on_prepare); + void trace_hardirqs_on(void) { if (this_cpu_read(tracing_irq_cpu)) { @@ -28,21 +46,41 @@ void trace_hardirqs_on(void) this_cpu_write(tracing_irq_cpu, 0); } + lockdep_hardirqs_on_prepare(CALLER_ADDR0); lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on); NOKPROBE_SYMBOL(trace_hardirqs_on); -void trace_hardirqs_off(void) +/* + * Like trace_hardirqs_off() but without the lockdep invocation. This is + * used in the low level entry code where the ordering vs. RCU is important + * and lockdep uses a staged approach which splits the lockdep hardirq + * tracking into a RCU on and a RCU off section. + */ +void trace_hardirqs_off_finish(void) { if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); if (!in_nmi()) - trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); + trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1); } +} +EXPORT_SYMBOL(trace_hardirqs_off_finish); +NOKPROBE_SYMBOL(trace_hardirqs_off_finish); + +void trace_hardirqs_off(void) +{ lockdep_hardirqs_off(CALLER_ADDR0); + + if (!this_cpu_read(tracing_irq_cpu)) { + this_cpu_write(tracing_irq_cpu, 1); + tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); + if (!in_nmi()) + trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); + } } EXPORT_SYMBOL(trace_hardirqs_off); NOKPROBE_SYMBOL(trace_hardirqs_off); @@ -56,6 +94,7 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr) this_cpu_write(tracing_irq_cpu, 0); } + lockdep_hardirqs_on_prepare(CALLER_ADDR0); lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on_caller); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index ab8b6436d53f..d2867ccc6aca 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -639,8 +639,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, ret = -EINVAL; goto fail; } - if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM) || - parg->count) { + if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM || + code->op == FETCH_OP_DATA) || parg->count) { /* * IMM, DATA and COMM is pointing actual address, those * must be kept, and if parg->count != 0, this is an @@ -1006,7 +1006,7 @@ int trace_probe_init(struct trace_probe *tp, const char *event, INIT_LIST_HEAD(&tp->event->class.fields); INIT_LIST_HEAD(&tp->event->probes); INIT_LIST_HEAD(&tp->list); - list_add(&tp->event->probes, &tp->list); + list_add(&tp->list, &tp->event->probes); call = trace_probe_event_call(tp); call->class = &tp->event->class; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index a0ff9e200ef6..a22b62813f8c 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -236,7 +236,7 @@ struct trace_probe_event { struct trace_event_call call; struct list_head files; struct list_head probes; - struct trace_uprobe_filter filter[0]; + struct trace_uprobe_filter filter[]; }; struct trace_probe { diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c557f42a9397..98bba4764c52 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -515,9 +515,8 @@ static const struct file_operations stack_trace_filter_fops = { #endif /* CONFIG_DYNAMIC_FTRACE */ int -stack_trace_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +stack_trace_sysctl(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int was_enabled; int ret; diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h new file mode 100644 index 000000000000..ac35c45207c4 --- /dev/null +++ b/kernel/trace/trace_synth.h @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __TRACE_SYNTH_H +#define __TRACE_SYNTH_H + +#include "trace_dynevent.h" + +#define SYNTH_SYSTEM "synthetic" +#define SYNTH_FIELDS_MAX 32 + +#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ + +struct synth_field { + char *type; + char *name; + size_t size; + unsigned int offset; + bool is_signed; + bool is_string; +}; + +struct synth_event { + struct dyn_event devent; + int ref; + char *name; + struct synth_field **fields; + unsigned int n_fields; + unsigned int n_u64; + struct trace_event_class class; + struct trace_event_call call; + struct tracepoint *tp; + struct module *mod; +}; + +extern struct synth_event *find_synth_event(const char *name); + +#endif /* __TRACE_SYNTH_H */ diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2a8e8e9c1c75..fdd47f99b18f 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1412,7 +1412,7 @@ int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, if (perf_type_tracepoint) tu = find_probe_event(pevent, group); else - tu = event->tp_event->data; + tu = trace_uprobe_primary_from_call(event->tp_event); if (!tu) return -EINVAL; diff --git a/kernel/umh.c b/kernel/umh.c index 3474d6aa55d8..a25433f9cd9a 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -26,8 +26,6 @@ #include <linux/ptrace.h> #include <linux/async.h> #include <linux/uaccess.h> -#include <linux/shmem_fs.h> -#include <linux/pipe_fs_i.h> #include <trace/events/module.h> @@ -38,8 +36,6 @@ static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; static DEFINE_SPINLOCK(umh_sysctl_lock); static DECLARE_RWSEM(umhelper_sem); -static LIST_HEAD(umh_list); -static DEFINE_MUTEX(umh_list_lock); static void call_usermodehelper_freeinfo(struct subprocess_info *info) { @@ -102,16 +98,9 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); - sub_info->pid = task_pid_nr(current); - if (sub_info->file) { - retval = do_execve_file(sub_info->file, - sub_info->argv, sub_info->envp); - if (!retval) - current->flags |= PF_UMH; - } else - retval = do_execve(getname_kernel(sub_info->path), - (const char __user *const __user *)sub_info->argv, - (const char __user *const __user *)sub_info->envp); + retval = kernel_execve(sub_info->path, + (const char *const *)sub_info->argv, + (const char *const *)sub_info->envp); out: sub_info->retval = retval; /* @@ -405,140 +394,6 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, } EXPORT_SYMBOL(call_usermodehelper_setup); -struct subprocess_info *call_usermodehelper_setup_file(struct file *file, - int (*init)(struct subprocess_info *info, struct cred *new), - void (*cleanup)(struct subprocess_info *info), void *data) -{ - struct subprocess_info *sub_info; - struct umh_info *info = data; - const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper"; - - sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL); - if (!sub_info) - return NULL; - - sub_info->argv = argv_split(GFP_KERNEL, cmdline, NULL); - if (!sub_info->argv) { - kfree(sub_info); - return NULL; - } - - INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); - sub_info->path = "none"; - sub_info->file = file; - sub_info->init = init; - sub_info->cleanup = cleanup; - sub_info->data = data; - return sub_info; -} - -static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) -{ - struct umh_info *umh_info = info->data; - struct file *from_umh[2]; - struct file *to_umh[2]; - int err; - - /* create pipe to send data to umh */ - err = create_pipe_files(to_umh, 0); - if (err) - return err; - err = replace_fd(0, to_umh[0], 0); - fput(to_umh[0]); - if (err < 0) { - fput(to_umh[1]); - return err; - } - - /* create pipe to receive data from umh */ - err = create_pipe_files(from_umh, 0); - if (err) { - fput(to_umh[1]); - replace_fd(0, NULL, 0); - return err; - } - err = replace_fd(1, from_umh[1], 0); - fput(from_umh[1]); - if (err < 0) { - fput(to_umh[1]); - replace_fd(0, NULL, 0); - fput(from_umh[0]); - return err; - } - - umh_info->pipe_to_umh = to_umh[1]; - umh_info->pipe_from_umh = from_umh[0]; - return 0; -} - -static void umh_clean_and_save_pid(struct subprocess_info *info) -{ - struct umh_info *umh_info = info->data; - - /* cleanup if umh_pipe_setup() was successful but exec failed */ - if (info->pid && info->retval) { - fput(umh_info->pipe_to_umh); - fput(umh_info->pipe_from_umh); - } - - argv_free(info->argv); - umh_info->pid = info->pid; -} - -/** - * fork_usermode_blob - fork a blob of bytes as a usermode process - * @data: a blob of bytes that can be do_execv-ed as a file - * @len: length of the blob - * @info: information about usermode process (shouldn't be NULL) - * - * If info->cmdline is set it will be used as command line for the - * user process, else "usermodehelper" is used. - * - * Returns either negative error or zero which indicates success - * in executing a blob of bytes as a usermode process. In such - * case 'struct umh_info *info' is populated with two pipes - * and a pid of the process. The caller is responsible for health - * check of the user process, killing it via pid, and closing the - * pipes when user process is no longer needed. - */ -int fork_usermode_blob(void *data, size_t len, struct umh_info *info) -{ - struct subprocess_info *sub_info; - struct file *file; - ssize_t written; - loff_t pos = 0; - int err; - - file = shmem_kernel_file_setup("", len, 0); - if (IS_ERR(file)) - return PTR_ERR(file); - - written = kernel_write(file, data, len, &pos); - if (written != len) { - err = written; - if (err >= 0) - err = -ENOMEM; - goto out; - } - - err = -ENOMEM; - sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup, - umh_clean_and_save_pid, info); - if (!sub_info) - goto out; - - err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); - if (!err) { - mutex_lock(&umh_list_lock); - list_add(&info->list, &umh_list); - mutex_unlock(&umh_list_lock); - } -out: - fput(file); - return err; -} -EXPORT_SYMBOL_GPL(fork_usermode_blob); - /** * call_usermodehelper_exec - start a usermode application * @sub_info: information about the subprocessa @@ -641,7 +496,7 @@ int call_usermodehelper(const char *path, char **argv, char **envp, int wait) EXPORT_SYMBOL(call_usermodehelper); static int proc_cap_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; @@ -700,26 +555,6 @@ static int proc_cap_handler(struct ctl_table *table, int write, return 0; } -void __exit_umh(struct task_struct *tsk) -{ - struct umh_info *info; - pid_t pid = tsk->pid; - - mutex_lock(&umh_list_lock); - list_for_each_entry(info, &umh_list, list) { - if (info->pid == pid) { - list_del(&info->list); - mutex_unlock(&umh_list_lock); - goto out; - } - } - mutex_unlock(&umh_list_lock); - return; -out: - if (info->cleanup) - info->cleanup(info); -} - struct ctl_table usermodehelper_table[] = { { .procname = "bset", diff --git a/kernel/user.c b/kernel/user.c index 5235d7f49982..b1635d94a1f2 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -82,7 +82,7 @@ EXPORT_SYMBOL_GPL(init_user_ns); #define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid)))) static struct kmem_cache *uid_cachep; -struct hlist_head uidhash_table[UIDHASH_SZ]; +static struct hlist_head uidhash_table[UIDHASH_SZ]; /* * The uidhash_lock is mostly taken from process context, but it is diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 8eadadc478f9..87804e0371fe 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1253,7 +1253,7 @@ static void userns_put(struct ns_common *ns) put_user_ns(to_user_ns(ns)); } -static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int userns_install(struct nsset *nsset, struct ns_common *ns) { struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; @@ -1274,14 +1274,14 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; - cred = prepare_creds(); + cred = nsset_cred(nsset); if (!cred) - return -ENOMEM; + return -EINVAL; put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); - return commit_creds(cred); + return 0; } struct ns_common *ns_get_owner(struct ns_common *ns) diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c new file mode 100644 index 000000000000..0b35212ffc3d --- /dev/null +++ b/kernel/usermode_driver.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * umd - User mode driver support + */ +#include <linux/shmem_fs.h> +#include <linux/pipe_fs_i.h> +#include <linux/mount.h> +#include <linux/fs_struct.h> +#include <linux/task_work.h> +#include <linux/usermode_driver.h> + +static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name) +{ + struct file_system_type *type; + struct vfsmount *mnt; + struct file *file; + ssize_t written; + loff_t pos = 0; + + type = get_fs_type("tmpfs"); + if (!type) + return ERR_PTR(-ENODEV); + + mnt = kern_mount(type); + put_filesystem(type); + if (IS_ERR(mnt)) + return mnt; + + file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY, 0700); + if (IS_ERR(file)) { + mntput(mnt); + return ERR_CAST(file); + } + + written = kernel_write(file, data, len, &pos); + if (written != len) { + int err = written; + if (err >= 0) + err = -ENOMEM; + filp_close(file, NULL); + mntput(mnt); + return ERR_PTR(err); + } + + fput(file); + + /* Flush delayed fput so exec can open the file read-only */ + flush_delayed_fput(); + task_work_run(); + return mnt; +} + +/** + * umd_load_blob - Remember a blob of bytes for fork_usermode_driver + * @info: information about usermode driver + * @data: a blob of bytes that can be executed as a file + * @len: The lentgh of the blob + * + */ +int umd_load_blob(struct umd_info *info, const void *data, size_t len) +{ + struct vfsmount *mnt; + + if (WARN_ON_ONCE(info->wd.dentry || info->wd.mnt)) + return -EBUSY; + + mnt = blob_to_mnt(data, len, info->driver_name); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + info->wd.mnt = mnt; + info->wd.dentry = mnt->mnt_root; + return 0; +} +EXPORT_SYMBOL_GPL(umd_load_blob); + +/** + * umd_unload_blob - Disassociate @info from a previously loaded blob + * @info: information about usermode driver + * + */ +int umd_unload_blob(struct umd_info *info) +{ + if (WARN_ON_ONCE(!info->wd.mnt || + !info->wd.dentry || + info->wd.mnt->mnt_root != info->wd.dentry)) + return -EINVAL; + + kern_unmount(info->wd.mnt); + info->wd.mnt = NULL; + info->wd.dentry = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(umd_unload_blob); + +static int umd_setup(struct subprocess_info *info, struct cred *new) +{ + struct umd_info *umd_info = info->data; + struct file *from_umh[2]; + struct file *to_umh[2]; + int err; + + /* create pipe to send data to umh */ + err = create_pipe_files(to_umh, 0); + if (err) + return err; + err = replace_fd(0, to_umh[0], 0); + fput(to_umh[0]); + if (err < 0) { + fput(to_umh[1]); + return err; + } + + /* create pipe to receive data from umh */ + err = create_pipe_files(from_umh, 0); + if (err) { + fput(to_umh[1]); + replace_fd(0, NULL, 0); + return err; + } + err = replace_fd(1, from_umh[1], 0); + fput(from_umh[1]); + if (err < 0) { + fput(to_umh[1]); + replace_fd(0, NULL, 0); + fput(from_umh[0]); + return err; + } + + set_fs_pwd(current->fs, &umd_info->wd); + umd_info->pipe_to_umh = to_umh[1]; + umd_info->pipe_from_umh = from_umh[0]; + umd_info->tgid = get_pid(task_tgid(current)); + return 0; +} + +static void umd_cleanup(struct subprocess_info *info) +{ + struct umd_info *umd_info = info->data; + + /* cleanup if umh_setup() was successful but exec failed */ + if (info->retval) { + fput(umd_info->pipe_to_umh); + fput(umd_info->pipe_from_umh); + put_pid(umd_info->tgid); + umd_info->tgid = NULL; + } +} + +/** + * fork_usermode_driver - fork a usermode driver + * @info: information about usermode driver (shouldn't be NULL) + * + * Returns either negative error or zero which indicates success in + * executing a usermode driver. In such case 'struct umd_info *info' + * is populated with two pipes and a tgid of the process. The caller is + * responsible for health check of the user process, killing it via + * tgid, and closing the pipes when user process is no longer needed. + */ +int fork_usermode_driver(struct umd_info *info) +{ + struct subprocess_info *sub_info; + const char *argv[] = { info->driver_name, NULL }; + int err; + + if (WARN_ON_ONCE(info->tgid)) + return -EBUSY; + + err = -ENOMEM; + sub_info = call_usermodehelper_setup(info->driver_name, + (char **)argv, NULL, GFP_KERNEL, + umd_setup, umd_cleanup, info); + if (!sub_info) + goto out; + + err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); +out: + return err; +} +EXPORT_SYMBOL_GPL(fork_usermode_driver); + + diff --git a/kernel/utsname.c b/kernel/utsname.c index f0e491193009..e488d0e2ab45 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -140,12 +140,13 @@ static void utsns_put(struct ns_common *ns) put_uts_ns(to_uts_ns(ns)); } -static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new) +static int utsns_install(struct nsset *nsset, struct ns_common *new) { + struct nsproxy *nsproxy = nsset->nsproxy; struct uts_namespace *ns = to_uts_ns(new); if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; get_uts_ns(ns); diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 3732c888a949..4ca61d49885b 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -30,7 +30,7 @@ static void *get_uts(struct ctl_table *table) * to observe. Should this be in kernel/sys.c ???? */ static int proc_do_uts_string(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; int r; diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c new file mode 100644 index 000000000000..f74020f6bd9d --- /dev/null +++ b/kernel/watch_queue.c @@ -0,0 +1,655 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Watch queue and general notification mechanism, built on pipes + * + * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * See Documentation/watch_queue.rst + */ + +#define pr_fmt(fmt) "watchq: " fmt +#include <linux/module.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/printk.h> +#include <linux/miscdevice.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/poll.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <linux/file.h> +#include <linux/security.h> +#include <linux/cred.h> +#include <linux/sched/signal.h> +#include <linux/watch_queue.h> +#include <linux/pipe_fs_i.h> + +MODULE_DESCRIPTION("Watch queue"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +#define WATCH_QUEUE_NOTE_SIZE 128 +#define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE) + +static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct watch_queue *wqueue = (struct watch_queue *)buf->private; + struct page *page; + unsigned int bit; + + /* We need to work out which note within the page this refers to, but + * the note might have been maximum size, so merely ANDing the offset + * off doesn't work. OTOH, the note must've been more than zero size. + */ + bit = buf->offset + buf->len; + if ((bit & (WATCH_QUEUE_NOTE_SIZE - 1)) == 0) + bit -= WATCH_QUEUE_NOTE_SIZE; + bit /= WATCH_QUEUE_NOTE_SIZE; + + page = buf->page; + bit += page->index; + + set_bit(bit, wqueue->notes_bitmap); +} + +// No try_steal function => no stealing +#define watch_queue_pipe_buf_try_steal NULL + +/* New data written to a pipe may be appended to a buffer with this type. */ +static const struct pipe_buf_operations watch_queue_pipe_buf_ops = { + .release = watch_queue_pipe_buf_release, + .try_steal = watch_queue_pipe_buf_try_steal, + .get = generic_pipe_buf_get, +}; + +/* + * Post a notification to a watch queue. + */ +static bool post_one_notification(struct watch_queue *wqueue, + struct watch_notification *n) +{ + void *p; + struct pipe_inode_info *pipe = wqueue->pipe; + struct pipe_buffer *buf; + struct page *page; + unsigned int head, tail, mask, note, offset, len; + bool done = false; + + if (!pipe) + return false; + + spin_lock_irq(&pipe->rd_wait.lock); + + if (wqueue->defunct) + goto out; + + mask = pipe->ring_size - 1; + head = pipe->head; + tail = pipe->tail; + if (pipe_full(head, tail, pipe->ring_size)) + goto lost; + + note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes); + if (note >= wqueue->nr_notes) + goto lost; + + page = wqueue->notes[note / WATCH_QUEUE_NOTES_PER_PAGE]; + offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE; + get_page(page); + len = n->info & WATCH_INFO_LENGTH; + p = kmap_atomic(page); + memcpy(p + offset, n, len); + kunmap_atomic(p); + + buf = &pipe->bufs[head & mask]; + buf->page = page; + buf->private = (unsigned long)wqueue; + buf->ops = &watch_queue_pipe_buf_ops; + buf->offset = offset; + buf->len = len; + buf->flags = PIPE_BUF_FLAG_WHOLE; + pipe->head = head + 1; + + if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { + spin_unlock_irq(&pipe->rd_wait.lock); + BUG(); + } + wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); + done = true; + +out: + spin_unlock_irq(&pipe->rd_wait.lock); + if (done) + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + return done; + +lost: + buf = &pipe->bufs[(head - 1) & mask]; + buf->flags |= PIPE_BUF_FLAG_LOSS; + goto out; +} + +/* + * Apply filter rules to a notification. + */ +static bool filter_watch_notification(const struct watch_filter *wf, + const struct watch_notification *n) +{ + const struct watch_type_filter *wt; + unsigned int st_bits = sizeof(wt->subtype_filter[0]) * 8; + unsigned int st_index = n->subtype / st_bits; + unsigned int st_bit = 1U << (n->subtype % st_bits); + int i; + + if (!test_bit(n->type, wf->type_filter)) + return false; + + for (i = 0; i < wf->nr_filters; i++) { + wt = &wf->filters[i]; + if (n->type == wt->type && + (wt->subtype_filter[st_index] & st_bit) && + (n->info & wt->info_mask) == wt->info_filter) + return true; + } + + return false; /* If there is a filter, the default is to reject. */ +} + +/** + * __post_watch_notification - Post an event notification + * @wlist: The watch list to post the event to. + * @n: The notification record to post. + * @cred: The creds of the process that triggered the notification. + * @id: The ID to match on the watch. + * + * Post a notification of an event into a set of watch queues and let the users + * know. + * + * The size of the notification should be set in n->info & WATCH_INFO_LENGTH and + * should be in units of sizeof(*n). + */ +void __post_watch_notification(struct watch_list *wlist, + struct watch_notification *n, + const struct cred *cred, + u64 id) +{ + const struct watch_filter *wf; + struct watch_queue *wqueue; + struct watch *watch; + + if (((n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT) == 0) { + WARN_ON(1); + return; + } + + rcu_read_lock(); + + hlist_for_each_entry_rcu(watch, &wlist->watchers, list_node) { + if (watch->id != id) + continue; + n->info &= ~WATCH_INFO_ID; + n->info |= watch->info_id; + + wqueue = rcu_dereference(watch->queue); + wf = rcu_dereference(wqueue->filter); + if (wf && !filter_watch_notification(wf, n)) + continue; + + if (security_post_notification(watch->cred, cred, n) < 0) + continue; + + post_one_notification(wqueue, n); + } + + rcu_read_unlock(); +} +EXPORT_SYMBOL(__post_watch_notification); + +/* + * Allocate sufficient pages to preallocation for the requested number of + * notifications. + */ +long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) +{ + struct watch_queue *wqueue = pipe->watch_queue; + struct page **pages; + unsigned long *bitmap; + unsigned long user_bufs; + unsigned int bmsize; + int ret, i, nr_pages; + + if (!wqueue) + return -ENODEV; + if (wqueue->notes) + return -EBUSY; + + if (nr_notes < 1 || + nr_notes > 512) /* TODO: choose a better hard limit */ + return -EINVAL; + + nr_pages = (nr_notes + WATCH_QUEUE_NOTES_PER_PAGE - 1); + nr_pages /= WATCH_QUEUE_NOTES_PER_PAGE; + user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_pages); + + if (nr_pages > pipe->max_usage && + (too_many_pipe_buffers_hard(user_bufs) || + too_many_pipe_buffers_soft(user_bufs)) && + pipe_is_unprivileged_user()) { + ret = -EPERM; + goto error; + } + + ret = pipe_resize_ring(pipe, nr_notes); + if (ret < 0) + goto error; + + pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL); + if (!pages) + goto error; + + for (i = 0; i < nr_pages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + goto error_p; + pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE; + } + + bmsize = (nr_notes + BITS_PER_LONG - 1) / BITS_PER_LONG; + bmsize *= sizeof(unsigned long); + bitmap = kmalloc(bmsize, GFP_KERNEL); + if (!bitmap) + goto error_p; + + memset(bitmap, 0xff, bmsize); + wqueue->notes = pages; + wqueue->notes_bitmap = bitmap; + wqueue->nr_pages = nr_pages; + wqueue->nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE; + return 0; + +error_p: + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kfree(pages); +error: + (void) account_pipe_buffers(pipe->user, nr_pages, pipe->nr_accounted); + return ret; +} + +/* + * Set the filter on a watch queue. + */ +long watch_queue_set_filter(struct pipe_inode_info *pipe, + struct watch_notification_filter __user *_filter) +{ + struct watch_notification_type_filter *tf; + struct watch_notification_filter filter; + struct watch_type_filter *q; + struct watch_filter *wfilter; + struct watch_queue *wqueue = pipe->watch_queue; + int ret, nr_filter = 0, i; + + if (!wqueue) + return -ENODEV; + + if (!_filter) { + /* Remove the old filter */ + wfilter = NULL; + goto set; + } + + /* Grab the user's filter specification */ + if (copy_from_user(&filter, _filter, sizeof(filter)) != 0) + return -EFAULT; + if (filter.nr_filters == 0 || + filter.nr_filters > 16 || + filter.__reserved != 0) + return -EINVAL; + + tf = memdup_user(_filter->filters, filter.nr_filters * sizeof(*tf)); + if (IS_ERR(tf)) + return PTR_ERR(tf); + + ret = -EINVAL; + for (i = 0; i < filter.nr_filters; i++) { + if ((tf[i].info_filter & ~tf[i].info_mask) || + tf[i].info_mask & WATCH_INFO_LENGTH) + goto err_filter; + /* Ignore any unknown types */ + if (tf[i].type >= sizeof(wfilter->type_filter) * 8) + continue; + nr_filter++; + } + + /* Now we need to build the internal filter from only the relevant + * user-specified filters. + */ + ret = -ENOMEM; + wfilter = kzalloc(struct_size(wfilter, filters, nr_filter), GFP_KERNEL); + if (!wfilter) + goto err_filter; + wfilter->nr_filters = nr_filter; + + q = wfilter->filters; + for (i = 0; i < filter.nr_filters; i++) { + if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG) + continue; + + q->type = tf[i].type; + q->info_filter = tf[i].info_filter; + q->info_mask = tf[i].info_mask; + q->subtype_filter[0] = tf[i].subtype_filter[0]; + __set_bit(q->type, wfilter->type_filter); + q++; + } + + kfree(tf); +set: + pipe_lock(pipe); + wfilter = rcu_replace_pointer(wqueue->filter, wfilter, + lockdep_is_held(&pipe->mutex)); + pipe_unlock(pipe); + if (wfilter) + kfree_rcu(wfilter, rcu); + return 0; + +err_filter: + kfree(tf); + return ret; +} + +static void __put_watch_queue(struct kref *kref) +{ + struct watch_queue *wqueue = + container_of(kref, struct watch_queue, usage); + struct watch_filter *wfilter; + int i; + + for (i = 0; i < wqueue->nr_pages; i++) + __free_page(wqueue->notes[i]); + + wfilter = rcu_access_pointer(wqueue->filter); + if (wfilter) + kfree_rcu(wfilter, rcu); + kfree_rcu(wqueue, rcu); +} + +/** + * put_watch_queue - Dispose of a ref on a watchqueue. + * @wqueue: The watch queue to unref. + */ +void put_watch_queue(struct watch_queue *wqueue) +{ + kref_put(&wqueue->usage, __put_watch_queue); +} +EXPORT_SYMBOL(put_watch_queue); + +static void free_watch(struct rcu_head *rcu) +{ + struct watch *watch = container_of(rcu, struct watch, rcu); + + put_watch_queue(rcu_access_pointer(watch->queue)); + put_cred(watch->cred); +} + +static void __put_watch(struct kref *kref) +{ + struct watch *watch = container_of(kref, struct watch, usage); + + call_rcu(&watch->rcu, free_watch); +} + +/* + * Discard a watch. + */ +static void put_watch(struct watch *watch) +{ + kref_put(&watch->usage, __put_watch); +} + +/** + * init_watch_queue - Initialise a watch + * @watch: The watch to initialise. + * @wqueue: The queue to assign. + * + * Initialise a watch and set the watch queue. + */ +void init_watch(struct watch *watch, struct watch_queue *wqueue) +{ + kref_init(&watch->usage); + INIT_HLIST_NODE(&watch->list_node); + INIT_HLIST_NODE(&watch->queue_node); + rcu_assign_pointer(watch->queue, wqueue); +} + +/** + * add_watch_to_object - Add a watch on an object to a watch list + * @watch: The watch to add + * @wlist: The watch list to add to + * + * @watch->queue must have been set to point to the queue to post notifications + * to and the watch list of the object to be watched. @watch->cred must also + * have been set to the appropriate credentials and a ref taken on them. + * + * The caller must pin the queue and the list both and must hold the list + * locked against racing watch additions/removals. + */ +int add_watch_to_object(struct watch *watch, struct watch_list *wlist) +{ + struct watch_queue *wqueue = rcu_access_pointer(watch->queue); + struct watch *w; + + hlist_for_each_entry(w, &wlist->watchers, list_node) { + struct watch_queue *wq = rcu_access_pointer(w->queue); + if (wqueue == wq && watch->id == w->id) + return -EBUSY; + } + + watch->cred = get_current_cred(); + rcu_assign_pointer(watch->watch_list, wlist); + + spin_lock_bh(&wqueue->lock); + kref_get(&wqueue->usage); + kref_get(&watch->usage); + hlist_add_head(&watch->queue_node, &wqueue->watches); + spin_unlock_bh(&wqueue->lock); + + hlist_add_head(&watch->list_node, &wlist->watchers); + return 0; +} +EXPORT_SYMBOL(add_watch_to_object); + +/** + * remove_watch_from_object - Remove a watch or all watches from an object. + * @wlist: The watch list to remove from + * @wq: The watch queue of interest (ignored if @all is true) + * @id: The ID of the watch to remove (ignored if @all is true) + * @all: True to remove all objects + * + * Remove a specific watch or all watches from an object. A notification is + * sent to the watcher to tell them that this happened. + */ +int remove_watch_from_object(struct watch_list *wlist, struct watch_queue *wq, + u64 id, bool all) +{ + struct watch_notification_removal n; + struct watch_queue *wqueue; + struct watch *watch; + int ret = -EBADSLT; + + rcu_read_lock(); + +again: + spin_lock(&wlist->lock); + hlist_for_each_entry(watch, &wlist->watchers, list_node) { + if (all || + (watch->id == id && rcu_access_pointer(watch->queue) == wq)) + goto found; + } + spin_unlock(&wlist->lock); + goto out; + +found: + ret = 0; + hlist_del_init_rcu(&watch->list_node); + rcu_assign_pointer(watch->watch_list, NULL); + spin_unlock(&wlist->lock); + + /* We now own the reference on watch that used to belong to wlist. */ + + n.watch.type = WATCH_TYPE_META; + n.watch.subtype = WATCH_META_REMOVAL_NOTIFICATION; + n.watch.info = watch->info_id | watch_sizeof(n.watch); + n.id = id; + if (id != 0) + n.watch.info = watch->info_id | watch_sizeof(n); + + wqueue = rcu_dereference(watch->queue); + + /* We don't need the watch list lock for the next bit as RCU is + * protecting *wqueue from deallocation. + */ + if (wqueue) { + post_one_notification(wqueue, &n.watch); + + spin_lock_bh(&wqueue->lock); + + if (!hlist_unhashed(&watch->queue_node)) { + hlist_del_init_rcu(&watch->queue_node); + put_watch(watch); + } + + spin_unlock_bh(&wqueue->lock); + } + + if (wlist->release_watch) { + void (*release_watch)(struct watch *); + + release_watch = wlist->release_watch; + rcu_read_unlock(); + (*release_watch)(watch); + rcu_read_lock(); + } + put_watch(watch); + + if (all && !hlist_empty(&wlist->watchers)) + goto again; +out: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(remove_watch_from_object); + +/* + * Remove all the watches that are contributory to a queue. This has the + * potential to race with removal of the watches by the destruction of the + * objects being watched or with the distribution of notifications. + */ +void watch_queue_clear(struct watch_queue *wqueue) +{ + struct watch_list *wlist; + struct watch *watch; + bool release; + + rcu_read_lock(); + spin_lock_bh(&wqueue->lock); + + /* Prevent new additions and prevent notifications from happening */ + wqueue->defunct = true; + + while (!hlist_empty(&wqueue->watches)) { + watch = hlist_entry(wqueue->watches.first, struct watch, queue_node); + hlist_del_init_rcu(&watch->queue_node); + /* We now own a ref on the watch. */ + spin_unlock_bh(&wqueue->lock); + + /* We can't do the next bit under the queue lock as we need to + * get the list lock - which would cause a deadlock if someone + * was removing from the opposite direction at the same time or + * posting a notification. + */ + wlist = rcu_dereference(watch->watch_list); + if (wlist) { + void (*release_watch)(struct watch *); + + spin_lock(&wlist->lock); + + release = !hlist_unhashed(&watch->list_node); + if (release) { + hlist_del_init_rcu(&watch->list_node); + rcu_assign_pointer(watch->watch_list, NULL); + + /* We now own a second ref on the watch. */ + } + + release_watch = wlist->release_watch; + spin_unlock(&wlist->lock); + + if (release) { + if (release_watch) { + rcu_read_unlock(); + /* This might need to call dput(), so + * we have to drop all the locks. + */ + (*release_watch)(watch); + rcu_read_lock(); + } + put_watch(watch); + } + } + + put_watch(watch); + spin_lock_bh(&wqueue->lock); + } + + spin_unlock_bh(&wqueue->lock); + rcu_read_unlock(); +} + +/** + * get_watch_queue - Get a watch queue from its file descriptor. + * @fd: The fd to query. + */ +struct watch_queue *get_watch_queue(int fd) +{ + struct pipe_inode_info *pipe; + struct watch_queue *wqueue = ERR_PTR(-EINVAL); + struct fd f; + + f = fdget(fd); + if (f.file) { + pipe = get_pipe_info(f.file, false); + if (pipe && pipe->watch_queue) { + wqueue = pipe->watch_queue; + kref_get(&wqueue->usage); + } + fdput(f); + } + + return wqueue; +} +EXPORT_SYMBOL(get_watch_queue); + +/* + * Initialise a watch queue + */ +int watch_queue_init(struct pipe_inode_info *pipe) +{ + struct watch_queue *wqueue; + + wqueue = kzalloc(sizeof(*wqueue), GFP_KERNEL); + if (!wqueue) + return -ENOMEM; + + wqueue->pipe = pipe; + kref_init(&wqueue->usage); + spin_lock_init(&wqueue->lock); + INIT_HLIST_HEAD(&wqueue->watches); + + pipe->watch_queue = wqueue; + return 0; +} diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b6b1f54a7837..5abb5b22ad13 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -50,6 +50,11 @@ struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); #ifdef CONFIG_HARDLOCKUP_DETECTOR + +# ifdef CONFIG_SMP +int __read_mostly sysctl_hardlockup_all_cpu_backtrace; +# endif /* CONFIG_SMP */ + /* * Should we panic when a soft-lockup or hard-lockup occurs: */ @@ -82,16 +87,6 @@ static int __init hardlockup_panic_setup(char *str) } __setup("nmi_watchdog=", hardlockup_panic_setup); -# ifdef CONFIG_SMP -int __read_mostly sysctl_hardlockup_all_cpu_backtrace; - -static int __init hardlockup_all_cpu_backtrace_setup(char *str) -{ - sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); - return 1; -} -__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); -# endif /* CONFIG_SMP */ #endif /* CONFIG_HARDLOCKUP_DETECTOR */ /* @@ -163,6 +158,10 @@ static void lockup_detector_update_enable(void) #define SOFTLOCKUP_RESET ULONG_MAX +#ifdef CONFIG_SMP +int __read_mostly sysctl_softlockup_all_cpu_backtrace; +#endif + /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; @@ -178,13 +177,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; -static int __init softlockup_panic_setup(char *str) -{ - softlockup_panic = simple_strtoul(str, NULL, 0); - return 1; -} -__setup("softlockup_panic=", softlockup_panic_setup); - static int __init nowatchdog_setup(char *str) { watchdog_user_enabled = 0; @@ -206,17 +198,6 @@ static int __init watchdog_thresh_setup(char *str) } __setup("watchdog_thresh=", watchdog_thresh_setup); -#ifdef CONFIG_SMP -int __read_mostly sysctl_softlockup_all_cpu_backtrace; - -static int __init softlockup_all_cpu_backtrace_setup(char *str) -{ - sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); - return 1; -} -__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); -#endif - static void __lockup_detector_cleanup(void); /* @@ -661,7 +642,7 @@ static void proc_watchdog_update(void) * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED */ static int proc_watchdog_common(int which, struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err, old, *param = table->data; @@ -688,7 +669,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, * /proc/sys/kernel/watchdog */ int proc_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); @@ -698,7 +679,7 @@ int proc_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/nmi_watchdog */ int proc_nmi_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!nmi_watchdog_available && write) return -ENOTSUPP; @@ -710,7 +691,7 @@ int proc_nmi_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/soft_watchdog */ int proc_soft_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(SOFT_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); @@ -720,7 +701,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/watchdog_thresh */ int proc_watchdog_thresh(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err, old; @@ -743,7 +724,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, * been brought online, if desired. */ int proc_watchdog_cpumask(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 891ccad5f271..c41c3c17b86a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -145,7 +145,7 @@ enum { /* struct worker is defined in workqueue_internal.h */ struct worker_pool { - spinlock_t lock; /* the pool lock */ + raw_spinlock_t lock; /* the pool lock */ int cpu; /* I: the associated cpu */ int node; /* I: the associated node ID */ int id; /* I: pool ID */ @@ -300,8 +300,9 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ -static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ -static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */ +static DEFINE_RAW_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ +/* wait for manager to go away */ +static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait); static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ @@ -826,7 +827,7 @@ static struct worker *first_idle_worker(struct worker_pool *pool) * Wake up the first idle worker of @pool. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void wake_up_worker(struct worker_pool *pool) { @@ -881,7 +882,7 @@ void wq_worker_sleeping(struct task_struct *task) return; worker->sleeping = 1; - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * The counterpart of the following dec_and_test, implied mb, @@ -900,7 +901,7 @@ void wq_worker_sleeping(struct task_struct *task) if (next) wake_up_process(next->task); } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } /** @@ -911,7 +912,7 @@ void wq_worker_sleeping(struct task_struct *task) * the scheduler to get a worker's last known identity. * * CONTEXT: - * spin_lock_irq(rq->lock) + * raw_spin_lock_irq(rq->lock) * * This function is called during schedule() when a kworker is going * to sleep. It's used by psi to identify aggregation workers during @@ -942,7 +943,7 @@ work_func_t wq_worker_last_func(struct task_struct *task) * Set @flags in @worker->flags and adjust nr_running accordingly. * * CONTEXT: - * spin_lock_irq(pool->lock) + * raw_spin_lock_irq(pool->lock) */ static inline void worker_set_flags(struct worker *worker, unsigned int flags) { @@ -967,7 +968,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags) * Clear @flags in @worker->flags and adjust nr_running accordingly. * * CONTEXT: - * spin_lock_irq(pool->lock) + * raw_spin_lock_irq(pool->lock) */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { @@ -1015,7 +1016,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) * actually occurs, it should be easy to locate the culprit work function. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). * * Return: * Pointer to worker which is executing @work if found, %NULL @@ -1050,7 +1051,7 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool, * nested inside outer list_for_each_entry_safe(). * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void move_linked_works(struct work_struct *work, struct list_head *head, struct work_struct **nextp) @@ -1128,9 +1129,9 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) * As both pwqs and pools are RCU protected, the * following lock operations are safe. */ - spin_lock_irq(&pwq->pool->lock); + raw_spin_lock_irq(&pwq->pool->lock); put_pwq(pwq); - spin_unlock_irq(&pwq->pool->lock); + raw_spin_unlock_irq(&pwq->pool->lock); } } @@ -1163,7 +1164,7 @@ static void pwq_activate_first_delayed(struct pool_workqueue *pwq) * decrement nr_in_flight of its pwq and handle workqueue flushing. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) { @@ -1262,7 +1263,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, if (!pool) goto fail; - spin_lock(&pool->lock); + raw_spin_lock(&pool->lock); /* * work->data is guaranteed to point to pwq only while the work * item is queued on pwq->wq, and both updating work->data to point @@ -1291,11 +1292,11 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, /* work->data points to pwq iff queued, point to pool */ set_work_pool_and_keep_pending(work, pool->id); - spin_unlock(&pool->lock); + raw_spin_unlock(&pool->lock); rcu_read_unlock(); return 1; } - spin_unlock(&pool->lock); + raw_spin_unlock(&pool->lock); fail: rcu_read_unlock(); local_irq_restore(*flags); @@ -1316,7 +1317,7 @@ fail: * work_struct flags. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) @@ -1433,7 +1434,7 @@ retry: if (last_pool && last_pool != pwq->pool) { struct worker *worker; - spin_lock(&last_pool->lock); + raw_spin_lock(&last_pool->lock); worker = find_worker_executing_work(last_pool, work); @@ -1441,11 +1442,11 @@ retry: pwq = worker->current_pwq; } else { /* meh... not running there, queue here */ - spin_unlock(&last_pool->lock); - spin_lock(&pwq->pool->lock); + raw_spin_unlock(&last_pool->lock); + raw_spin_lock(&pwq->pool->lock); } } else { - spin_lock(&pwq->pool->lock); + raw_spin_lock(&pwq->pool->lock); } /* @@ -1458,7 +1459,7 @@ retry: */ if (unlikely(!pwq->refcnt)) { if (wq->flags & WQ_UNBOUND) { - spin_unlock(&pwq->pool->lock); + raw_spin_unlock(&pwq->pool->lock); cpu_relax(); goto retry; } @@ -1490,7 +1491,7 @@ retry: insert_work(pwq, work, worklist, work_flags); out: - spin_unlock(&pwq->pool->lock); + raw_spin_unlock(&pwq->pool->lock); rcu_read_unlock(); } @@ -1759,7 +1760,7 @@ EXPORT_SYMBOL(queue_rcu_work); * necessary. * * LOCKING: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void worker_enter_idle(struct worker *worker) { @@ -1799,7 +1800,7 @@ static void worker_enter_idle(struct worker *worker) * @worker is leaving idle state. Update stats. * * LOCKING: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void worker_leave_idle(struct worker *worker) { @@ -1937,11 +1938,11 @@ static struct worker *create_worker(struct worker_pool *pool) worker_attach_to_pool(worker, pool); /* start the newly created worker */ - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); worker->pool->nr_workers++; worker_enter_idle(worker); wake_up_process(worker->task); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); return worker; @@ -1960,7 +1961,7 @@ fail: * be idle. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void destroy_worker(struct worker *worker) { @@ -1986,7 +1987,7 @@ static void idle_worker_timeout(struct timer_list *t) { struct worker_pool *pool = from_timer(pool, t, idle_timer); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); while (too_many_workers(pool)) { struct worker *worker; @@ -2004,7 +2005,7 @@ static void idle_worker_timeout(struct timer_list *t) destroy_worker(worker); } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } static void send_mayday(struct work_struct *work) @@ -2035,8 +2036,8 @@ static void pool_mayday_timeout(struct timer_list *t) struct worker_pool *pool = from_timer(pool, t, mayday_timer); struct work_struct *work; - spin_lock_irq(&pool->lock); - spin_lock(&wq_mayday_lock); /* for wq->maydays */ + raw_spin_lock_irq(&pool->lock); + raw_spin_lock(&wq_mayday_lock); /* for wq->maydays */ if (need_to_create_worker(pool)) { /* @@ -2049,8 +2050,8 @@ static void pool_mayday_timeout(struct timer_list *t) send_mayday(work); } - spin_unlock(&wq_mayday_lock); - spin_unlock_irq(&pool->lock); + raw_spin_unlock(&wq_mayday_lock); + raw_spin_unlock_irq(&pool->lock); mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } @@ -2069,7 +2070,7 @@ static void pool_mayday_timeout(struct timer_list *t) * may_start_working() %true. * * LOCKING: - * spin_lock_irq(pool->lock) which may be released and regrabbed + * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. Called only from * manager. */ @@ -2078,7 +2079,7 @@ __releases(&pool->lock) __acquires(&pool->lock) { restart: - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); @@ -2094,7 +2095,7 @@ restart: } del_timer_sync(&pool->mayday_timer); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * This is necessary even after a new worker was just successfully * created as @pool->lock was dropped and the new worker might have @@ -2117,7 +2118,7 @@ restart: * and may_start_working() is true. * * CONTEXT: - * spin_lock_irq(pool->lock) which may be released and regrabbed + * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. * * Return: @@ -2140,7 +2141,7 @@ static bool manage_workers(struct worker *worker) pool->manager = NULL; pool->flags &= ~POOL_MANAGER_ACTIVE; - wake_up(&wq_manager_wait); + rcuwait_wake_up(&manager_wait); return true; } @@ -2156,7 +2157,7 @@ static bool manage_workers(struct worker *worker) * call this function to process a work. * * CONTEXT: - * spin_lock_irq(pool->lock) which is released and regrabbed. + * raw_spin_lock_irq(pool->lock) which is released and regrabbed. */ static void process_one_work(struct worker *worker, struct work_struct *work) __releases(&pool->lock) @@ -2238,7 +2239,7 @@ __acquires(&pool->lock) */ set_work_pool_and_clear_pending(work, pool->id); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); lock_map_acquire(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); @@ -2293,7 +2294,7 @@ __acquires(&pool->lock) */ cond_resched(); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* clear cpu intensive status */ if (unlikely(cpu_intensive)) @@ -2319,7 +2320,7 @@ __acquires(&pool->lock) * fetches a work from the top and executes it. * * CONTEXT: - * spin_lock_irq(pool->lock) which may be released and regrabbed + * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. */ static void process_scheduled_works(struct worker *worker) @@ -2361,11 +2362,11 @@ static int worker_thread(void *__worker) /* tell the scheduler that this is a workqueue worker */ set_pf_worker(true); woke_up: - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* am I supposed to die? */ if (unlikely(worker->flags & WORKER_DIE)) { - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); WARN_ON_ONCE(!list_empty(&worker->entry)); set_pf_worker(false); @@ -2431,7 +2432,7 @@ sleep: */ worker_enter_idle(worker); __set_current_state(TASK_IDLE); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); schedule(); goto woke_up; } @@ -2485,7 +2486,7 @@ repeat: should_stop = kthread_should_stop(); /* see whether any pwq is asking for help */ - spin_lock_irq(&wq_mayday_lock); + raw_spin_lock_irq(&wq_mayday_lock); while (!list_empty(&wq->maydays)) { struct pool_workqueue *pwq = list_first_entry(&wq->maydays, @@ -2497,11 +2498,11 @@ repeat: __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); - spin_unlock_irq(&wq_mayday_lock); + raw_spin_unlock_irq(&wq_mayday_lock); worker_attach_to_pool(rescuer, pool); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * Slurp in all works issued via this workqueue and @@ -2529,8 +2530,8 @@ repeat: * being used to relieve memory pressure, don't * incur MAYDAY_INTERVAL delay inbetween. */ - if (need_to_create_worker(pool)) { - spin_lock(&wq_mayday_lock); + if (pwq->nr_active && need_to_create_worker(pool)) { + raw_spin_lock(&wq_mayday_lock); /* * Queue iff we aren't racing destruction * and somebody else hasn't queued it already. @@ -2539,7 +2540,7 @@ repeat: get_pwq(pwq); list_add_tail(&pwq->mayday_node, &wq->maydays); } - spin_unlock(&wq_mayday_lock); + raw_spin_unlock(&wq_mayday_lock); } } @@ -2557,14 +2558,14 @@ repeat: if (need_more_worker(pool)) wake_up_worker(pool); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); worker_detach_from_pool(rescuer); - spin_lock_irq(&wq_mayday_lock); + raw_spin_lock_irq(&wq_mayday_lock); } - spin_unlock_irq(&wq_mayday_lock); + raw_spin_unlock_irq(&wq_mayday_lock); if (should_stop) { __set_current_state(TASK_RUNNING); @@ -2644,7 +2645,7 @@ static void wq_barrier_func(struct work_struct *work) * underneath us, so we can't reliably determine pwq from @target. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void insert_wq_barrier(struct pool_workqueue *pwq, struct wq_barrier *barr, @@ -2731,7 +2732,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, for_each_pwq(pwq, wq) { struct worker_pool *pool = pwq->pool; - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); if (flush_color >= 0) { WARN_ON_ONCE(pwq->flush_color != -1); @@ -2748,7 +2749,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, pwq->work_color = work_color; } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) @@ -2948,9 +2949,9 @@ reflush: for_each_pwq(pwq, wq) { bool drained; - spin_lock_irq(&pwq->pool->lock); + raw_spin_lock_irq(&pwq->pool->lock); drained = !pwq->nr_active && list_empty(&pwq->delayed_works); - spin_unlock_irq(&pwq->pool->lock); + raw_spin_unlock_irq(&pwq->pool->lock); if (drained) continue; @@ -2986,7 +2987,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, return false; } - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* see the comment in try_to_grab_pending() with the same code */ pwq = get_work_pwq(work); if (pwq) { @@ -3002,7 +3003,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, check_flush_dependency(pwq->wq, work); insert_wq_barrier(pwq, barr, work, worker); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); /* * Force a lock recursion deadlock when using flush_work() inside a @@ -3021,7 +3022,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, rcu_read_unlock(); return true; already_gone: - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); rcu_read_unlock(); return false; } @@ -3414,7 +3415,7 @@ static bool wqattrs_equal(const struct workqueue_attrs *a, */ static int init_worker_pool(struct worker_pool *pool) { - spin_lock_init(&pool->lock); + raw_spin_lock_init(&pool->lock); pool->id = -1; pool->cpu = -1; pool->node = NUMA_NO_NODE; @@ -3491,7 +3492,6 @@ static void rcu_free_wq(struct rcu_head *rcu) else free_workqueue_attrs(wq->unbound_attrs); - kfree(wq->rescuer); kfree(wq); } @@ -3504,6 +3504,18 @@ static void rcu_free_pool(struct rcu_head *rcu) kfree(pool); } +/* This returns with the lock held on success (pool manager is inactive). */ +static bool wq_manager_inactive(struct worker_pool *pool) +{ + raw_spin_lock_irq(&pool->lock); + + if (pool->flags & POOL_MANAGER_ACTIVE) { + raw_spin_unlock_irq(&pool->lock); + return false; + } + return true; +} + /** * put_unbound_pool - put a worker_pool * @pool: worker_pool to put @@ -3539,16 +3551,17 @@ static void put_unbound_pool(struct worker_pool *pool) * Become the manager and destroy all workers. This prevents * @pool's workers from blocking on attach_mutex. We're the last * manager and @pool gets freed with the flag set. + * Because of how wq_manager_inactive() works, we will hold the + * spinlock after a successful wait. */ - spin_lock_irq(&pool->lock); - wait_event_lock_irq(wq_manager_wait, - !(pool->flags & POOL_MANAGER_ACTIVE), pool->lock); + rcuwait_wait_event(&manager_wait, wq_manager_inactive(pool), + TASK_UNINTERRUPTIBLE); pool->flags |= POOL_MANAGER_ACTIVE; while ((worker = first_idle_worker(pool))) destroy_worker(worker); WARN_ON(pool->nr_workers || pool->nr_idle); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); mutex_lock(&wq_pool_attach_mutex); if (!list_empty(&pool->workers)) @@ -3704,7 +3717,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) return; /* this function can be called during early boot w/ irq disabled */ - spin_lock_irqsave(&pwq->pool->lock, flags); + raw_spin_lock_irqsave(&pwq->pool->lock, flags); /* * During [un]freezing, the caller is responsible for ensuring that @@ -3727,7 +3740,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) pwq->max_active = 0; } - spin_unlock_irqrestore(&pwq->pool->lock, flags); + raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); } /* initialize newly alloced @pwq which is associated with @wq and @pool */ @@ -4129,9 +4142,9 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, use_dfl_pwq: mutex_lock(&wq->mutex); - spin_lock_irq(&wq->dfl_pwq->pool->lock); + raw_spin_lock_irq(&wq->dfl_pwq->pool->lock); get_pwq(wq->dfl_pwq); - spin_unlock_irq(&wq->dfl_pwq->pool->lock); + raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock); old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq); out_unlock: mutex_unlock(&wq->mutex); @@ -4208,8 +4221,8 @@ static int init_rescuer(struct workqueue_struct *wq) rescuer->rescue_wq = wq; rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); - ret = PTR_ERR_OR_ZERO(rescuer->task); - if (ret) { + if (IS_ERR(rescuer->task)) { + ret = PTR_ERR(rescuer->task); kfree(rescuer); return ret; } @@ -4360,9 +4373,9 @@ void destroy_workqueue(struct workqueue_struct *wq) struct worker *rescuer = wq->rescuer; /* this prevents new queueing */ - spin_lock_irq(&wq_mayday_lock); + raw_spin_lock_irq(&wq_mayday_lock); wq->rescuer = NULL; - spin_unlock_irq(&wq_mayday_lock); + raw_spin_unlock_irq(&wq_mayday_lock); /* rescuer will empty maydays list before exiting */ kthread_stop(rescuer->task); @@ -4376,27 +4389,25 @@ void destroy_workqueue(struct workqueue_struct *wq) mutex_lock(&wq_pool_mutex); mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { - spin_lock_irq(&pwq->pool->lock); + raw_spin_lock_irq(&pwq->pool->lock); if (WARN_ON(pwq_busy(pwq))) { pr_warn("%s: %s has the following busy pwq\n", __func__, wq->name); show_pwq(pwq); - spin_unlock_irq(&pwq->pool->lock); + raw_spin_unlock_irq(&pwq->pool->lock); mutex_unlock(&wq->mutex); mutex_unlock(&wq_pool_mutex); show_workqueue_state(); return; } - spin_unlock_irq(&pwq->pool->lock); + raw_spin_unlock_irq(&pwq->pool->lock); } mutex_unlock(&wq->mutex); - mutex_unlock(&wq_pool_mutex); /* * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. */ - mutex_lock(&wq_pool_mutex); list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); @@ -4558,10 +4569,10 @@ unsigned int work_busy(struct work_struct *work) rcu_read_lock(); pool = get_work_pool(work); if (pool) { - spin_lock_irqsave(&pool->lock, flags); + raw_spin_lock_irqsave(&pool->lock, flags); if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; - spin_unlock_irqrestore(&pool->lock, flags); + raw_spin_unlock_irqrestore(&pool->lock, flags); } rcu_read_unlock(); @@ -4627,11 +4638,11 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) * Carefully copy the associated workqueue's workfn, name and desc. * Keep the original last '\0' in case the original is garbage. */ - probe_kernel_read(&fn, &worker->current_func, sizeof(fn)); - probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq)); - probe_kernel_read(&wq, &pwq->wq, sizeof(wq)); - probe_kernel_read(name, wq->name, sizeof(name) - 1); - probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); + copy_from_kernel_nofault(&fn, &worker->current_func, sizeof(fn)); + copy_from_kernel_nofault(&pwq, &worker->current_pwq, sizeof(pwq)); + copy_from_kernel_nofault(&wq, &pwq->wq, sizeof(wq)); + copy_from_kernel_nofault(name, wq->name, sizeof(name) - 1); + copy_from_kernel_nofault(desc, worker->desc, sizeof(desc) - 1); if (fn || name[0] || desc[0]) { printk("%sWorkqueue: %s %ps", log_lvl, name, fn); @@ -4768,10 +4779,10 @@ void show_workqueue_state(void) pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); for_each_pwq(pwq, wq) { - spin_lock_irqsave(&pwq->pool->lock, flags); + raw_spin_lock_irqsave(&pwq->pool->lock, flags); if (pwq->nr_active || !list_empty(&pwq->delayed_works)) show_pwq(pwq); - spin_unlock_irqrestore(&pwq->pool->lock, flags); + raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); /* * We could be printing a lot from atomic context, e.g. * sysrq-t -> show_workqueue_state(). Avoid triggering @@ -4785,7 +4796,7 @@ void show_workqueue_state(void) struct worker *worker; bool first = true; - spin_lock_irqsave(&pool->lock, flags); + raw_spin_lock_irqsave(&pool->lock, flags); if (pool->nr_workers == pool->nr_idle) goto next_pool; @@ -4804,7 +4815,7 @@ void show_workqueue_state(void) } pr_cont("\n"); next_pool: - spin_unlock_irqrestore(&pool->lock, flags); + raw_spin_unlock_irqrestore(&pool->lock, flags); /* * We could be printing a lot from atomic context, e.g. * sysrq-t -> show_workqueue_state(). Avoid triggering @@ -4834,7 +4845,7 @@ void wq_worker_comm(char *buf, size_t size, struct task_struct *task) struct worker_pool *pool = worker->pool; if (pool) { - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * ->desc tracks information (wq name or * set_worker_desc()) for the latest execution. If @@ -4848,7 +4859,7 @@ void wq_worker_comm(char *buf, size_t size, struct task_struct *task) scnprintf(buf + off, size - off, "-%s", worker->desc); } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } } @@ -4879,7 +4890,7 @@ static void unbind_workers(int cpu) for_each_cpu_worker_pool(pool, cpu) { mutex_lock(&wq_pool_attach_mutex); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * We've blocked all attach/detach operations. Make all workers @@ -4893,7 +4904,7 @@ static void unbind_workers(int cpu) pool->flags |= POOL_DISASSOCIATED; - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); mutex_unlock(&wq_pool_attach_mutex); /* @@ -4919,9 +4930,9 @@ static void unbind_workers(int cpu) * worker blocking could lead to lengthy stalls. Kick off * unbound chain execution of currently pending work items. */ - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); wake_up_worker(pool); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } } @@ -4948,7 +4959,7 @@ static void rebind_workers(struct worker_pool *pool) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); pool->flags &= ~POOL_DISASSOCIATED; @@ -4987,7 +4998,7 @@ static void rebind_workers(struct worker_pool *pool) WRITE_ONCE(worker->flags, worker_flags); } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } /** @@ -5906,7 +5917,7 @@ void __init workqueue_init_early(void) int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; int i, cpu; - WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); + BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags)); |