diff options
Diffstat (limited to 'kernel/trace')
-rw-r--r-- | kernel/trace/Kconfig | 15 | ||||
-rw-r--r-- | kernel/trace/blktrace.c | 7 | ||||
-rw-r--r-- | kernel/trace/bpf_trace.c | 113 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 47 | ||||
-rw-r--r-- | kernel/trace/ring_buffer.c | 44 | ||||
-rw-r--r-- | kernel/trace/ring_buffer_benchmark.c | 2 | ||||
-rw-r--r-- | kernel/trace/rv/monitors/wip/wip.h | 2 | ||||
-rw-r--r-- | kernel/trace/rv/monitors/wwnr/wwnr.h | 2 | ||||
-rw-r--r-- | kernel/trace/trace.c | 106 | ||||
-rw-r--r-- | kernel/trace/trace.h | 31 | ||||
-rw-r--r-- | kernel/trace/trace_event_perf.c | 16 | ||||
-rw-r--r-- | kernel/trace/trace_events.c | 66 | ||||
-rw-r--r-- | kernel/trace/trace_events_hist.c | 190 | ||||
-rw-r--r-- | kernel/trace/trace_events_synth.c | 2 | ||||
-rw-r--r-- | kernel/trace/trace_events_trigger.c | 19 | ||||
-rw-r--r-- | kernel/trace/trace_events_user.c | 3 | ||||
-rw-r--r-- | kernel/trace/trace_kprobe.c | 2 | ||||
-rw-r--r-- | kernel/trace/trace_osnoise.c | 244 | ||||
-rw-r--r-- | kernel/trace/trace_output.c | 71 | ||||
-rw-r--r-- | kernel/trace/trace_probe.c | 67 | ||||
-rw-r--r-- | kernel/trace/trace_probe.h | 19 | ||||
-rw-r--r-- | kernel/trace/trace_probe_tmpl.h | 47 | ||||
-rw-r--r-- | kernel/trace/trace_selftest.c | 9 | ||||
-rw-r--r-- | kernel/trace/trace_uprobe.c | 3 |
24 files changed, 888 insertions, 239 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e9e95c790b8e..197545241ab8 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -46,10 +46,10 @@ config HAVE_DYNAMIC_FTRACE_WITH_ARGS bool help If this is set, then arguments and stack can be found from - the pt_regs passed into the function callback regs parameter + the ftrace_regs passed into the function callback regs parameter by default, even without setting the REGS flag in the ftrace_ops. - This allows for use of regs_get_kernel_argument() and - kernel_stack_pointer(). + This allows for use of ftrace_regs_get_argument() and + ftrace_regs_get_stack_pointer(). config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE bool @@ -82,6 +82,13 @@ config HAVE_OBJTOOL_MCOUNT help Arch supports objtool --mcount +config HAVE_OBJTOOL_NOP_MCOUNT + bool + help + Arch supports the objtool options --mcount with --mnop. + An architecture can select this if it wants to enable nop'ing + of ftrace locations. + config HAVE_C_RECORDMCOUNT bool help @@ -375,6 +382,7 @@ config SCHED_TRACER config HWLAT_TRACER bool "Tracer to detect hardware latencies (like SMIs)" select GENERIC_TRACER + select TRACER_MAX_TRACE help This tracer, when enabled will create one or more kernel threads, depending on what the cpumask file is set to, which each thread @@ -410,6 +418,7 @@ config HWLAT_TRACER config OSNOISE_TRACER bool "OS Noise tracer" select GENERIC_TRACER + select TRACER_MAX_TRACE help In the context of high-performance computing (HPC), the Operating System Noise (osnoise) refers to the interference experienced by an diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a995ea1ef849..918a7d12df8f 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -721,7 +721,7 @@ EXPORT_SYMBOL_GPL(blk_trace_startstop); */ /** - * blk_trace_ioctl: - handle the ioctls associated with tracing + * blk_trace_ioctl - handle the ioctls associated with tracing * @bdev: the block device * @cmd: the ioctl cmd * @arg: the argument data, if any @@ -769,7 +769,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) } /** - * blk_trace_shutdown: - stop and cleanup trace structures + * blk_trace_shutdown - stop and cleanup trace structures * @q: the request queue associated with the device * **/ @@ -1548,7 +1548,8 @@ blk_trace_event_print_binary(struct trace_iterator *iter, int flags, static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) { - if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) + if ((iter->ent->type != TRACE_BLK) || + !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) return TRACE_TYPE_UNHANDLED; return print_one_line(iter, true); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1ed08967fb97..3bbd3f0c810c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -6,6 +6,7 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/bpf.h> +#include <linux/bpf_verifier.h> #include <linux/bpf_perf_event.h> #include <linux/btf.h> #include <linux/filter.h> @@ -773,7 +774,7 @@ BPF_CALL_0(bpf_get_current_task_btf) const struct bpf_func_proto bpf_get_current_task_btf_proto = { .func = bpf_get_current_task_btf, .gpl_only = true, - .ret_type = RET_PTR_TO_BTF_ID, + .ret_type = RET_PTR_TO_BTF_ID_TRUSTED, .ret_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], }; @@ -1456,6 +1457,10 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_get_current_ancestor_cgroup_id: return &bpf_get_current_ancestor_cgroup_id_proto; + case BPF_FUNC_cgrp_storage_get: + return &bpf_cgrp_storage_get_proto; + case BPF_FUNC_cgrp_storage_delete: + return &bpf_cgrp_storage_delete_proto; #endif case BPF_FUNC_send_signal: return &bpf_send_signal_proto; @@ -1480,9 +1485,9 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_task_stack: return &bpf_get_task_stack_proto; case BPF_FUNC_copy_from_user: - return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; + return &bpf_copy_from_user_proto; case BPF_FUNC_copy_from_user_task: - return prog->aux->sleepable ? &bpf_copy_from_user_task_proto : NULL; + return &bpf_copy_from_user_task_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; case BPF_FUNC_per_cpu_ptr: @@ -1490,8 +1495,12 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; case BPF_FUNC_task_storage_get: + if (bpf_prog_check_recur(prog)) + return &bpf_task_storage_get_recur_proto; return &bpf_task_storage_get_proto; case BPF_FUNC_task_storage_delete: + if (bpf_prog_check_recur(prog)) + return &bpf_task_storage_delete_recur_proto; return &bpf_task_storage_delete_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; @@ -2452,6 +2461,8 @@ struct bpf_kprobe_multi_link { unsigned long *addrs; u64 *cookies; u32 cnt; + u32 mods_cnt; + struct module **mods; }; struct bpf_kprobe_multi_run_ctx { @@ -2507,6 +2518,14 @@ error: return err; } +static void kprobe_multi_put_modules(struct module **mods, u32 cnt) +{ + u32 i; + + for (i = 0; i < cnt; i++) + module_put(mods[i]); +} + static void free_user_syms(struct user_syms *us) { kvfree(us->syms); @@ -2519,6 +2538,7 @@ static void bpf_kprobe_multi_link_release(struct bpf_link *link) kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); unregister_fprobe(&kmulti_link->fp); + kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt); } static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link) @@ -2528,6 +2548,7 @@ static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link) kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); kvfree(kmulti_link->addrs); kvfree(kmulti_link->cookies); + kfree(kmulti_link->mods); kfree(kmulti_link); } @@ -2550,7 +2571,7 @@ static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void swap(*cookie_a, *cookie_b); } -static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b) +static int bpf_kprobe_multi_addrs_cmp(const void *a, const void *b) { const unsigned long *addr_a = a, *addr_b = b; @@ -2561,7 +2582,7 @@ static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b) static int bpf_kprobe_multi_cookie_cmp(const void *a, const void *b, const void *priv) { - return __bpf_kprobe_multi_cookie_cmp(a, b); + return bpf_kprobe_multi_addrs_cmp(a, b); } static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx) @@ -2579,7 +2600,7 @@ static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx) return 0; entry_ip = run_ctx->entry_ip; addr = bsearch(&entry_ip, link->addrs, link->cnt, sizeof(entry_ip), - __bpf_kprobe_multi_cookie_cmp); + bpf_kprobe_multi_addrs_cmp); if (!addr) return 0; cookie = link->cookies + (addr - link->addrs); @@ -2663,6 +2684,71 @@ static void symbols_swap_r(void *a, void *b, int size, const void *priv) } } +struct module_addr_args { + unsigned long *addrs; + u32 addrs_cnt; + struct module **mods; + int mods_cnt; + int mods_cap; +}; + +static int module_callback(void *data, const char *name, + struct module *mod, unsigned long addr) +{ + struct module_addr_args *args = data; + struct module **mods; + + /* We iterate all modules symbols and for each we: + * - search for it in provided addresses array + * - if found we check if we already have the module pointer stored + * (we iterate modules sequentially, so we can check just the last + * module pointer) + * - take module reference and store it + */ + if (!bsearch(&addr, args->addrs, args->addrs_cnt, sizeof(addr), + bpf_kprobe_multi_addrs_cmp)) + return 0; + + if (args->mods && args->mods[args->mods_cnt - 1] == mod) + return 0; + + if (args->mods_cnt == args->mods_cap) { + args->mods_cap = max(16, args->mods_cap * 3 / 2); + mods = krealloc_array(args->mods, args->mods_cap, sizeof(*mods), GFP_KERNEL); + if (!mods) + return -ENOMEM; + args->mods = mods; + } + + if (!try_module_get(mod)) + return -EINVAL; + + args->mods[args->mods_cnt] = mod; + args->mods_cnt++; + return 0; +} + +static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u32 addrs_cnt) +{ + struct module_addr_args args = { + .addrs = addrs, + .addrs_cnt = addrs_cnt, + }; + int err; + + /* We return either err < 0 in case of error, ... */ + err = module_kallsyms_on_each_symbol(module_callback, &args); + if (err) { + kprobe_multi_put_modules(args.mods, args.mods_cnt); + kfree(args.mods); + return err; + } + + /* or number of modules found if everything is ok. */ + *mods = args.mods; + return args.mods_cnt; +} + int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_kprobe_multi_link *link = NULL; @@ -2773,10 +2859,25 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr bpf_kprobe_multi_cookie_cmp, bpf_kprobe_multi_cookie_swap, link); + } else { + /* + * We need to sort addrs array even if there are no cookies + * provided, to allow bsearch in get_modules_for_addrs. + */ + sort(addrs, cnt, sizeof(*addrs), + bpf_kprobe_multi_addrs_cmp, NULL); + } + + err = get_modules_for_addrs(&link->mods, addrs, cnt); + if (err < 0) { + bpf_link_cleanup(&link_primer); + return err; } + link->mods_cnt = err; err = register_fprobe_ips(&link->fp, addrs, cnt); if (err) { + kprobe_multi_put_modules(link->mods, link->mods_cnt); bpf_link_cleanup(&link_primer); return err; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 33236241f236..442438b93fe9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -163,7 +163,7 @@ static void ftrace_sync_ipi(void *data) static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) { /* - * If this is a dynamic, RCU, or per CPU ops, or we force list func, + * If this is a dynamic or RCU ops, or we force list func, * then it needs to call the list anyway. */ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_RCU) || @@ -2488,14 +2488,13 @@ ftrace_add_rec_direct(unsigned long ip, unsigned long addr, static void call_direct_funcs(unsigned long ip, unsigned long pip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { - struct pt_regs *regs = ftrace_get_regs(fregs); unsigned long addr; addr = ftrace_find_rec_direct(ip); if (!addr) return; - arch_ftrace_set_direct_caller(regs, addr); + arch_ftrace_set_direct_caller(fregs, addr); } struct ftrace_ops direct_ops = { @@ -2763,6 +2762,19 @@ void __weak ftrace_arch_code_modify_post_process(void) { } +static int update_ftrace_func(ftrace_func_t func) +{ + static ftrace_func_t save_func; + + /* Avoid updating if it hasn't changed */ + if (func == save_func) + return 0; + + save_func = func; + + return ftrace_update_ftrace_func(func); +} + void ftrace_modify_all_code(int command) { int update = command & FTRACE_UPDATE_TRACE_FUNC; @@ -2783,7 +2795,7 @@ void ftrace_modify_all_code(int command) * traced. */ if (update) { - err = ftrace_update_ftrace_func(ftrace_ops_list_func); + err = update_ftrace_func(ftrace_ops_list_func); if (FTRACE_WARN_ON(err)) return; } @@ -2799,7 +2811,7 @@ void ftrace_modify_all_code(int command) /* If irqs are disabled, we are in stop machine */ if (!irqs_disabled()) smp_call_function(ftrace_sync_ipi, NULL, 1); - err = ftrace_update_ftrace_func(ftrace_trace_function); + err = update_ftrace_func(ftrace_trace_function); if (FTRACE_WARN_ON(err)) return; } @@ -3071,8 +3083,6 @@ out: /* * Dynamic ops may be freed, we must make sure that all * callers are done before leaving this function. - * The same goes for freeing the per_cpu data of the per_cpu - * ops. */ if (ops->flags & FTRACE_OPS_FL_DYNAMIC) { /* @@ -4193,6 +4203,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) } found = 1; } + cond_resched(); } while_for_each_ftrace_rec(); out_unlock: mutex_unlock(&ftrace_lock); @@ -7519,8 +7530,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, /* * Check the following for each ops before calling their func: * if RCU flag is set, then rcu_is_watching() must be true - * if PER_CPU is set, then ftrace_function_local_disable() - * must be false * Otherwise test if the ip matches the ops filter * * If any of the above fails then the op->func() is not executed. @@ -7570,8 +7579,8 @@ NOKPROBE_SYMBOL(arch_ftrace_ops_list_func); /* * If there's only one function registered but it does not support - * recursion, needs RCU protection and/or requires per cpu handling, then - * this function will be called by the mcount trampoline. + * recursion, needs RCU protection, then this function will be called + * by the mcount trampoline. */ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) @@ -8258,6 +8267,10 @@ struct kallsyms_data { size_t found; }; +/* This function gets called for all kernel and module symbols + * and returns 1 in case we resolved all the requested symbols, + * 0 otherwise. + */ static int kallsyms_callback(void *data, const char *name, struct module *mod, unsigned long addr) { @@ -8300,17 +8313,19 @@ static int kallsyms_callback(void *data, const char *name, int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs) { struct kallsyms_data args; - int err; + int found_all; memset(addrs, 0, sizeof(*addrs) * cnt); args.addrs = addrs; args.syms = sorted_syms; args.cnt = cnt; args.found = 0; - err = kallsyms_on_each_symbol(kallsyms_callback, &args); - if (err < 0) - return err; - return args.found == args.cnt ? 0 : -ESRCH; + + found_all = kallsyms_on_each_symbol(kallsyms_callback, &args); + if (found_all) + return 0; + found_all = module_kallsyms_on_each_symbol(kallsyms_callback, &args); + return found_all ? 0 : -ESRCH; } #ifdef CONFIG_SYSCTL diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b21bf14bae9b..c366a0a9ddba 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2062,8 +2062,10 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *pages = &cpu_buffer->new_pages; int retries, success; + unsigned long flags; - raw_spin_lock_irq(&cpu_buffer->reader_lock); + /* Can be called at early boot up, where interrupts must not been enabled */ + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* * We are holding the reader lock, so the reader page won't be swapped * in the ring buffer. Now we are racing with the writer trying to @@ -2120,7 +2122,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) * tracing */ RB_WARN_ON(cpu_buffer, !success); - raw_spin_unlock_irq(&cpu_buffer->reader_lock); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); /* free pages if they weren't inserted */ if (!success) { @@ -2248,8 +2250,16 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, rb_update_pages(cpu_buffer); cpu_buffer->nr_pages_to_update = 0; } else { - schedule_work_on(cpu, - &cpu_buffer->update_pages_work); + /* Run directly if possible. */ + migrate_disable(); + if (cpu != smp_processor_id()) { + migrate_enable(); + schedule_work_on(cpu, + &cpu_buffer->update_pages_work); + } else { + update_pages_handler(&cpu_buffer->update_pages_work); + migrate_enable(); + } } } @@ -2298,9 +2308,17 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, if (!cpu_online(cpu_id)) rb_update_pages(cpu_buffer); else { - schedule_work_on(cpu_id, - &cpu_buffer->update_pages_work); - wait_for_completion(&cpu_buffer->update_done); + /* Run directly if possible. */ + migrate_disable(); + if (cpu_id == smp_processor_id()) { + rb_update_pages(cpu_buffer); + migrate_enable(); + } else { + migrate_enable(); + schedule_work_on(cpu_id, + &cpu_buffer->update_pages_work); + wait_for_completion(&cpu_buffer->update_done); + } } cpu_buffer->nr_pages_to_update = 0; @@ -3180,8 +3198,7 @@ static inline void rb_event_discard(struct ring_buffer_event *event) event->time_delta = 1; } -static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer) { local_inc(&cpu_buffer->entries); rb_end_commit(cpu_buffer); @@ -3383,15 +3400,14 @@ void ring_buffer_nest_end(struct trace_buffer *buffer) * * Must be paired with ring_buffer_lock_reserve. */ -int ring_buffer_unlock_commit(struct trace_buffer *buffer, - struct ring_buffer_event *event) +int ring_buffer_unlock_commit(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu = raw_smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; - rb_commit(cpu_buffer, event); + rb_commit(cpu_buffer); rb_wakeups(buffer, cpu_buffer); @@ -3977,7 +3993,7 @@ int ring_buffer_write(struct trace_buffer *buffer, memcpy(body, data, length); - rb_commit(cpu_buffer, event); + rb_commit(cpu_buffer); rb_wakeups(buffer, cpu_buffer); @@ -5998,7 +6014,7 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested) } out: - ring_buffer_unlock_commit(data->buffer, event); + ring_buffer_unlock_commit(data->buffer); return 0; } diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 78e576575b79..aef34673d79d 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -258,7 +258,7 @@ static void ring_buffer_producer(void) hit++; entry = ring_buffer_event_data(event); *entry = smp_processor_id(); - ring_buffer_unlock_commit(buffer, event); + ring_buffer_unlock_commit(buffer); } } end_time = ktime_get(); diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h index dacc37b62a2c..2e373f2c65ed 100644 --- a/kernel/trace/rv/monitors/wip/wip.h +++ b/kernel/trace/rv/monitors/wip/wip.h @@ -27,7 +27,7 @@ struct automaton_wip { bool final_states[state_max_wip]; }; -static struct automaton_wip automaton_wip = { +static const struct automaton_wip automaton_wip = { .state_names = { "preemptive", "non_preemptive" diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h index 118e576b91b4..d0d9c4b8121b 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.h +++ b/kernel/trace/rv/monitors/wwnr/wwnr.h @@ -27,7 +27,7 @@ struct automaton_wwnr { bool final_states[state_max_wwnr]; }; -static struct automaton_wwnr automaton_wwnr = { +static const struct automaton_wwnr automaton_wwnr = { .state_names = { "not_running", "running" diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5cfc95a52bc3..a555a861b978 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -19,7 +19,6 @@ #include <linux/kallsyms.h> #include <linux/security.h> #include <linux/seq_file.h> -#include <linux/notifier.h> #include <linux/irqflags.h> #include <linux/debugfs.h> #include <linux/tracefs.h> @@ -85,7 +84,7 @@ void __init disable_tracing_selftest(const char *reason) #endif /* Pipe tracepoints to printk */ -struct trace_iterator *tracepoint_print_iter; +static struct trace_iterator *tracepoint_print_iter; int tracepoint_printk; static bool tracepoint_printk_stop_on_boot __initdata; static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key); @@ -999,7 +998,7 @@ __buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *ev /* ring_buffer_unlock_commit() enables preemption */ preempt_enable_notrace(); } else - ring_buffer_unlock_commit(buffer, event); + ring_buffer_unlock_commit(buffer); } /** @@ -1421,6 +1420,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr) return false; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); +#define free_snapshot(tr) do { } while (0) #endif /* CONFIG_TRACER_SNAPSHOT */ void tracer_tracing_off(struct trace_array *tr) @@ -1692,6 +1692,8 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) } unsigned long __read_mostly tracing_thresh; + +#ifdef CONFIG_TRACER_MAX_TRACE static const struct file_operations tracing_max_lat_fops; #ifdef LATENCY_FS_NOTIFY @@ -1748,18 +1750,14 @@ void latency_fsnotify(struct trace_array *tr) irq_work_queue(&tr->fsnotify_irqwork); } -#elif defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \ - || defined(CONFIG_OSNOISE_TRACER) +#else /* !LATENCY_FS_NOTIFY */ #define trace_create_maxlat_file(tr, d_tracer) \ trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \ d_tracer, &tr->max_latency, &tracing_max_lat_fops) -#else -#define trace_create_maxlat_file(tr, d_tracer) do { } while (0) #endif -#ifdef CONFIG_TRACER_MAX_TRACE /* * Copy the new maximum trace into the separate maximum-trace * structure. (this way the maximum trace is permanently saved, @@ -1834,14 +1832,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, ring_buffer_record_off(tr->max_buffer.buffer); #ifdef CONFIG_TRACER_SNAPSHOT - if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) - goto out_unlock; + if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) { + arch_spin_unlock(&tr->max_lock); + return; + } #endif swap(tr->array_buffer.buffer, tr->max_buffer.buffer); __update_max_tr(tr, tsk, cpu); - out_unlock: arch_spin_unlock(&tr->max_lock); } @@ -1888,6 +1887,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) __update_max_tr(tr, tsk, cpu); arch_spin_unlock(&tr->max_lock); } + #endif /* CONFIG_TRACER_MAX_TRACE */ static int wait_on_pipe(struct trace_iterator *iter, int full) @@ -5617,7 +5617,7 @@ static const char readme_msg[] = "\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n" "\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n" - "\t <type>\\[<array-size>\\]\n" + "\t symstr, <type>\\[<array-size>\\]\n" #ifdef CONFIG_HIST_TRIGGERS "\t field: <stype> <name>;\n" "\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n" @@ -5678,6 +5678,7 @@ static const char readme_msg[] = "\t [:size=#entries]\n" "\t [:pause][:continue][:clear]\n" "\t [:name=histname1]\n" + "\t [:nohitcount]\n" "\t [:<handler>.<action>]\n" "\t [if <filter>]\n\n" "\t Note, special fields can be used as well:\n" @@ -5724,7 +5725,9 @@ static const char readme_msg[] = "\t .syscall display a syscall id as a syscall name\n" "\t .log2 display log2 value rather than raw number\n" "\t .buckets=size display values in groups of size rather than raw number\n" - "\t .usecs display a common_timestamp in microseconds\n\n" + "\t .usecs display a common_timestamp in microseconds\n" + "\t .percent display a number of percentage value\n" + "\t .graph display a bar-graph of a value\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 'continue' can be used to start or\n" @@ -5732,6 +5735,8 @@ static const char readme_msg[] = "\t The 'clear' parameter will clear the contents of a running\n" "\t hist trigger and leave its current paused/active state\n" "\t unchanged.\n\n" + "\t The 'nohitcount' (or NOHC) parameter will suppress display of\n" + "\t raw hitcount in the histogram.\n\n" "\t The enable_hist and disable_hist triggers can be used to\n" "\t have one event conditionally start and stop another event's\n" "\t already-attached hist trigger. The syntax is analogous to\n" @@ -6572,7 +6577,7 @@ out: return ret; } -#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) +#ifdef CONFIG_TRACER_MAX_TRACE static ssize_t tracing_max_lat_read(struct file *filp, char __user *ubuf, @@ -6796,7 +6801,20 @@ waitagain: ret = print_trace_line(iter); if (ret == TRACE_TYPE_PARTIAL_LINE) { - /* don't print partial lines */ + /* + * If one print_trace_line() fills entire trace_seq in one shot, + * trace_seq_to_user() will returns -EBUSY because save_len == 0, + * In this case, we need to consume it, otherwise, loop will peek + * this event next time, resulting in an infinite loop. + */ + if (save_len == 0) { + iter->seq.full = 0; + trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n"); + trace_consume(iter); + break; + } + + /* In other cases, don't print partial lines */ iter->seq.seq.len = save_len; break; } @@ -7587,7 +7605,7 @@ static const struct file_operations tracing_thresh_fops = { .llseek = generic_file_llseek, }; -#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) +#ifdef CONFIG_TRACER_MAX_TRACE static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -9601,7 +9619,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) create_trace_options_dir(tr); +#ifdef CONFIG_TRACER_MAX_TRACE trace_create_maxlat_file(tr, d_tracer); +#endif if (ftrace_create_function_files(tr, d_tracer)) MEM_FAIL(1, "Could not allocate function filter files"); @@ -9855,41 +9875,41 @@ static __init int tracer_init_tracefs(void) fs_initcall(tracer_init_tracefs); -static int trace_panic_handler(struct notifier_block *this, - unsigned long event, void *unused) -{ - if (ftrace_dump_on_oops) - ftrace_dump(ftrace_dump_on_oops); - return NOTIFY_OK; -} +static int trace_die_panic_handler(struct notifier_block *self, + unsigned long ev, void *unused); static struct notifier_block trace_panic_notifier = { - .notifier_call = trace_panic_handler, - .next = NULL, - .priority = 150 /* priority: INT_MAX >= x >= 0 */ + .notifier_call = trace_die_panic_handler, + .priority = INT_MAX - 1, }; -static int trace_die_handler(struct notifier_block *self, - unsigned long val, - void *data) -{ - switch (val) { - case DIE_OOPS: - if (ftrace_dump_on_oops) - ftrace_dump(ftrace_dump_on_oops); - break; - default: - break; - } - return NOTIFY_OK; -} - static struct notifier_block trace_die_notifier = { - .notifier_call = trace_die_handler, - .priority = 200 + .notifier_call = trace_die_panic_handler, + .priority = INT_MAX - 1, }; /* + * The idea is to execute the following die/panic callback early, in order + * to avoid showing irrelevant information in the trace (like other panic + * notifier functions); we are the 2nd to run, after hung_task/rcu_stall + * warnings get disabled (to prevent potential log flooding). + */ +static int trace_die_panic_handler(struct notifier_block *self, + unsigned long ev, void *unused) +{ + if (!ftrace_dump_on_oops) + return NOTIFY_DONE; + + /* The die notifier requires DIE_OOPS to trigger */ + if (self == &trace_die_notifier && ev != DIE_OOPS) + return NOTIFY_DONE; + + ftrace_dump(ftrace_dump_on_oops); + + return NOTIFY_DONE; +} + +/* * printk is set to max of 1024, we really don't need it that big. * Nothing should be printing 1000 characters anyway. */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d42e24507152..e46a49269be2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -308,8 +308,7 @@ struct trace_array { struct array_buffer max_buffer; bool allocated_snapshot; #endif -#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \ - || defined(CONFIG_OSNOISE_TRACER) +#ifdef CONFIG_TRACER_MAX_TRACE unsigned long max_latency; #ifdef CONFIG_FSNOTIFY struct dentry *d_max_latency; @@ -615,7 +614,7 @@ void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, bool trace_is_tracepoint_string(const char *str); const char *trace_event_format(struct trace_iterator *iter, const char *fmt); void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, - va_list ap); + va_list ap) __printf(2, 0); int trace_empty(struct trace_iterator *iter); @@ -688,12 +687,11 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, void *cond_data); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); -#endif /* CONFIG_TRACER_MAX_TRACE */ -#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \ - || defined(CONFIG_OSNOISE_TRACER)) && defined(CONFIG_FSNOTIFY) +#ifdef CONFIG_FSNOTIFY #define LATENCY_FS_NOTIFY #endif +#endif /* CONFIG_TRACER_MAX_TRACE */ #ifdef LATENCY_FS_NOTIFY void latency_fsnotify(struct trace_array *tr); @@ -1942,8 +1940,6 @@ static inline void tracer_hardirqs_on(unsigned long a0, unsigned long a1) { } static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { } #endif -extern struct trace_iterator *tracepoint_print_iter; - /* * Reset the state of the trace_iterator so that it can read consumed data. * Normally, the trace_iterator is used for reading the data when it is not @@ -1956,17 +1952,30 @@ static __always_inline void trace_iterator_reset(struct trace_iterator *iter) } /* Check the name is good for event/group/fields */ -static inline bool is_good_name(const char *name) +static inline bool __is_good_name(const char *name, bool hash_ok) { - if (!isalpha(*name) && *name != '_') + if (!isalpha(*name) && *name != '_' && (!hash_ok || *name != '-')) return false; while (*++name != '\0') { - if (!isalpha(*name) && !isdigit(*name) && *name != '_') + if (!isalpha(*name) && !isdigit(*name) && *name != '_' && + (!hash_ok || *name != '-')) return false; } return true; } +/* Check the name is good for event/group/fields */ +static inline bool is_good_name(const char *name) +{ + return __is_good_name(name, false); +} + +/* Check the name is good for system */ +static inline bool is_good_system_name(const char *name) +{ + return __is_good_name(name, true); +} + /* Convert certain expected symbols into '_' when generating event names */ static inline void sanitize_event_name(char *name) { diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 61e3a2620fa3..05e791241812 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -251,16 +251,12 @@ int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) struct trace_event_call *tp_event; if (p_event->attr.kprobe_func) { - func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL); - if (!func) - return -ENOMEM; - ret = strncpy_from_user( - func, u64_to_user_ptr(p_event->attr.kprobe_func), - KSYM_NAME_LEN); - if (ret == KSYM_NAME_LEN) - ret = -E2BIG; - if (ret < 0) - goto out; + func = strndup_user(u64_to_user_ptr(p_event->attr.kprobe_func), + KSYM_NAME_LEN); + if (IS_ERR(func)) { + ret = PTR_ERR(func); + return (ret == -EINVAL) ? -E2BIG : ret; + } if (func[0] == '\0') { kfree(func); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f71ea6e79b3c..33e0b4f8ebe6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2796,6 +2796,42 @@ trace_create_new_event(struct trace_event_call *call, return file; } +#define MAX_BOOT_TRIGGERS 32 + +static struct boot_triggers { + const char *event; + char *trigger; +} bootup_triggers[MAX_BOOT_TRIGGERS]; + +static char bootup_trigger_buf[COMMAND_LINE_SIZE]; +static int nr_boot_triggers; + +static __init int setup_trace_triggers(char *str) +{ + char *trigger; + char *buf; + int i; + + strlcpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE); + ring_buffer_expanded = true; + disable_tracing_selftest("running event triggers"); + + buf = bootup_trigger_buf; + for (i = 0; i < MAX_BOOT_TRIGGERS; i++) { + trigger = strsep(&buf, ","); + if (!trigger) + break; + bootup_triggers[i].event = strsep(&trigger, "."); + bootup_triggers[i].trigger = strsep(&trigger, "."); + if (!bootup_triggers[i].trigger) + break; + } + + nr_boot_triggers = i; + return 1; +} +__setup("trace_trigger=", setup_trace_triggers); + /* Add an event to a trace directory */ static int __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) @@ -2812,6 +2848,24 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) return event_define_fields(call); } +static void trace_early_triggers(struct trace_event_file *file, const char *name) +{ + int ret; + int i; + + for (i = 0; i < nr_boot_triggers; i++) { + if (strcmp(name, bootup_triggers[i].event)) + continue; + mutex_lock(&event_mutex); + ret = trigger_process_regex(file, bootup_triggers[i].trigger); + mutex_unlock(&event_mutex); + if (ret) + pr_err("Failed to register trigger '%s' on event %s\n", + bootup_triggers[i].trigger, + bootup_triggers[i].event); + } +} + /* * Just create a descriptor for early init. A descriptor is required * for enabling events at boot. We want to enable events before @@ -2822,12 +2876,19 @@ __trace_early_add_new_event(struct trace_event_call *call, struct trace_array *tr) { struct trace_event_file *file; + int ret; file = trace_create_new_event(call, tr); if (!file) return -ENOMEM; - return event_define_fields(call); + ret = event_define_fields(call); + if (ret) + return ret; + + trace_early_triggers(file, trace_event_name(call)); + + return 0; } struct ftrace_module_file_ops; @@ -3735,6 +3796,8 @@ static __init int event_trace_enable(void) list_add(&call->list, &ftrace_events); } + register_trigger_cmds(); + /* * We need the top trace array to have a working set of trace * points at early init, before the debug files and directories @@ -3749,7 +3812,6 @@ static __init int event_trace_enable(void) register_event_cmds(); - register_trigger_cmds(); return 0; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1c82478e8dff..fcaf226b7744 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -69,7 +69,8 @@ C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), \ C(EXPECT_NUMBER, "Expecting numeric literal"), \ C(UNARY_MINUS_SUBEXPR, "Unary minus not supported in sub-expressions"), \ - C(DIVISION_BY_ZERO, "Division by zero"), + C(DIVISION_BY_ZERO, "Division by zero"), \ + C(NEED_NOHC_VAL, "Non-hitcount value is required for 'nohitcount'"), #undef C #define C(a, b) HIST_ERR_##a @@ -506,6 +507,8 @@ enum hist_field_flags { HIST_FIELD_FL_ALIAS = 1 << 16, HIST_FIELD_FL_BUCKET = 1 << 17, HIST_FIELD_FL_CONST = 1 << 18, + HIST_FIELD_FL_PERCENT = 1 << 19, + HIST_FIELD_FL_GRAPH = 1 << 20, }; struct var_defs { @@ -524,6 +527,7 @@ struct hist_trigger_attrs { bool cont; bool clear; bool ts_in_usecs; + bool no_hitcount; unsigned int map_bits; char *assignment_str[TRACING_MAP_VARS_MAX]; @@ -617,7 +621,7 @@ struct action_data { * event param, and is passed to the synthetic event * invocation. */ - unsigned int var_ref_idx[TRACING_MAP_VARS_MAX]; + unsigned int var_ref_idx[SYNTH_FIELDS_MAX]; struct synth_event *synth_event; bool use_trace_keyword; char *synth_event_name; @@ -1356,6 +1360,8 @@ static const char *hist_field_name(struct hist_field *field, field_name = field->name; } else if (field->flags & HIST_FIELD_FL_TIMESTAMP) field_name = "common_timestamp"; + else if (field->flags & HIST_FIELD_FL_HITCOUNT) + field_name = "hitcount"; if (field_name == NULL) field_name = ""; @@ -1546,7 +1552,10 @@ parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str) ret = parse_assignment(tr, str, attrs); if (ret) goto free; - } else if (strcmp(str, "pause") == 0) + } else if (strcmp(str, "nohitcount") == 0 || + strcmp(str, "NOHC") == 0) + attrs->no_hitcount = true; + else if (strcmp(str, "pause") == 0) attrs->pause = true; else if ((strcmp(str, "cont") == 0) || (strcmp(str, "continue") == 0)) @@ -1705,6 +1714,10 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) flags_str = "buckets"; else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS) flags_str = "usecs"; + else if (hist_field->flags & HIST_FIELD_FL_PERCENT) + flags_str = "percent"; + else if (hist_field->flags & HIST_FIELD_FL_GRAPH) + flags_str = "graph"; return flags_str; } @@ -2173,7 +2186,9 @@ static struct hist_field *create_var_ref(struct hist_trigger_data *hist_data, return ref_field; } } - + /* Sanity check to avoid out-of-bound write on 'hist_data->var_refs' */ + if (hist_data->n_var_refs >= TRACING_MAP_VARS_MAX) + return NULL; ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL); if (ref_field) { if (init_var_ref(ref_field, var_field, system, event_name)) { @@ -2313,6 +2328,14 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, if (ret || !(*buckets)) goto error; *flags |= HIST_FIELD_FL_BUCKET; + } else if (strncmp(modifier, "percent", 7) == 0) { + if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY)) + goto error; + *flags |= HIST_FIELD_FL_PERCENT; + } else if (strncmp(modifier, "graph", 5) == 0) { + if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY)) + goto error; + *flags |= HIST_FIELD_FL_GRAPH; } else { error: hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); @@ -2328,6 +2351,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, hist_data->attrs->ts_in_usecs = true; } else if (strcmp(field_name, "common_cpu") == 0) *flags |= HIST_FIELD_FL_CPU; + else if (strcmp(field_name, "hitcount") == 0) + *flags |= HIST_FIELD_FL_HITCOUNT; else { field = trace_find_event_field(file->event_call, field_name); if (!field || !field->size) { @@ -3586,6 +3611,7 @@ static int parse_action_params(struct trace_array *tr, char *params, while (params) { if (data->n_params >= SYNTH_FIELDS_MAX) { hist_err(tr, HIST_ERR_TOO_MANY_PARAMS, 0); + ret = -EINVAL; goto out; } @@ -3922,6 +3948,10 @@ static int trace_action_create(struct hist_trigger_data *hist_data, lockdep_assert_held(&event_mutex); + /* Sanity check to avoid out-of-bound write on 'data->var_ref_idx' */ + if (data->n_params > SYNTH_FIELDS_MAX) + return -EINVAL; + if (data->use_trace_keyword) synth_event_name = data->synth_event_name; else @@ -4328,8 +4358,8 @@ static int create_var_field(struct hist_trigger_data *hist_data, static int create_val_fields(struct hist_trigger_data *hist_data, struct trace_event_file *file) { + unsigned int i, j = 1, n_hitcount = 0; char *fields_str, *field_str; - unsigned int i, j = 1; int ret; ret = create_hitcount_val(hist_data); @@ -4346,8 +4376,10 @@ static int create_val_fields(struct hist_trigger_data *hist_data, if (!field_str) break; - if (strcmp(field_str, "hitcount") == 0) - continue; + if (strcmp(field_str, "hitcount") == 0) { + if (!n_hitcount++) + continue; + } ret = create_val_field(hist_data, j++, file, field_str); if (ret) @@ -4357,6 +4389,12 @@ static int create_val_fields(struct hist_trigger_data *hist_data, if (fields_str && (strcmp(fields_str, "hitcount") != 0)) ret = -EINVAL; out: + /* There is only raw hitcount but nohitcount suppresses it. */ + if (j == 1 && hist_data->attrs->no_hitcount) { + hist_err(hist_data->event_file->tr, HIST_ERR_NEED_NOHC_VAL, 0); + ret = -ENOENT; + } + return ret; } @@ -5285,33 +5323,101 @@ static void hist_trigger_print_key(struct seq_file *m, seq_puts(m, "}"); } +/* Get the 100 times of the percentage of @val in @total */ +static inline unsigned int __get_percentage(u64 val, u64 total) +{ + if (!total) + goto div0; + + if (val < (U64_MAX / 10000)) + return (unsigned int)div64_ul(val * 10000, total); + + total = div64_u64(total, 10000); + if (!total) + goto div0; + + return (unsigned int)div64_ul(val, total); +div0: + return val ? UINT_MAX : 0; +} + +#define BAR_CHAR '#' + +static inline const char *__fill_bar_str(char *buf, int size, u64 val, u64 max) +{ + unsigned int len = __get_percentage(val, max); + int i; + + if (len == UINT_MAX) { + snprintf(buf, size, "[ERROR]"); + return buf; + } + + len = len * size / 10000; + for (i = 0; i < len && i < size; i++) + buf[i] = BAR_CHAR; + while (i < size) + buf[i++] = ' '; + buf[size] = '\0'; + + return buf; +} + +struct hist_val_stat { + u64 max; + u64 total; +}; + +static void hist_trigger_print_val(struct seq_file *m, unsigned int idx, + const char *field_name, unsigned long flags, + struct hist_val_stat *stats, + struct tracing_map_elt *elt) +{ + u64 val = tracing_map_read_sum(elt, idx); + unsigned int pc; + char bar[21]; + + if (flags & HIST_FIELD_FL_PERCENT) { + pc = __get_percentage(val, stats[idx].total); + if (pc == UINT_MAX) + seq_printf(m, " %s (%%):[ERROR]", field_name); + else + seq_printf(m, " %s (%%): %3u.%02u", field_name, + pc / 100, pc % 100); + } else if (flags & HIST_FIELD_FL_GRAPH) { + seq_printf(m, " %s: %20s", field_name, + __fill_bar_str(bar, 20, val, stats[idx].max)); + } else if (flags & HIST_FIELD_FL_HEX) { + seq_printf(m, " %s: %10llx", field_name, val); + } else { + seq_printf(m, " %s: %10llu", field_name, val); + } +} + static void hist_trigger_entry_print(struct seq_file *m, struct hist_trigger_data *hist_data, + struct hist_val_stat *stats, void *key, struct tracing_map_elt *elt) { const char *field_name; - unsigned int i; + unsigned int i = HITCOUNT_IDX; + unsigned long flags; hist_trigger_print_key(m, hist_data, key, elt); - seq_printf(m, " hitcount: %10llu", - tracing_map_read_sum(elt, HITCOUNT_IDX)); + /* At first, show the raw hitcount if !nohitcount */ + if (!hist_data->attrs->no_hitcount) + hist_trigger_print_val(m, i, "hitcount", 0, stats, elt); for (i = 1; i < hist_data->n_vals; i++) { field_name = hist_field_name(hist_data->fields[i], 0); - - if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR || - hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR) + flags = hist_data->fields[i]->flags; + if (flags & HIST_FIELD_FL_VAR || flags & HIST_FIELD_FL_EXPR) continue; - if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) { - seq_printf(m, " %s: %10llx", field_name, - tracing_map_read_sum(elt, i)); - } else { - seq_printf(m, " %s: %10llu", field_name, - tracing_map_read_sum(elt, i)); - } + seq_puts(m, " "); + hist_trigger_print_val(m, i, field_name, flags, stats, elt); } print_actions(m, hist_data, elt); @@ -5324,7 +5430,9 @@ static int print_entries(struct seq_file *m, { struct tracing_map_sort_entry **sort_entries = NULL; struct tracing_map *map = hist_data->map; - int i, n_entries; + int i, j, n_entries; + struct hist_val_stat *stats = NULL; + u64 val; n_entries = tracing_map_sort_entries(map, hist_data->sort_keys, hist_data->n_sort_keys, @@ -5332,11 +5440,34 @@ static int print_entries(struct seq_file *m, if (n_entries < 0) return n_entries; + /* Calculate the max and the total for each field if needed. */ + for (j = 0; j < hist_data->n_vals; j++) { + if (!(hist_data->fields[j]->flags & + (HIST_FIELD_FL_PERCENT | HIST_FIELD_FL_GRAPH))) + continue; + if (!stats) { + stats = kcalloc(hist_data->n_vals, sizeof(*stats), + GFP_KERNEL); + if (!stats) { + n_entries = -ENOMEM; + goto out; + } + } + for (i = 0; i < n_entries; i++) { + val = tracing_map_read_sum(sort_entries[i]->elt, j); + stats[j].total += val; + if (stats[j].max < val) + stats[j].max = val; + } + } + for (i = 0; i < n_entries; i++) - hist_trigger_entry_print(m, hist_data, + hist_trigger_entry_print(m, hist_data, stats, sort_entries[i]->key, sort_entries[i]->elt); + kfree(stats); +out: tracing_map_destroy_sort_entries(sort_entries, n_entries); return n_entries; @@ -5726,6 +5857,7 @@ static int event_hist_trigger_print(struct seq_file *m, struct hist_trigger_data *hist_data = data->private_data; struct hist_field *field; bool have_var = false; + bool show_val = false; unsigned int i; seq_puts(m, HIST_PREFIX); @@ -5756,12 +5888,16 @@ static int event_hist_trigger_print(struct seq_file *m, continue; } - if (i == HITCOUNT_IDX) + if (i == HITCOUNT_IDX) { + if (hist_data->attrs->no_hitcount) + continue; seq_puts(m, "hitcount"); - else { - seq_puts(m, ","); + } else { + if (show_val) + seq_puts(m, ","); hist_field_print(m, field); } + show_val = true; } if (have_var) { @@ -5812,6 +5948,8 @@ static int event_hist_trigger_print(struct seq_file *m, seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits)); if (hist_data->enable_timestamps) seq_printf(m, ":clock=%s", hist_data->attrs->clock); + if (hist_data->attrs->no_hitcount) + seq_puts(m, ":nohitcount"); print_actions_spec(m, hist_data); @@ -6438,7 +6576,7 @@ enable: if (se) se->ref++; out: - if (ret == 0) + if (ret == 0 && glob[0]) hist_err_clear(); return ret; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index c3b582d19b62..67592eed0be8 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1282,12 +1282,12 @@ static int __create_synth_event(const char *name, const char *raw_fields) goto err_free_arg; } - fields[n_fields++] = field; if (n_fields == SYNTH_FIELDS_MAX) { synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0); ret = -EINVAL; goto err_free_arg; } + fields[n_fields++] = field; n_fields_this_loop++; } diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 918730d74932..e535959939d3 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1067,7 +1067,14 @@ int set_trigger_filter(char *filter_str, /* The filter is for the 'trigger' event, not the triggered event */ ret = create_event_filter(file->tr, file->event_call, - filter_str, false, &filter); + filter_str, true, &filter); + + /* Only enabled set_str for error handling */ + if (filter) { + kfree(filter->filter_string); + filter->filter_string = NULL; + } + /* * If create_event_filter() fails, filter still needs to be freed. * Which the calling code will do with data->filter. @@ -1078,8 +1085,14 @@ int set_trigger_filter(char *filter_str, rcu_assign_pointer(data->filter, filter); if (tmp) { - /* Make sure the call is done with the filter */ - tracepoint_synchronize_unregister(); + /* + * Make sure the call is done with the filter. + * It is possible that a filter could fail at boot up, + * and then this path will be called. Avoid the synchronization + * in that case. + */ + if (system_state != SYSTEM_BOOTING) + tracepoint_synchronize_unregister(); free_event_filter(tmp); } diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 539b08ae7020..908e8a13c675 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1359,6 +1359,7 @@ put_user_lock: put_user: user_event_destroy_fields(user); user_event_destroy_validators(user); + kfree(user->call.print_fmt); kfree(user); return ret; } @@ -1488,7 +1489,7 @@ static ssize_t user_events_write(struct file *file, const char __user *ubuf, if (unlikely(*ppos != 0)) return -EFAULT; - if (unlikely(import_single_range(WRITE, (char __user *)ubuf, + if (unlikely(import_single_range(ITER_SOURCE, (char __user *)ubuf, count, &iov, &i))) return -EFAULT; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5a75b039e586..ee77c8203bd5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1344,7 +1344,6 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, return; fbuffer.regs = regs; - entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->ip = (unsigned long)tk->rp.kp.addr; store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); @@ -1385,7 +1384,6 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, return; fbuffer.regs = regs; - entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = get_kretprobe_retaddr(ri); store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 4300c5dc4e5d..94c1b5eb1dc0 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -49,6 +49,28 @@ #define DEFAULT_TIMERLAT_PRIO 95 /* FIFO 95 */ /* + * osnoise/options entries. + */ +enum osnoise_options_index { + OSN_DEFAULTS = 0, + OSN_WORKLOAD, + OSN_PANIC_ON_STOP, + OSN_PREEMPT_DISABLE, + OSN_IRQ_DISABLE, + OSN_MAX +}; + +static const char * const osnoise_options_str[OSN_MAX] = { + "DEFAULTS", + "OSNOISE_WORKLOAD", + "PANIC_ON_STOP", + "OSNOISE_PREEMPT_DISABLE", + "OSNOISE_IRQ_DISABLE" }; + +#define OSN_DEFAULT_OPTIONS 0x2 +static unsigned long osnoise_options = OSN_DEFAULT_OPTIONS; + +/* * trace_array of the enabled osnoise/timerlat instances. */ struct osnoise_instance { @@ -1173,11 +1195,12 @@ trace_sched_switch_callback(void *data, bool preempt, unsigned int prev_state) { struct osnoise_variables *osn_var = this_cpu_osn_var(); + int workload = test_bit(OSN_WORKLOAD, &osnoise_options); - if (p->pid != osn_var->pid) + if ((p->pid != osn_var->pid) || !workload) thread_exit(osn_var, p); - if (n->pid != osn_var->pid) + if ((n->pid != osn_var->pid) || !workload) thread_entry(osn_var, n); } @@ -1255,6 +1278,9 @@ static __always_inline void osnoise_stop_tracing(void) trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, "stop tracing hit on cpu %d\n", smp_processor_id()); + if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options)) + panic("tracer hit stop condition on CPU %d\n", smp_processor_id()); + tracer_tracing_off(tr); } rcu_read_unlock(); @@ -1289,12 +1315,14 @@ static void notify_new_max_latency(u64 latency) */ static int run_osnoise(void) { + bool disable_irq = test_bit(OSN_IRQ_DISABLE, &osnoise_options); struct osnoise_variables *osn_var = this_cpu_osn_var(); u64 start, sample, last_sample; u64 last_int_count, int_count; s64 noise = 0, max_noise = 0; s64 total, last_total = 0; struct osnoise_sample s; + bool disable_preemption; unsigned int threshold; u64 runtime, stop_in; u64 sum_noise = 0; @@ -1302,6 +1330,12 @@ static int run_osnoise(void) int ret = -1; /* + * Disabling preemption is only required if IRQs are enabled, + * and the options is set on. + */ + disable_preemption = !disable_irq && test_bit(OSN_PREEMPT_DISABLE, &osnoise_options); + + /* * Considers the current thread as the workload. */ osn_var->pid = current->pid; @@ -1317,6 +1351,15 @@ static int run_osnoise(void) threshold = tracing_thresh ? : 5000; /* + * Apply PREEMPT and IRQ disabled options. + */ + if (disable_irq) + local_irq_disable(); + + if (disable_preemption) + preempt_disable(); + + /* * Make sure NMIs see sampling first */ osn_var->sampling = true; @@ -1403,16 +1446,21 @@ static int run_osnoise(void) * cond_resched() */ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { - local_irq_disable(); + if (!disable_irq) + local_irq_disable(); + rcu_momentary_dyntick_idle(); - local_irq_enable(); + + if (!disable_irq) + local_irq_enable(); } /* * For the non-preemptive kernel config: let threads runs, if - * they so wish. + * they so wish, unless set not do to so. */ - cond_resched(); + if (!disable_irq && !disable_preemption) + cond_resched(); last_sample = sample; last_int_count = int_count; @@ -1432,6 +1480,15 @@ static int run_osnoise(void) barrier(); /* + * Return to the preemptive state. + */ + if (disable_preemption) + preempt_enable(); + + if (disable_irq) + local_irq_enable(); + + /* * Save noise info. */ s.noise = time_to_us(sum_noise); @@ -1710,9 +1767,16 @@ static void stop_kthread(unsigned int cpu) struct task_struct *kthread; kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; - if (kthread) + if (kthread) { kthread_stop(kthread); - per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + } else { + if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + per_cpu(per_cpu_osnoise_var, cpu).sampling = false; + barrier(); + return; + } + } } /* @@ -1746,6 +1810,13 @@ static int start_kthread(unsigned int cpu) snprintf(comm, 24, "timerlat/%d", cpu); main = timerlat_main; } else { + /* if no workload, just return */ + if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + per_cpu(per_cpu_osnoise_var, cpu).sampling = true; + barrier(); + return 0; + } + snprintf(comm, 24, "osnoise/%d", cpu); } @@ -1861,6 +1932,150 @@ static void osnoise_init_hotplug_support(void) #endif /* CONFIG_HOTPLUG_CPU */ /* + * seq file functions for the osnoise/options file. + */ +static void *s_options_start(struct seq_file *s, loff_t *pos) +{ + int option = *pos; + + mutex_lock(&interface_lock); + + if (option >= OSN_MAX) + return NULL; + + return pos; +} + +static void *s_options_next(struct seq_file *s, void *v, loff_t *pos) +{ + int option = ++(*pos); + + if (option >= OSN_MAX) + return NULL; + + return pos; +} + +static int s_options_show(struct seq_file *s, void *v) +{ + loff_t *pos = v; + int option = *pos; + + if (option == OSN_DEFAULTS) { + if (osnoise_options == OSN_DEFAULT_OPTIONS) + seq_printf(s, "%s", osnoise_options_str[option]); + else + seq_printf(s, "NO_%s", osnoise_options_str[option]); + goto out; + } + + if (test_bit(option, &osnoise_options)) + seq_printf(s, "%s", osnoise_options_str[option]); + else + seq_printf(s, "NO_%s", osnoise_options_str[option]); + +out: + if (option != OSN_MAX) + seq_puts(s, " "); + + return 0; +} + +static void s_options_stop(struct seq_file *s, void *v) +{ + seq_puts(s, "\n"); + mutex_unlock(&interface_lock); +} + +static const struct seq_operations osnoise_options_seq_ops = { + .start = s_options_start, + .next = s_options_next, + .show = s_options_show, + .stop = s_options_stop +}; + +static int osnoise_options_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &osnoise_options_seq_ops); +}; + +/** + * osnoise_options_write - Write function for "options" entry + * @filp: The active open file structure + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in @file + * + * Writing the option name sets the option, writing the "NO_" + * prefix in front of the option name disables it. + * + * Writing "DEFAULTS" resets the option values to the default ones. + */ +static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + int running, option, enable, retval; + char buf[256], *option_str; + + if (cnt >= 256) + return -EINVAL; + + if (copy_from_user(buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (strncmp(buf, "NO_", 3)) { + option_str = strstrip(buf); + enable = true; + } else { + option_str = strstrip(&buf[3]); + enable = false; + } + + option = match_string(osnoise_options_str, OSN_MAX, option_str); + if (option < 0) + return -EINVAL; + + /* + * trace_types_lock is taken to avoid concurrency on start/stop. + */ + mutex_lock(&trace_types_lock); + running = osnoise_has_registered_instances(); + if (running) + stop_per_cpu_kthreads(); + + mutex_lock(&interface_lock); + /* + * avoid CPU hotplug operations that might read options. + */ + cpus_read_lock(); + + retval = cnt; + + if (enable) { + if (option == OSN_DEFAULTS) + osnoise_options = OSN_DEFAULT_OPTIONS; + else + set_bit(option, &osnoise_options); + } else { + if (option == OSN_DEFAULTS) + retval = -EINVAL; + else + clear_bit(option, &osnoise_options); + } + + cpus_read_unlock(); + mutex_unlock(&interface_lock); + + if (running) + start_per_cpu_kthreads(); + mutex_unlock(&trace_types_lock); + + return retval; +} + +/* * osnoise_cpus_read - Read function for reading the "cpus" file * @filp: The active open file structure * @ubuf: The userspace provided buffer to read value into @@ -2042,6 +2257,14 @@ static const struct file_operations cpus_fops = { .llseek = generic_file_llseek, }; +static const struct file_operations osnoise_options_fops = { + .open = osnoise_options_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = osnoise_options_write +}; + #ifdef CONFIG_TIMERLAT_TRACER #ifdef CONFIG_STACKTRACE static int init_timerlat_stack_tracefs(struct dentry *top_dir) @@ -2128,6 +2351,11 @@ static int init_tracefs(void) if (!tmp) goto err; + tmp = trace_create_file("options", TRACE_MODE_WRITE, top_dir, NULL, + &osnoise_options_fops); + if (!tmp) + goto err; + ret = init_timerlat_tracefs(top_dir); if (ret) goto err; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 67f47ea27921..57a13b61f186 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -11,6 +11,7 @@ #include <linux/kprobes.h> #include <linux/sched/clock.h> #include <linux/sched/mm.h> +#include <linux/idr.h> #include "trace_output.h" @@ -21,8 +22,6 @@ DECLARE_RWSEM(trace_event_sem); static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; -static int next_event_type = __TRACE_LAST_TYPE; - enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -323,8 +322,9 @@ void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...) } EXPORT_SYMBOL(trace_event_printf); -static int trace_output_raw(struct trace_iterator *iter, char *name, - char *fmt, va_list ap) +static __printf(3, 0) +int trace_output_raw(struct trace_iterator *iter, char *name, + char *fmt, va_list ap) { struct trace_seq *s = &iter->seq; @@ -688,38 +688,23 @@ struct trace_event *ftrace_find_event(int type) return NULL; } -static LIST_HEAD(ftrace_event_list); +static DEFINE_IDA(trace_event_ida); -static int trace_search_list(struct list_head **list) +static void free_trace_event_type(int type) { - struct trace_event *e = NULL, *iter; - int next = __TRACE_LAST_TYPE; - - if (list_empty(&ftrace_event_list)) { - *list = &ftrace_event_list; - return next; - } + if (type >= __TRACE_LAST_TYPE) + ida_free(&trace_event_ida, type); +} - /* - * We used up all possible max events, - * lets see if somebody freed one. - */ - list_for_each_entry(iter, &ftrace_event_list, list) { - if (iter->type != next) { - e = iter; - break; - } - next++; - } +static int alloc_trace_event_type(void) +{ + int next; - /* Did we used up all 65 thousand events??? */ - if (next > TRACE_EVENT_TYPE_MAX) + /* Skip static defined type numbers */ + next = ida_alloc_range(&trace_event_ida, __TRACE_LAST_TYPE, + TRACE_EVENT_TYPE_MAX, GFP_KERNEL); + if (next < 0) return 0; - - if (e) - *list = &e->list; - else - *list = &ftrace_event_list; return next; } @@ -761,28 +746,10 @@ int register_trace_event(struct trace_event *event) if (WARN_ON(!event->funcs)) goto out; - INIT_LIST_HEAD(&event->list); - if (!event->type) { - struct list_head *list = NULL; - - if (next_event_type > TRACE_EVENT_TYPE_MAX) { - - event->type = trace_search_list(&list); - if (!event->type) - goto out; - - } else { - - event->type = next_event_type++; - list = &ftrace_event_list; - } - - if (WARN_ON(ftrace_find_event(event->type))) + event->type = alloc_trace_event_type(); + if (!event->type) goto out; - - list_add_tail(&event->list, list); - } else if (WARN(event->type > __TRACE_LAST_TYPE, "Need to add type to trace.h")) { goto out; @@ -819,7 +786,7 @@ EXPORT_SYMBOL_GPL(register_trace_event); int __unregister_trace_event(struct trace_event *event) { hlist_del(&event->node); - list_del(&event->list); + free_trace_event_type(event->type); return 0; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 36dff277de46..01ebabbbe8c9 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -76,9 +76,11 @@ const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; /* Fetch type information table */ static const struct fetch_type probe_fetch_types[] = { /* Special types */ - __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, + __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, 1, "__data_loc char[]"), - __ASSIGN_FETCH_TYPE("ustring", string, string, sizeof(u32), 1, + __ASSIGN_FETCH_TYPE("ustring", string, string, sizeof(u32), 1, 1, + "__data_loc char[]"), + __ASSIGN_FETCH_TYPE("symstr", string, string, sizeof(u32), 1, 1, "__data_loc char[]"), /* Basic types */ ASSIGN_FETCH_TYPE(u8, u8, 0), @@ -98,10 +100,15 @@ static const struct fetch_type probe_fetch_types[] = { ASSIGN_FETCH_TYPE_END }; -static const struct fetch_type *find_fetch_type(const char *type) +static const struct fetch_type *find_fetch_type(const char *type, unsigned long flags) { int i; + /* Reject the symbol/symstr for uprobes */ + if (type && (flags & TPARG_FL_USER) && + (!strcmp(type, "symbol") || !strcmp(type, "symstr"))) + return NULL; + if (!type) type = DEFAULT_FETCH_TYPE_STR; @@ -119,13 +126,13 @@ static const struct fetch_type *find_fetch_type(const char *type) switch (bs) { case 8: - return find_fetch_type("u8"); + return find_fetch_type("u8", flags); case 16: - return find_fetch_type("u16"); + return find_fetch_type("u16", flags); case 32: - return find_fetch_type("u32"); + return find_fetch_type("u32", flags); case 64: - return find_fetch_type("u64"); + return find_fetch_type("u64", flags); default: goto fail; } @@ -246,7 +253,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, return -EINVAL; } strlcpy(buf, event, slash - event + 1); - if (!is_good_name(buf)) { + if (!is_good_system_name(buf)) { trace_probe_log_err(offset, BAD_GROUP_NAME); return -EINVAL; } @@ -478,7 +485,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, DEREF_OPEN_BRACE); return -EINVAL; } else { - const struct fetch_type *t2 = find_fetch_type(NULL); + const struct fetch_type *t2 = find_fetch_type(NULL, flags); *tmp = '\0'; ret = parse_probe_arg(arg, t2, &code, end, flags, offs); @@ -630,9 +637,9 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, /* The type of $comm must be "string", and not an array. */ if (parg->count || (t && strcmp(t, "string"))) goto out; - parg->type = find_fetch_type("string"); + parg->type = find_fetch_type("string", flags); } else - parg->type = find_fetch_type(t); + parg->type = find_fetch_type(t, flags); if (!parg->type) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE); goto out; @@ -662,16 +669,26 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, ret = -EINVAL; /* Store operation */ - if (!strcmp(parg->type->name, "string") || - !strcmp(parg->type->name, "ustring")) { - if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF && - code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM && - code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) { - trace_probe_log_err(offset + (t ? (t - arg) : 0), - BAD_STRING); - goto fail; + if (parg->type->is_string) { + if (!strcmp(parg->type->name, "symstr")) { + if (code->op != FETCH_OP_REG && code->op != FETCH_OP_STACK && + code->op != FETCH_OP_RETVAL && code->op != FETCH_OP_ARG && + code->op != FETCH_OP_DEREF && code->op != FETCH_OP_TP_ARG) { + trace_probe_log_err(offset + (t ? (t - arg) : 0), + BAD_SYMSTRING); + goto fail; + } + } else { + if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF && + code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM && + code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) { + trace_probe_log_err(offset + (t ? (t - arg) : 0), + BAD_STRING); + goto fail; + } } - if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM || + if (!strcmp(parg->type->name, "symstr") || + (code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM || code->op == FETCH_OP_DATA) || code->op == FETCH_OP_TP_ARG || parg->count) { /* @@ -679,6 +696,8 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, * must be kept, and if parg->count != 0, this is an * array of string pointers instead of string address * itself. + * For the symstr, it doesn't need to dereference, thus + * it just get the value. */ code++; if (code->op != FETCH_OP_NOP) { @@ -690,6 +709,8 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (!strcmp(parg->type->name, "ustring") || code->op == FETCH_OP_UDEREF) code->op = FETCH_OP_ST_USTRING; + else if (!strcmp(parg->type->name, "symstr")) + code->op = FETCH_OP_ST_SYMSTR; else code->op = FETCH_OP_ST_STRING; code->size = parg->type->size; @@ -919,8 +940,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, for (i = 0; i < tp->nr_args; i++) { parg = tp->args + i; if (parg->count) { - if ((strcmp(parg->type->name, "string") == 0) || - (strcmp(parg->type->name, "ustring") == 0)) + if (parg->type->is_string) fmt = ", __get_str(%s[%d])"; else fmt = ", REC->%s[%d]"; @@ -928,8 +948,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, pos += snprintf(buf + pos, LEN_OR_ZERO, fmt, parg->name, j); } else { - if ((strcmp(parg->type->name, "string") == 0) || - (strcmp(parg->type->name, "ustring") == 0)) + if (parg->type->is_string) fmt = ", __get_str(%s)"; else fmt = ", REC->%s"; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index de38f1c03776..23acfd1c3812 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -98,6 +98,7 @@ enum fetch_op { FETCH_OP_ST_UMEM, /* Mem: .offset, .size */ FETCH_OP_ST_STRING, /* String: .offset, .size */ FETCH_OP_ST_USTRING, /* User String: .offset, .size */ + FETCH_OP_ST_SYMSTR, /* Kernel Symbol String: .offset, .size */ // Stage 4 (modify) op FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */ // Stage 5 (loop) op @@ -133,7 +134,8 @@ struct fetch_insn { struct fetch_type { const char *name; /* Name of type */ size_t size; /* Byte size of type */ - int is_signed; /* Signed flag */ + bool is_signed; /* Signed flag */ + bool is_string; /* String flag */ print_type_func_t print; /* Print functions */ const char *fmt; /* Format string */ const char *fmttype; /* Name in format file */ @@ -177,16 +179,19 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(symbol); #define _ADDR_FETCH_TYPE(t) __ADDR_FETCH_TYPE(t) #define ADDR_FETCH_TYPE _ADDR_FETCH_TYPE(BITS_PER_LONG) -#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ - {.name = _name, \ +#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, str, _fmttype) \ + {.name = _name, \ .size = _size, \ - .is_signed = sign, \ + .is_signed = (bool)sign, \ + .is_string = (bool)str, \ .print = PRINT_TYPE_FUNC_NAME(ptype), \ .fmt = PRINT_TYPE_FMT_NAME(ptype), \ .fmttype = _fmttype, \ } + +/* Non string types can use these macros */ #define _ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ - __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, #_fmttype) + __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, 0, #_fmttype) #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ _ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, ptype) @@ -353,7 +358,8 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char #define TPARG_FL_KERNEL BIT(1) #define TPARG_FL_FENTRY BIT(2) #define TPARG_FL_TPOINT BIT(3) -#define TPARG_FL_MASK GENMASK(3, 0) +#define TPARG_FL_USER BIT(4) +#define TPARG_FL_MASK GENMASK(4, 0) extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *argv, unsigned int flags); @@ -431,6 +437,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(ARRAY_TOO_BIG, "Array number is too big"), \ C(BAD_TYPE, "Unknown type is specified"), \ C(BAD_STRING, "String accepts only memory argument"), \ + C(BAD_SYMSTRING, "Symbol String doesn't accept data/userdata"), \ C(BAD_BITFIELD, "Invalid bitfield"), \ C(ARG_NAME_TOO_LONG, "Argument name is too long"), \ C(NO_ARG_NAME, "Argument name is not specified"), \ diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index b3bdb8ddb862..5cea672243f6 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -67,6 +67,37 @@ probe_mem_read(void *dest, void *src, size_t size); static nokprobe_inline int probe_mem_read_user(void *dest, void *src, size_t size); +static nokprobe_inline int +fetch_store_symstrlen(unsigned long addr) +{ + char namebuf[KSYM_SYMBOL_LEN]; + int ret; + + ret = sprint_symbol(namebuf, addr); + if (ret < 0) + return 0; + + return ret + 1; +} + +/* + * Fetch a null-terminated symbol string + offset. Caller MUST set *(u32 *)buf + * with max length and relative data location. + */ +static nokprobe_inline int +fetch_store_symstring(unsigned long addr, void *dest, void *base) +{ + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + return sprint_symbol(__dest, addr); +} + /* From the 2nd stage, routine is same */ static nokprobe_inline int process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val, @@ -99,16 +130,22 @@ stage2: stage3: /* 3rd stage: store value to buffer */ if (unlikely(!dest)) { - if (code->op == FETCH_OP_ST_STRING) { + switch (code->op) { + case FETCH_OP_ST_STRING: ret = fetch_store_strlen(val + code->offset); code++; goto array; - } else if (code->op == FETCH_OP_ST_USTRING) { + case FETCH_OP_ST_USTRING: ret += fetch_store_strlen_user(val + code->offset); code++; goto array; - } else + case FETCH_OP_ST_SYMSTR: + ret += fetch_store_symstrlen(val + code->offset); + code++; + goto array; + default: return -EILSEQ; + } } switch (code->op) { @@ -129,6 +166,10 @@ stage3: loc = *(u32 *)dest; ret = fetch_store_string_user(val + code->offset, dest, base); break; + case FETCH_OP_ST_SYMSTR: + loc = *(u32 *)dest; + ret = fetch_store_symstring(val + code->offset, dest, base); + break; default: return -EILSEQ; } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a2d301f58ced..ff0536cea968 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -785,7 +785,14 @@ static struct fgraph_ops fgraph_ops __initdata = { }; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS -noinline __noclone static void trace_direct_tramp(void) { } +#ifndef CALL_DEPTH_ACCOUNT +#define CALL_DEPTH_ACCOUNT "" +#endif + +noinline __noclone static void trace_direct_tramp(void) +{ + asm(CALL_DEPTH_ACCOUNT); +} #endif /* diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index fb58e86dd117..8d64b6553aed 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -691,7 +691,8 @@ static int __trace_uprobe_create(int argc, const char **argv) for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { trace_probe_log_set_index(i + 2); ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], - is_return ? TPARG_FL_RETURN : 0); + (is_return ? TPARG_FL_RETURN : 0) | + TPARG_FL_USER); if (ret) goto error; } |