From 78a01feb4024ffb6c6321e45dc2bfcafb2d1d1e5 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Tue, 25 Oct 2022 15:39:23 +0000 Subject: ftrace: Clean comments related to FTRACE_OPS_FL_PER_CPU Commit b3a88803ac5b ("ftrace: Kill FTRACE_OPS_FL_PER_CPU") didn't completely remove the comments related to FTRACE_OPS_FL_PER_CPU. Link: https://lkml.kernel.org/r/20221025153923.1995973-1-zhengyejian1@huawei.com Fixes: b3a88803ac5b ("ftrace: Kill FTRACE_OPS_FL_PER_CPU") Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 33236241f236..65a5d36463e0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -163,7 +163,7 @@ static void ftrace_sync_ipi(void *data) static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) { /* - * If this is a dynamic, RCU, or per CPU ops, or we force list func, + * If this is a dynamic or RCU ops, or we force list func, * then it needs to call the list anyway. */ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_RCU) || @@ -3071,8 +3071,6 @@ out: /* * Dynamic ops may be freed, we must make sure that all * callers are done before leaving this function. - * The same goes for freeing the per_cpu data of the per_cpu - * ops. */ if (ops->flags & FTRACE_OPS_FL_DYNAMIC) { /* @@ -7519,8 +7517,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, /* * Check the following for each ops before calling their func: * if RCU flag is set, then rcu_is_watching() must be true - * if PER_CPU is set, then ftrace_function_local_disable() - * must be false * Otherwise test if the ip matches the ops filter * * If any of the above fails then the op->func() is not executed. @@ -7570,8 +7566,8 @@ NOKPROBE_SYMBOL(arch_ftrace_ops_list_func); /* * If there's only one function registered but it does not support - * recursion, needs RCU protection and/or requires per cpu handling, then - * this function will be called by the mcount trampoline. + * recursion, needs RCU protection, then this function will be called + * by the mcount trampoline. */ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) -- cgit v1.2.3 From a01fdc897fa56ffb596d0a0cd7ea2ab3bd8398c5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 20 Oct 2022 21:00:56 -0400 Subject: tracing: Add trace_trigger kernel command line option Allow triggers to be enabled at kernel boot up. For example: trace_trigger="sched_switch.stacktrace if prev_state == 2" The above will enable the stacktrace trigger on top of the sched_switch event and only trigger if its prev_state is 2 (TASK_UNINTERRUPTIBLE). Then at boot up, a stacktrace will trigger and be recorded in the tracing ring buffer every time the sched_switch happens where the previous state is TASK_INTERRUPTIBLE. Another useful trigger would be "traceoff" which can stop tracing on an event if a field of the event matches a certain value defined by the filter ("if" statement). Link: https://lore.kernel.org/linux-trace-kernel/20221020210056.0d8d0a5b@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- Documentation/admin-guide/kernel-parameters.txt | 19 +++++++ kernel/trace/trace_events.c | 72 ++++++++++++++++++++++++- 2 files changed, 89 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a465d5242774..ccf91a4bf113 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6257,6 +6257,25 @@ See also Documentation/trace/ftrace.rst "trace options" section. + trace_trigger=[trigger-list] + [FTRACE] Add a event trigger on specific events. + Set a trigger on top of a specific event, with an optional + filter. + + The format is is "trace_trigger=.[ if ],..." + Where more than one trigger may be specified that are comma deliminated. + + For example: + + trace_trigger="sched_switch.stacktrace if prev_state == 2" + + The above will enable the "stacktrace" trigger on the "sched_switch" + event but only trigger it if the "prev_state" of the "sched_switch" + event is "2" (TASK_UNINTERUPTIBLE). + + See also "Event triggers" in Documentation/trace/events.rst + + traceoff_on_warning [FTRACE] enable this option to disable tracing when a warning is hit. This turns off "tracing_on". Tracing can diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f71ea6e79b3c..3bfaf560ecc4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2796,6 +2796,44 @@ trace_create_new_event(struct trace_event_call *call, return file; } +#ifdef CONFIG_HIST_TRIGGERS +#define MAX_BOOT_TRIGGERS 32 + +static struct boot_triggers { + const char *event; + char *trigger; +} bootup_triggers[MAX_BOOT_TRIGGERS]; + +static char bootup_trigger_buf[COMMAND_LINE_SIZE]; +static int nr_boot_triggers; + +static __init int setup_trace_triggers(char *str) +{ + char *trigger; + char *buf; + int i; + + strlcpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE); + ring_buffer_expanded = true; + disable_tracing_selftest("running event triggers"); + + buf = bootup_trigger_buf; + for (i = 0; i < MAX_BOOT_TRIGGERS; i++) { + trigger = strsep(&buf, ","); + if (!trigger) + break; + bootup_triggers[i].event = strsep(&trigger, "."); + bootup_triggers[i].trigger = strsep(&trigger, "."); + if (!bootup_triggers[i].trigger) + break; + } + + nr_boot_triggers = i; + return 1; +} +__setup("trace_trigger=", setup_trace_triggers); +#endif + /* Add an event to a trace directory */ static int __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) @@ -2812,6 +2850,28 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) return event_define_fields(call); } +#ifdef CONFIG_HIST_TRIGGERS +static void trace_early_triggers(struct trace_event_file *file, const char *name) +{ + int ret; + int i; + + for (i = 0; i < nr_boot_triggers; i++) { + if (strcmp(name, bootup_triggers[i].event)) + continue; + mutex_lock(&event_mutex); + ret = trigger_process_regex(file, bootup_triggers[i].trigger); + mutex_unlock(&event_mutex); + if (ret) + pr_err("Failed to register trigger '%s' on event %s\n", + bootup_triggers[i].trigger, + bootup_triggers[i].event); + } +} +#else +static inline void trace_early_triggers(struct trace_event_file *file, const char *name) { } +#endif + /* * Just create a descriptor for early init. A descriptor is required * for enabling events at boot. We want to enable events before @@ -2822,12 +2882,19 @@ __trace_early_add_new_event(struct trace_event_call *call, struct trace_array *tr) { struct trace_event_file *file; + int ret; file = trace_create_new_event(call, tr); if (!file) return -ENOMEM; - return event_define_fields(call); + ret = event_define_fields(call); + if (ret) + return ret; + + trace_early_triggers(file, trace_event_name(call)); + + return 0; } struct ftrace_module_file_ops; @@ -3735,6 +3802,8 @@ static __init int event_trace_enable(void) list_add(&call->list, &ftrace_events); } + register_trigger_cmds(); + /* * We need the top trace array to have a working set of trace * points at early init, before the debug files and directories @@ -3749,7 +3818,6 @@ static __init int event_trace_enable(void) register_event_cmds(); - register_trigger_cmds(); return 0; } -- cgit v1.2.3 From 04aabc32fb677f91d676fd306bca1043805e78d5 Mon Sep 17 00:00:00 2001 From: Song Chen Date: Thu, 20 Oct 2022 22:06:51 +0800 Subject: ring_buffer: Remove unused "event" parameter After commit a389d86f7fd0 ("ring-buffer: Have nested events still record running time stamp"), the "event" parameter is no longer used in either ring_buffer_unlock_commit() or rb_commit(). Best to remove it. Link: https://lkml.kernel.org/r/1666274811-24138-1-git-send-email-chensong_2000@189.cn Signed-off-by: Song Chen Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 3 +-- kernel/trace/ring_buffer.c | 12 +++++------- kernel/trace/ring_buffer_benchmark.c | 2 +- kernel/trace/trace.c | 2 +- 4 files changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 3c7d295746f6..782e14f62201 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -113,8 +113,7 @@ void ring_buffer_change_overwrite(struct trace_buffer *buffer, int val); struct ring_buffer_event *ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length); -int ring_buffer_unlock_commit(struct trace_buffer *buffer, - struct ring_buffer_event *event); +int ring_buffer_unlock_commit(struct trace_buffer *buffer); int ring_buffer_write(struct trace_buffer *buffer, unsigned long length, void *data); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b21bf14bae9b..843818ee4814 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3180,8 +3180,7 @@ static inline void rb_event_discard(struct ring_buffer_event *event) event->time_delta = 1; } -static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer) { local_inc(&cpu_buffer->entries); rb_end_commit(cpu_buffer); @@ -3383,15 +3382,14 @@ void ring_buffer_nest_end(struct trace_buffer *buffer) * * Must be paired with ring_buffer_lock_reserve. */ -int ring_buffer_unlock_commit(struct trace_buffer *buffer, - struct ring_buffer_event *event) +int ring_buffer_unlock_commit(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu = raw_smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; - rb_commit(cpu_buffer, event); + rb_commit(cpu_buffer); rb_wakeups(buffer, cpu_buffer); @@ -3977,7 +3975,7 @@ int ring_buffer_write(struct trace_buffer *buffer, memcpy(body, data, length); - rb_commit(cpu_buffer, event); + rb_commit(cpu_buffer); rb_wakeups(buffer, cpu_buffer); @@ -5998,7 +5996,7 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested) } out: - ring_buffer_unlock_commit(data->buffer, event); + ring_buffer_unlock_commit(data->buffer); return 0; } diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 78e576575b79..aef34673d79d 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -258,7 +258,7 @@ static void ring_buffer_producer(void) hit++; entry = ring_buffer_event_data(event); *entry = smp_processor_id(); - ring_buffer_unlock_commit(buffer, event); + ring_buffer_unlock_commit(buffer); } } end_time = ktime_get(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5cfc95a52bc3..5c97dbef741b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -999,7 +999,7 @@ __buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *ev /* ring_buffer_unlock_commit() enables preemption */ preempt_enable_notrace(); } else - ring_buffer_unlock_commit(buffer, event); + ring_buffer_unlock_commit(buffer); } /** -- cgit v1.2.3 From b179d48b6aab21f3999f5006685ea4254c0618a9 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Thu, 17 Nov 2022 14:46:18 +0100 Subject: tracing/osnoise: Add osnoise/options file Add the tracing/osnoise/options file to control osnoise/timerlat tracer features. It is a single file to contain multiple features, similar to the sched/features file. Reading the file displays a list of options. Writing the OPTION_NAME enables it, writing NO_OPTION_NAME disables it. The DEAFULTS is a particular option that resets the options to the default ones. It uses a bitmask to keep track of the status of the option. When needed, we can add a list of static keys, but for now it does not justify the memory increase. Link: https://lkml.kernel.org/r/f8d34aefdb225d2603fcb4c02a120832a0cd3339.1668692096.git.bristot@kernel.org Cc: Daniel Bristot de Oliveira Cc: Masami Hiramatsu Cc: Jonathan Corbet Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 170 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 4300c5dc4e5d..17b77fe3950b 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -48,6 +48,19 @@ #define DEFAULT_TIMERLAT_PERIOD 1000 /* 1ms */ #define DEFAULT_TIMERLAT_PRIO 95 /* FIFO 95 */ +/* + * osnoise/options entries. + */ +enum osnoise_options_index { + OSN_DEFAULTS = 0, + OSN_MAX +}; + +static const char * const osnoise_options_str[OSN_MAX] = { "DEFAULTS" }; + +#define OSN_DEFAULT_OPTIONS 0 +unsigned long osnoise_options = OSN_DEFAULT_OPTIONS; + /* * trace_array of the enabled osnoise/timerlat instances. */ @@ -1860,6 +1873,150 @@ static void osnoise_init_hotplug_support(void) } #endif /* CONFIG_HOTPLUG_CPU */ +/* + * seq file functions for the osnoise/options file. + */ +static void *s_options_start(struct seq_file *s, loff_t *pos) +{ + int option = *pos; + + mutex_lock(&interface_lock); + + if (option >= OSN_MAX) + return NULL; + + return pos; +} + +static void *s_options_next(struct seq_file *s, void *v, loff_t *pos) +{ + int option = ++(*pos); + + if (option >= OSN_MAX) + return NULL; + + return pos; +} + +static int s_options_show(struct seq_file *s, void *v) +{ + loff_t *pos = v; + int option = *pos; + + if (option == OSN_DEFAULTS) { + if (osnoise_options == OSN_DEFAULT_OPTIONS) + seq_printf(s, "%s", osnoise_options_str[option]); + else + seq_printf(s, "NO_%s", osnoise_options_str[option]); + goto out; + } + + if (test_bit(option, &osnoise_options)) + seq_printf(s, "%s", osnoise_options_str[option]); + else + seq_printf(s, "NO_%s", osnoise_options_str[option]); + +out: + if (option != OSN_MAX) + seq_puts(s, " "); + + return 0; +} + +static void s_options_stop(struct seq_file *s, void *v) +{ + seq_puts(s, "\n"); + mutex_unlock(&interface_lock); +} + +static const struct seq_operations osnoise_options_seq_ops = { + .start = s_options_start, + .next = s_options_next, + .show = s_options_show, + .stop = s_options_stop +}; + +static int osnoise_options_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &osnoise_options_seq_ops); +}; + +/** + * osnoise_options_write - Write function for "options" entry + * @filp: The active open file structure + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in @file + * + * Writing the option name sets the option, writing the "NO_" + * prefix in front of the option name disables it. + * + * Writing "DEFAULTS" resets the option values to the default ones. + */ +static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + int running, option, enable, retval; + char buf[256], *option_str; + + if (cnt >= 256) + return -EINVAL; + + if (copy_from_user(buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (strncmp(buf, "NO_", 3)) { + option_str = strstrip(buf); + enable = true; + } else { + option_str = strstrip(&buf[3]); + enable = false; + } + + option = match_string(osnoise_options_str, OSN_MAX, option_str); + if (option < 0) + return -EINVAL; + + /* + * trace_types_lock is taken to avoid concurrency on start/stop. + */ + mutex_lock(&trace_types_lock); + running = osnoise_has_registered_instances(); + if (running) + stop_per_cpu_kthreads(); + + mutex_lock(&interface_lock); + /* + * avoid CPU hotplug operations that might read options. + */ + cpus_read_lock(); + + retval = cnt; + + if (enable) { + if (option == OSN_DEFAULTS) + osnoise_options = OSN_DEFAULT_OPTIONS; + else + set_bit(option, &osnoise_options); + } else { + if (option == OSN_DEFAULTS) + retval = -EINVAL; + else + clear_bit(option, &osnoise_options); + } + + cpus_read_unlock(); + mutex_unlock(&interface_lock); + + if (running) + start_per_cpu_kthreads(); + mutex_unlock(&trace_types_lock); + + return retval; +} + /* * osnoise_cpus_read - Read function for reading the "cpus" file * @filp: The active open file structure @@ -2042,6 +2199,14 @@ static const struct file_operations cpus_fops = { .llseek = generic_file_llseek, }; +static const struct file_operations osnoise_options_fops = { + .open = osnoise_options_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = osnoise_options_write +}; + #ifdef CONFIG_TIMERLAT_TRACER #ifdef CONFIG_STACKTRACE static int init_timerlat_stack_tracefs(struct dentry *top_dir) @@ -2128,6 +2293,11 @@ static int init_tracefs(void) if (!tmp) goto err; + tmp = trace_create_file("options", TRACE_MODE_WRITE, top_dir, NULL, + &osnoise_options_fops); + if (!tmp) + goto err; + ret = init_timerlat_tracefs(top_dir); if (ret) goto err; -- cgit v1.2.3 From 30838fcd81078d078b10209bc18a6357ba4dd5fa Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Thu, 17 Nov 2022 14:46:19 +0100 Subject: tracing/osnoise: Add OSNOISE_WORKLOAD option The osnoise tracer is not only a tracer, and a set of tracepoints, but also a workload dispatcher. In preparation for having other workloads, e.g., in user-space, add an option to avoid dispatching the workload. By not dispatching the workload, the osnoise: tracepoints become generic events to measure the execution time of *any* task on Linux. For example: # cd /sys/kernel/tracing/ # cat osnoise/options DEFAULTS OSNOISE_WORKLOAD # echo NO_OSNOISE_WORKLOAD > osnoise/options # cat osnoise/options NO_DEFAULTS NO_OSNOISE_WORKLOAD # echo osnoise > set_event # echo osnoise > current_tracer # tail -8 trace make-94722 [002] d..3. 1371.794507: thread_noise: make:94722 start 1371.794302286 duration 200897 ns sh-121042 [020] d..3. 1371.794534: thread_noise: sh:121042 start 1371.781610976 duration 8943683 ns make-121097 [005] d..3. 1371.794542: thread_noise: make:121097 start 1371.794481522 duration 60444 ns <...>-40 [005] d..3. 1371.794550: thread_noise: migration/5:40 start 1371.794542256 duration 7154 ns -0 [018] dNh2. 1371.794554: irq_noise: reschedule:253 start 1371.794553547 duration 40 ns -0 [018] dNh2. 1371.794561: irq_noise: local_timer:236 start 1371.794556222 duration 4890 ns -0 [018] .Ns2. 1371.794563: softirq_noise: SCHED:7 start 1371.794561803 duration 992 ns -0 [018] d..3. 1371.794566: thread_noise: swapper/18:0 start 1371.781368110 duration 13191798 ns In preparation for the rtla exec_time tracer/tool and rtla osnoise --user option. Link: https://lkml.kernel.org/r/f5cfbd37aefd419eefe9243b4d2fc38ed5753fe4.1668692096.git.bristot@kernel.org Cc: Daniel Bristot de Oliveira Cc: Masami Hiramatsu Cc: Jonathan Corbet Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 17b77fe3950b..3f10dd1f2f1c 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -53,12 +53,13 @@ */ enum osnoise_options_index { OSN_DEFAULTS = 0, + OSN_WORKLOAD, OSN_MAX }; -static const char * const osnoise_options_str[OSN_MAX] = { "DEFAULTS" }; +static const char * const osnoise_options_str[OSN_MAX] = { "DEFAULTS", "OSNOISE_WORKLOAD" }; -#define OSN_DEFAULT_OPTIONS 0 +#define OSN_DEFAULT_OPTIONS 0x2 unsigned long osnoise_options = OSN_DEFAULT_OPTIONS; /* @@ -1186,11 +1187,12 @@ trace_sched_switch_callback(void *data, bool preempt, unsigned int prev_state) { struct osnoise_variables *osn_var = this_cpu_osn_var(); + int workload = test_bit(OSN_WORKLOAD, &osnoise_options); - if (p->pid != osn_var->pid) + if ((p->pid != osn_var->pid) || !workload) thread_exit(osn_var, p); - if (n->pid != osn_var->pid) + if ((n->pid != osn_var->pid) || !workload) thread_entry(osn_var, n); } @@ -1723,9 +1725,16 @@ static void stop_kthread(unsigned int cpu) struct task_struct *kthread; kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; - if (kthread) + if (kthread) { kthread_stop(kthread); - per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + } else { + if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + per_cpu(per_cpu_osnoise_var, cpu).sampling = false; + barrier(); + return; + } + } } /* @@ -1759,6 +1768,13 @@ static int start_kthread(unsigned int cpu) snprintf(comm, 24, "timerlat/%d", cpu); main = timerlat_main; } else { + /* if no workload, just return */ + if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + per_cpu(per_cpu_osnoise_var, cpu).sampling = true; + barrier(); + return 0; + } + snprintf(comm, 24, "osnoise/%d", cpu); } -- cgit v1.2.3 From 9430cd62b6ccdf1f06915cc06561f0e364809604 Mon Sep 17 00:00:00 2001 From: Chuang Wang Date: Mon, 21 Nov 2022 16:08:31 +0800 Subject: tracing/perf: Use strndup_user instead of kzalloc/strncpy_from_user This patch uses strndup_user instead of kzalloc + strncpy_from_user, which makes the code more concise. Link: https://lkml.kernel.org/r/20221121080831.707409-1-nashuiliang@gmail.com Signed-off-by: Chuang Wang Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_event_perf.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 61e3a2620fa3..05e791241812 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -251,16 +251,12 @@ int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) struct trace_event_call *tp_event; if (p_event->attr.kprobe_func) { - func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL); - if (!func) - return -ENOMEM; - ret = strncpy_from_user( - func, u64_to_user_ptr(p_event->attr.kprobe_func), - KSYM_NAME_LEN); - if (ret == KSYM_NAME_LEN) - ret = -E2BIG; - if (ret < 0) - goto out; + func = strndup_user(u64_to_user_ptr(p_event->attr.kprobe_func), + KSYM_NAME_LEN); + if (IS_ERR(func)) { + ret = PTR_ERR(func); + return (ret == -EINVAL) ? -E2BIG : ret; + } if (func[0] == '\0') { kfree(func); -- cgit v1.2.3 From a76d4648a0bbd624a1322c15073b5032a5135f01 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Tue, 22 Nov 2022 17:14:56 +0800 Subject: tracing: Make tracepoint_print_iter static After change in commit 4239174570da ("tracing: Make tracepoint_printk a static_key"), this symbol is not used outside of the file, so mark it static. Link: https://lkml.kernel.org/r/20221122091456.72055-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5c97dbef741b..93a75a97118f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -85,7 +85,7 @@ void __init disable_tracing_selftest(const char *reason) #endif /* Pipe tracepoints to printk */ -struct trace_iterator *tracepoint_print_iter; +static struct trace_iterator *tracepoint_print_iter; int tracepoint_printk; static bool tracepoint_printk_stop_on_boot __initdata; static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d42e24507152..48643f07bc01 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1942,8 +1942,6 @@ static inline void tracer_hardirqs_on(unsigned long a0, unsigned long a1) { } static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { } #endif -extern struct trace_iterator *tracepoint_print_iter; - /* * Reset the state of the trace_iterator so that it can read consumed data. * Normally, the trace_iterator is used for reading the data when it is not -- cgit v1.2.3 From 96e6122cb79616c622ae0d025eb9f981120b568d Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Thu, 10 Nov 2022 10:03:19 +0800 Subject: tracing: Optimize event type allocation with IDA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit 060fa5c83e67 ("tracing/events: reuse trace event ids after overflow"), trace events with dynamic type are linked up in list 'ftrace_event_list' through field 'trace_event.list'. Then when max event type number used up, it's possible to reuse type number of some freed one by traversing 'ftrace_event_list'. As instead, using IDA to manage available type numbers can make codes simpler and then the field 'trace_event.list' can be dropped. Since 'struct trace_event' is used in static tracepoints, drop 'trace_event.list' can make vmlinux smaller. Local test with about 2000 tracepoints, vmlinux reduced about 64KB: before:-rwxrwxr-x 1 root root 76669448 Nov 8 17:14 vmlinux after: -rwxrwxr-x 1 root root 76604176 Nov 8 17:15 vmlinux Link: https://lkml.kernel.org/r/20221110020319.1259291-1-zhengyejian1@huawei.com Signed-off-by: Zheng Yejian Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 1 - kernel/trace/trace_output.c | 66 +++++++++++--------------------------------- 2 files changed, 16 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 20749bd9db71..bb2053246d6a 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -136,7 +136,6 @@ struct trace_event_functions { struct trace_event { struct hlist_node node; - struct list_head list; int type; struct trace_event_functions *funcs; }; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 67f47ea27921..f0ba97121345 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "trace_output.h" @@ -21,8 +22,6 @@ DECLARE_RWSEM(trace_event_sem); static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; -static int next_event_type = __TRACE_LAST_TYPE; - enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -688,38 +687,23 @@ struct trace_event *ftrace_find_event(int type) return NULL; } -static LIST_HEAD(ftrace_event_list); +static DEFINE_IDA(trace_event_ida); -static int trace_search_list(struct list_head **list) +static void free_trace_event_type(int type) { - struct trace_event *e = NULL, *iter; - int next = __TRACE_LAST_TYPE; - - if (list_empty(&ftrace_event_list)) { - *list = &ftrace_event_list; - return next; - } + if (type >= __TRACE_LAST_TYPE) + ida_free(&trace_event_ida, type); +} - /* - * We used up all possible max events, - * lets see if somebody freed one. - */ - list_for_each_entry(iter, &ftrace_event_list, list) { - if (iter->type != next) { - e = iter; - break; - } - next++; - } +static int alloc_trace_event_type(void) +{ + int next; - /* Did we used up all 65 thousand events??? */ - if (next > TRACE_EVENT_TYPE_MAX) + /* Skip static defined type numbers */ + next = ida_alloc_range(&trace_event_ida, __TRACE_LAST_TYPE, + TRACE_EVENT_TYPE_MAX, GFP_KERNEL); + if (next < 0) return 0; - - if (e) - *list = &e->list; - else - *list = &ftrace_event_list; return next; } @@ -761,28 +745,10 @@ int register_trace_event(struct trace_event *event) if (WARN_ON(!event->funcs)) goto out; - INIT_LIST_HEAD(&event->list); - if (!event->type) { - struct list_head *list = NULL; - - if (next_event_type > TRACE_EVENT_TYPE_MAX) { - - event->type = trace_search_list(&list); - if (!event->type) - goto out; - - } else { - - event->type = next_event_type++; - list = &ftrace_event_list; - } - - if (WARN_ON(ftrace_find_event(event->type))) + event->type = alloc_trace_event_type(); + if (!event->type) goto out; - - list_add_tail(&event->list, list); - } else if (WARN(event->type > __TRACE_LAST_TYPE, "Need to add type to trace.h")) { goto out; @@ -819,7 +785,7 @@ EXPORT_SYMBOL_GPL(register_trace_event); int __unregister_trace_event(struct trace_event *event) { hlist_del(&event->node); - list_del(&event->list); + free_trace_event_type(event->type); return 0; } -- cgit v1.2.3 From bd604f3db49c5b21171abea0414a2020dcbf2646 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 22 Nov 2022 18:09:05 -0500 Subject: ftrace: Avoid needless updates of the ftrace function call Song Shuai reported: The list func (ftrace_ops_list_func) will be patched first before the transition between old and new calls are set, which fixed the race described in this commit `59338f75`. While ftrace_trace_function changes from the list func to a ftrace_ops func, like unregistering the klp_ops to leave the only global_ops in ftrace_ops_list, the ftrace_[regs]_call will be replaced with the list func although it already exists. So there should be a condition to avoid this. And suggested using another variable to keep track of what the ftrace function is set to. But this could be simplified by using a helper function that does the same with a static variable. Link: https://lore.kernel.org/lkml/20221026132039.2236233-1-suagrfillet@gmail.com/ Link: https://lore.kernel.org/linux-trace-kernel/20221122180905.737b6f52@gandalf.local.home Reported-by: Song Shuai Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 65a5d36463e0..d04552c0c275 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2763,6 +2763,19 @@ void __weak ftrace_arch_code_modify_post_process(void) { } +static int update_ftrace_func(ftrace_func_t func) +{ + static ftrace_func_t save_func; + + /* Avoid updating if it hasn't changed */ + if (func == save_func) + return 0; + + save_func = func; + + return ftrace_update_ftrace_func(func); +} + void ftrace_modify_all_code(int command) { int update = command & FTRACE_UPDATE_TRACE_FUNC; @@ -2783,7 +2796,7 @@ void ftrace_modify_all_code(int command) * traced. */ if (update) { - err = ftrace_update_ftrace_func(ftrace_ops_list_func); + err = update_ftrace_func(ftrace_ops_list_func); if (FTRACE_WARN_ON(err)) return; } @@ -2799,7 +2812,7 @@ void ftrace_modify_all_code(int command) /* If irqs are disabled, we are in stop machine */ if (!irqs_disabled()) smp_call_function(ftrace_sync_ipi, NULL, 1); - err = ftrace_update_ftrace_func(ftrace_trace_function); + err = update_ftrace_func(ftrace_trace_function); if (FTRACE_WARN_ON(err)) return; } -- cgit v1.2.3 From 4bded7af8b9af6e97514b0521004f90267905aef Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Wed, 23 Nov 2022 10:32:48 -0800 Subject: tracing/user_events: Fix call print_fmt leak If user_event_trace_register() fails within user_event_parse() the call's print_fmt member is not freed. Add kfree call to fix this. Link: https://lkml.kernel.org/r/20221123183248.554-1-beaub@linux.microsoft.com Fixes: aa3b2b4c6692 ("user_events: Add print_fmt generation support for basic types") Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 539b08ae7020..9cb53182bb31 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1359,6 +1359,7 @@ put_user_lock: put_user: user_event_destroy_fields(user); user_event_destroy_validators(user); + kfree(user->call.print_fmt); kfree(user); return ret; } -- cgit v1.2.3 From 5f2e094ed2592abb3d1f49e263957188e121c2ac Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 20 Oct 2022 00:31:55 +0900 Subject: tracing: Allow multiple hitcount values in histograms The hitcount is treated specially in the histograms - since it's always expected to be there regardless of whether the user specified anything or not, it's always added as the first histogram value. Currently the code doesn't allow it to be added more than once as a value, which is inconsistent with all the other possible values. It would seem to be a pointless thing to want to do, but other features being added such as percent and graph modifiers don't work properly with the current hitcount restrictions. Fix this by allowing multiple hitcounts to be added. Link: https://lore.kernel.org/linux-trace-kernel/166610812248.56030.16754785928712505251.stgit@devnote2 Signed-off-by: Tom Zanussi Signed-off-by: Masami Hiramatsu (Google) Tested-by: Tom Zanussi --- kernel/trace/trace_events_hist.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1c82478e8dff..31d58ddcc1d9 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1356,6 +1356,8 @@ static const char *hist_field_name(struct hist_field *field, field_name = field->name; } else if (field->flags & HIST_FIELD_FL_TIMESTAMP) field_name = "common_timestamp"; + else if (field->flags & HIST_FIELD_FL_HITCOUNT) + field_name = "hitcount"; if (field_name == NULL) field_name = ""; @@ -2328,6 +2330,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, hist_data->attrs->ts_in_usecs = true; } else if (strcmp(field_name, "common_cpu") == 0) *flags |= HIST_FIELD_FL_CPU; + else if (strcmp(field_name, "hitcount") == 0) + *flags |= HIST_FIELD_FL_HITCOUNT; else { field = trace_find_event_field(file->event_call, field_name); if (!field || !field->size) { @@ -4328,8 +4332,8 @@ static int create_var_field(struct hist_trigger_data *hist_data, static int create_val_fields(struct hist_trigger_data *hist_data, struct trace_event_file *file) { + unsigned int i, j = 1, n_hitcount = 0; char *fields_str, *field_str; - unsigned int i, j = 1; int ret; ret = create_hitcount_val(hist_data); @@ -4346,8 +4350,10 @@ static int create_val_fields(struct hist_trigger_data *hist_data, if (!field_str) break; - if (strcmp(field_str, "hitcount") == 0) - continue; + if (strcmp(field_str, "hitcount") == 0) { + if (!n_hitcount++) + continue; + } ret = create_val_field(hist_data, j++, file, field_str); if (ret) -- cgit v1.2.3 From abaa5258ce5e5887a9de049f50a85dc023391a1c Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 20 Oct 2022 00:31:55 +0900 Subject: tracing: Add .percent suffix option to histogram values Add .percent suffix option to show the histogram values in percentage. This feature is useful when we need yo undersntand the overall trend for the histograms of large values. E.g. this shows the runtime percentage for each tasks. ------ # cd /sys/kernel/debug/tracing/ # echo hist:keys=pid:vals=hitcount,runtime.percent:sort=pid > \ events/sched/sched_stat_runtime/trigger # sleep 10 # cat events/sched/sched_stat_runtime/hist # event histogram # # trigger info: hist:keys=pid:vals=hitcount,runtime.percent:sort=pid:size=2048 [active] # { pid: 8 } hitcount: 7 runtime (%): 4.14 { pid: 14 } hitcount: 5 runtime (%): 3.69 { pid: 16 } hitcount: 11 runtime (%): 3.41 { pid: 61 } hitcount: 41 runtime (%): 19.75 { pid: 65 } hitcount: 4 runtime (%): 1.48 { pid: 70 } hitcount: 6 runtime (%): 3.60 { pid: 72 } hitcount: 2 runtime (%): 1.10 { pid: 144 } hitcount: 10 runtime (%): 32.01 { pid: 151 } hitcount: 8 runtime (%): 22.66 { pid: 152 } hitcount: 2 runtime (%): 8.10 Totals: Hits: 96 Entries: 10 Dropped: 0 ----- Link: https://lore.kernel.org/linux-trace-kernel/166610813077.56030.4238090506973562347.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi --- kernel/trace/trace.c | 3 +- kernel/trace/trace_events_hist.c | 90 +++++++++++++++++++++++++++++++++------- 2 files changed, 78 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 93a75a97118f..08e9568849b1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5724,7 +5724,8 @@ static const char readme_msg[] = "\t .syscall display a syscall id as a syscall name\n" "\t .log2 display log2 value rather than raw number\n" "\t .buckets=size display values in groups of size rather than raw number\n" - "\t .usecs display a common_timestamp in microseconds\n\n" + "\t .usecs display a common_timestamp in microseconds\n" + "\t .percent display a number of percentage value\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 'continue' can be used to start or\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 31d58ddcc1d9..35b0e956f06e 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -506,6 +506,7 @@ enum hist_field_flags { HIST_FIELD_FL_ALIAS = 1 << 16, HIST_FIELD_FL_BUCKET = 1 << 17, HIST_FIELD_FL_CONST = 1 << 18, + HIST_FIELD_FL_PERCENT = 1 << 19, }; struct var_defs { @@ -1707,6 +1708,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) flags_str = "buckets"; else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS) flags_str = "usecs"; + else if (hist_field->flags & HIST_FIELD_FL_PERCENT) + flags_str = "percent"; return flags_str; } @@ -2315,6 +2318,10 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, if (ret || !(*buckets)) goto error; *flags |= HIST_FIELD_FL_BUCKET; + } else if (strncmp(modifier, "percent", 7) == 0) { + if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY)) + goto error; + *flags |= HIST_FIELD_FL_PERCENT; } else { error: hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); @@ -5291,33 +5298,69 @@ static void hist_trigger_print_key(struct seq_file *m, seq_puts(m, "}"); } +/* Get the 100 times of the percentage of @val in @total */ +static inline unsigned int __get_percentage(u64 val, u64 total) +{ + if (!total) + goto div0; + + if (val < (U64_MAX / 10000)) + return (unsigned int)div64_ul(val * 10000, total); + + total = div64_u64(total, 10000); + if (!total) + goto div0; + + return (unsigned int)div64_ul(val, total); +div0: + return val ? UINT_MAX : 0; +} + +static void hist_trigger_print_val(struct seq_file *m, unsigned int idx, + const char *field_name, unsigned long flags, + u64 *totals, struct tracing_map_elt *elt) +{ + u64 val = tracing_map_read_sum(elt, idx); + unsigned int pc; + + if (flags & HIST_FIELD_FL_PERCENT) { + pc = __get_percentage(val, totals[idx]); + if (pc == UINT_MAX) + seq_printf(m, " %s (%%):[ERROR]", field_name); + else + seq_printf(m, " %s (%%): %3u.%02u", field_name, + pc / 100, pc % 100); + } else if (flags & HIST_FIELD_FL_HEX) { + seq_printf(m, " %s: %10llx", field_name, val); + } else { + seq_printf(m, " %s: %10llu", field_name, val); + } +} + static void hist_trigger_entry_print(struct seq_file *m, struct hist_trigger_data *hist_data, + u64 *totals, void *key, struct tracing_map_elt *elt) { const char *field_name; - unsigned int i; + unsigned int i = HITCOUNT_IDX; + unsigned long flags; hist_trigger_print_key(m, hist_data, key, elt); - seq_printf(m, " hitcount: %10llu", - tracing_map_read_sum(elt, HITCOUNT_IDX)); + /* At first, show the raw hitcount always */ + hist_trigger_print_val(m, i, "hitcount", 0, totals, elt); for (i = 1; i < hist_data->n_vals; i++) { field_name = hist_field_name(hist_data->fields[i], 0); + flags = hist_data->fields[i]->flags; - if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR || - hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR) + if (flags & HIST_FIELD_FL_VAR || flags & HIST_FIELD_FL_EXPR) continue; - if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) { - seq_printf(m, " %s: %10llx", field_name, - tracing_map_read_sum(elt, i)); - } else { - seq_printf(m, " %s: %10llu", field_name, - tracing_map_read_sum(elt, i)); - } + seq_puts(m, " "); + hist_trigger_print_val(m, i, field_name, flags, totals, elt); } print_actions(m, hist_data, elt); @@ -5330,7 +5373,8 @@ static int print_entries(struct seq_file *m, { struct tracing_map_sort_entry **sort_entries = NULL; struct tracing_map *map = hist_data->map; - int i, n_entries; + int i, j, n_entries; + u64 *totals = NULL; n_entries = tracing_map_sort_entries(map, hist_data->sort_keys, hist_data->n_sort_keys, @@ -5338,11 +5382,29 @@ static int print_entries(struct seq_file *m, if (n_entries < 0) return n_entries; + for (j = 0; j < hist_data->n_vals; j++) { + if (!(hist_data->fields[j]->flags & HIST_FIELD_FL_PERCENT)) + continue; + if (!totals) { + totals = kcalloc(hist_data->n_vals, sizeof(u64), + GFP_KERNEL); + if (!totals) { + n_entries = -ENOMEM; + goto out; + } + } + for (i = 0; i < n_entries; i++) + totals[j] += tracing_map_read_sum( + sort_entries[i]->elt, j); + } + for (i = 0; i < n_entries; i++) - hist_trigger_entry_print(m, hist_data, + hist_trigger_entry_print(m, hist_data, totals, sort_entries[i]->key, sort_entries[i]->elt); + kfree(totals); +out: tracing_map_destroy_sort_entries(sort_entries, n_entries); return n_entries; -- cgit v1.2.3 From a2c54256dec7510477e2b4f4db187e638f7cac37 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 20 Oct 2022 00:31:55 +0900 Subject: tracing: Add .graph suffix option to histogram value Add the .graph suffix which shows the bar graph of the histogram value. For example, the below example shows that the bar graph of the histogram of the runtime for each tasks. ------ # cd /sys/kernel/debug/tracing/ # echo hist:keys=pid:vals=runtime.graph:sort=pid > \ events/sched/sched_stat_runtime/trigger # sleep 10 # cat events/sched/sched_stat_runtime/hist # event histogram # # trigger info: hist:keys=pid:vals=hitcount,runtime.graph:sort=pid:size=2048 [active] # { pid: 14 } hitcount: 2 runtime: { pid: 16 } hitcount: 8 runtime: { pid: 26 } hitcount: 1 runtime: { pid: 57 } hitcount: 3 runtime: { pid: 61 } hitcount: 20 runtime: ### { pid: 66 } hitcount: 2 runtime: { pid: 70 } hitcount: 3 runtime: { pid: 72 } hitcount: 2 runtime: { pid: 145 } hitcount: 14 runtime: #################### { pid: 152 } hitcount: 5 runtime: ####### { pid: 153 } hitcount: 2 runtime: #### Totals: Hits: 62 Entries: 11 Dropped: 0 ------- Link: https://lore.kernel.org/linux-trace-kernel/166610813953.56030.10944148382315789485.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi --- kernel/trace/trace.c | 3 +- kernel/trace/trace_events_hist.c | 77 +++++++++++++++++++++++++++++++--------- 2 files changed, 63 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 08e9568849b1..55aec4616d8b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5725,7 +5725,8 @@ static const char readme_msg[] = "\t .log2 display log2 value rather than raw number\n" "\t .buckets=size display values in groups of size rather than raw number\n" "\t .usecs display a common_timestamp in microseconds\n" - "\t .percent display a number of percentage value\n\n" + "\t .percent display a number of percentage value\n" + "\t .graph display a bar-graph of a value\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 'continue' can be used to start or\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 35b0e956f06e..946b2b8f0f2c 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -507,6 +507,7 @@ enum hist_field_flags { HIST_FIELD_FL_BUCKET = 1 << 17, HIST_FIELD_FL_CONST = 1 << 18, HIST_FIELD_FL_PERCENT = 1 << 19, + HIST_FIELD_FL_GRAPH = 1 << 20, }; struct var_defs { @@ -1710,6 +1711,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) flags_str = "usecs"; else if (hist_field->flags & HIST_FIELD_FL_PERCENT) flags_str = "percent"; + else if (hist_field->flags & HIST_FIELD_FL_GRAPH) + flags_str = "graph"; return flags_str; } @@ -2322,6 +2325,10 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY)) goto error; *flags |= HIST_FIELD_FL_PERCENT; + } else if (strncmp(modifier, "graph", 5) == 0) { + if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY)) + goto error; + *flags |= HIST_FIELD_FL_GRAPH; } else { error: hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); @@ -5316,20 +5323,52 @@ div0: return val ? UINT_MAX : 0; } +#define BAR_CHAR '#' + +static inline const char *__fill_bar_str(char *buf, int size, u64 val, u64 max) +{ + unsigned int len = __get_percentage(val, max); + int i; + + if (len == UINT_MAX) { + snprintf(buf, size, "[ERROR]"); + return buf; + } + + len = len * size / 10000; + for (i = 0; i < len && i < size; i++) + buf[i] = BAR_CHAR; + while (i < size) + buf[i++] = ' '; + buf[size] = '\0'; + + return buf; +} + +struct hist_val_stat { + u64 max; + u64 total; +}; + static void hist_trigger_print_val(struct seq_file *m, unsigned int idx, const char *field_name, unsigned long flags, - u64 *totals, struct tracing_map_elt *elt) + struct hist_val_stat *stats, + struct tracing_map_elt *elt) { u64 val = tracing_map_read_sum(elt, idx); unsigned int pc; + char bar[21]; if (flags & HIST_FIELD_FL_PERCENT) { - pc = __get_percentage(val, totals[idx]); + pc = __get_percentage(val, stats[idx].total); if (pc == UINT_MAX) seq_printf(m, " %s (%%):[ERROR]", field_name); else seq_printf(m, " %s (%%): %3u.%02u", field_name, pc / 100, pc % 100); + } else if (flags & HIST_FIELD_FL_GRAPH) { + seq_printf(m, " %s: %20s", field_name, + __fill_bar_str(bar, 20, val, stats[idx].max)); } else if (flags & HIST_FIELD_FL_HEX) { seq_printf(m, " %s: %10llx", field_name, val); } else { @@ -5339,7 +5378,7 @@ static void hist_trigger_print_val(struct seq_file *m, unsigned int idx, static void hist_trigger_entry_print(struct seq_file *m, struct hist_trigger_data *hist_data, - u64 *totals, + struct hist_val_stat *stats, void *key, struct tracing_map_elt *elt) { @@ -5350,7 +5389,7 @@ static void hist_trigger_entry_print(struct seq_file *m, hist_trigger_print_key(m, hist_data, key, elt); /* At first, show the raw hitcount always */ - hist_trigger_print_val(m, i, "hitcount", 0, totals, elt); + hist_trigger_print_val(m, i, "hitcount", 0, stats, elt); for (i = 1; i < hist_data->n_vals; i++) { field_name = hist_field_name(hist_data->fields[i], 0); @@ -5360,7 +5399,7 @@ static void hist_trigger_entry_print(struct seq_file *m, continue; seq_puts(m, " "); - hist_trigger_print_val(m, i, field_name, flags, totals, elt); + hist_trigger_print_val(m, i, field_name, flags, stats, elt); } print_actions(m, hist_data, elt); @@ -5374,7 +5413,8 @@ static int print_entries(struct seq_file *m, struct tracing_map_sort_entry **sort_entries = NULL; struct tracing_map *map = hist_data->map; int i, j, n_entries; - u64 *totals = NULL; + struct hist_val_stat *stats = NULL; + u64 val; n_entries = tracing_map_sort_entries(map, hist_data->sort_keys, hist_data->n_sort_keys, @@ -5382,28 +5422,33 @@ static int print_entries(struct seq_file *m, if (n_entries < 0) return n_entries; + /* Calculate the max and the total for each field if needed. */ for (j = 0; j < hist_data->n_vals; j++) { - if (!(hist_data->fields[j]->flags & HIST_FIELD_FL_PERCENT)) + if (!(hist_data->fields[j]->flags & + (HIST_FIELD_FL_PERCENT | HIST_FIELD_FL_GRAPH))) continue; - if (!totals) { - totals = kcalloc(hist_data->n_vals, sizeof(u64), - GFP_KERNEL); - if (!totals) { + if (!stats) { + stats = kcalloc(hist_data->n_vals, sizeof(*stats), + GFP_KERNEL); + if (!stats) { n_entries = -ENOMEM; goto out; } } - for (i = 0; i < n_entries; i++) - totals[j] += tracing_map_read_sum( - sort_entries[i]->elt, j); + for (i = 0; i < n_entries; i++) { + val = tracing_map_read_sum(sort_entries[i]->elt, j); + stats[j].total += val; + if (stats[j].max < val) + stats[j].max = val; + } } for (i = 0; i < n_entries; i++) - hist_trigger_entry_print(m, hist_data, totals, + hist_trigger_entry_print(m, hist_data, stats, sort_entries[i]->key, sort_entries[i]->elt); - kfree(totals); + kfree(stats); out: tracing_map_destroy_sort_entries(sort_entries, n_entries); -- cgit v1.2.3 From ccf47f5cc4cebdba7444cf1c04fc1b02d4b3cd15 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 20 Oct 2022 00:31:55 +0900 Subject: tracing: Add nohitcount option for suppressing display of raw hitcount Add 'nohitcount' ('NOHC' for short) option for suppressing display of the raw hitcount column in the histogram. Note that you must specify at least one value except raw 'hitcount' when you specify this nohitcount option. # cd /sys/kernel/debug/tracing/ # echo hist:keys=pid:vals=runtime.percent,runtime.graph:sort=pid:NOHC > \ events/sched/sched_stat_runtime/trigger # sleep 10 # cat events/sched/sched_stat_runtime/hist # event histogram # # trigger info: hist:keys=pid:vals=runtime.percent,runtime.graph:sort=pid:size=2048:nohitcount [active] # { pid: 8 } runtime (%): 3.02 runtime: # { pid: 14 } runtime (%): 2.25 runtime: { pid: 16 } runtime (%): 2.25 runtime: { pid: 26 } runtime (%): 0.17 runtime: { pid: 61 } runtime (%): 11.52 runtime: #### { pid: 67 } runtime (%): 1.56 runtime: { pid: 68 } runtime (%): 0.84 runtime: { pid: 76 } runtime (%): 0.92 runtime: { pid: 117 } runtime (%): 2.50 runtime: # { pid: 146 } runtime (%): 49.88 runtime: #################### { pid: 157 } runtime (%): 16.63 runtime: ###### { pid: 158 } runtime (%): 8.38 runtime: ### Link: https://lore.kernel.org/linux-trace-kernel/166610814787.56030.4980636083486339906.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi --- kernel/trace/trace.c | 3 +++ kernel/trace/trace_events_hist.c | 34 ++++++++++++++++++++++++++-------- 2 files changed, 29 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 55aec4616d8b..948f321b9df1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5678,6 +5678,7 @@ static const char readme_msg[] = "\t [:size=#entries]\n" "\t [:pause][:continue][:clear]\n" "\t [:name=histname1]\n" + "\t [:nohitcount]\n" "\t [:.]\n" "\t [if ]\n\n" "\t Note, special fields can be used as well:\n" @@ -5734,6 +5735,8 @@ static const char readme_msg[] = "\t The 'clear' parameter will clear the contents of a running\n" "\t hist trigger and leave its current paused/active state\n" "\t unchanged.\n\n" + "\t The 'nohitcount' (or NOHC) parameter will suppress display of\n" + "\t raw hitcount in the histogram.\n\n" "\t The enable_hist and disable_hist triggers can be used to\n" "\t have one event conditionally start and stop another event's\n" "\t already-attached hist trigger. The syntax is analogous to\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 946b2b8f0f2c..a0cd118af527 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -69,7 +69,8 @@ C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), \ C(EXPECT_NUMBER, "Expecting numeric literal"), \ C(UNARY_MINUS_SUBEXPR, "Unary minus not supported in sub-expressions"), \ - C(DIVISION_BY_ZERO, "Division by zero"), + C(DIVISION_BY_ZERO, "Division by zero"), \ + C(NEED_NOHC_VAL, "Non-hitcount value is required for 'nohitcount'"), #undef C #define C(a, b) HIST_ERR_##a @@ -526,6 +527,7 @@ struct hist_trigger_attrs { bool cont; bool clear; bool ts_in_usecs; + bool no_hitcount; unsigned int map_bits; char *assignment_str[TRACING_MAP_VARS_MAX]; @@ -1550,7 +1552,10 @@ parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str) ret = parse_assignment(tr, str, attrs); if (ret) goto free; - } else if (strcmp(str, "pause") == 0) + } else if (strcmp(str, "nohitcount") == 0 || + strcmp(str, "NOHC") == 0) + attrs->no_hitcount = true; + else if (strcmp(str, "pause") == 0) attrs->pause = true; else if ((strcmp(str, "cont") == 0) || (strcmp(str, "continue") == 0)) @@ -4377,6 +4382,12 @@ static int create_val_fields(struct hist_trigger_data *hist_data, if (fields_str && (strcmp(fields_str, "hitcount") != 0)) ret = -EINVAL; out: + /* There is only raw hitcount but nohitcount suppresses it. */ + if (j == 1 && hist_data->attrs->no_hitcount) { + hist_err(hist_data->event_file->tr, HIST_ERR_NEED_NOHC_VAL, 0); + ret = -ENOENT; + } + return ret; } @@ -5388,13 +5399,13 @@ static void hist_trigger_entry_print(struct seq_file *m, hist_trigger_print_key(m, hist_data, key, elt); - /* At first, show the raw hitcount always */ - hist_trigger_print_val(m, i, "hitcount", 0, stats, elt); + /* At first, show the raw hitcount if !nohitcount */ + if (!hist_data->attrs->no_hitcount) + hist_trigger_print_val(m, i, "hitcount", 0, stats, elt); for (i = 1; i < hist_data->n_vals; i++) { field_name = hist_field_name(hist_data->fields[i], 0); flags = hist_data->fields[i]->flags; - if (flags & HIST_FIELD_FL_VAR || flags & HIST_FIELD_FL_EXPR) continue; @@ -5839,6 +5850,7 @@ static int event_hist_trigger_print(struct seq_file *m, struct hist_trigger_data *hist_data = data->private_data; struct hist_field *field; bool have_var = false; + bool show_val = false; unsigned int i; seq_puts(m, HIST_PREFIX); @@ -5869,12 +5881,16 @@ static int event_hist_trigger_print(struct seq_file *m, continue; } - if (i == HITCOUNT_IDX) + if (i == HITCOUNT_IDX) { + if (hist_data->attrs->no_hitcount) + continue; seq_puts(m, "hitcount"); - else { - seq_puts(m, ","); + } else { + if (show_val) + seq_puts(m, ","); hist_field_print(m, field); } + show_val = true; } if (have_var) { @@ -5925,6 +5941,8 @@ static int event_hist_trigger_print(struct seq_file *m, seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits)); if (hist_data->enable_timestamps) seq_printf(m, ":clock=%s", hist_data->attrs->clock); + if (hist_data->attrs->no_hitcount) + seq_puts(m, ":nohitcount"); print_actions_spec(m, hist_data); -- cgit v1.2.3 From fff1787adaeebe66f27c01d5c40d8d2e4d79d5ee Mon Sep 17 00:00:00 2001 From: Song Chen Date: Wed, 19 Oct 2022 10:11:18 +0800 Subject: trace/kprobe: remove duplicated calls of ring_buffer_event_data Function __kprobe_trace_func calls ring_buffer_event_data to get a ring buffer, however, it has been done in above call trace_event_buffer_reserve. So does __kretprobe_trace_func. This patch removes those duplicated calls. Link: https://lore.kernel.org/all/1666145478-4706-1-git-send-email-chensong_2000@189.cn/ Reviewed-by: Steven Rostedt (Google) Acked-by: Masami Hiramatsu (Google) Signed-off-by: Song Chen Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_kprobe.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5a75b039e586..ee77c8203bd5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1344,7 +1344,6 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, return; fbuffer.regs = regs; - entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->ip = (unsigned long)tk->rp.kp.addr; store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); @@ -1385,7 +1384,6 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, return; fbuffer.regs = regs; - entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = get_kretprobe_retaddr(ri); store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); -- cgit v1.2.3 From 575b76cb885532aae13a9d979fd476bb2b156cb9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 22 Nov 2022 12:23:45 -0500 Subject: tracing/probes: Handle system names with hyphens When creating probe names, a check is done to make sure it matches basic C standard variable naming standards. Basically, starts with alphabetic or underline, and then the rest of the characters have alpha-numeric or underline in them. But system names do not have any true naming conventions, as they are created by the TRACE_SYSTEM macro and nothing tests to see what they are. The "xhci-hcd" trace events has a '-' in the system name. When trying to attach a eprobe to one of these trace points, it fails because the system name does not follow the variable naming convention because of the hyphen, and the eprobe checks fail on this. Allow hyphens in the system name so that eprobes can attach to the "xhci-hcd" trace events. Link: https://lore.kernel.org/all/Y3eJ8GiGnEvVd8%2FN@macondo/ Link: https://lore.kernel.org/linux-trace-kernel/20221122122345.160f5077@gandalf.local.home Cc: Masami Hiramatsu Cc: stable@vger.kernel.org Fixes: 5b7a96220900e ("tracing/probe: Check event/group naming rule at parsing") Reported-by: Rafael Mendonca Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 19 ++++++++++++++++--- kernel/trace/trace_probe.c | 2 +- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 48643f07bc01..8f37ff032b4f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1954,17 +1954,30 @@ static __always_inline void trace_iterator_reset(struct trace_iterator *iter) } /* Check the name is good for event/group/fields */ -static inline bool is_good_name(const char *name) +static inline bool __is_good_name(const char *name, bool hash_ok) { - if (!isalpha(*name) && *name != '_') + if (!isalpha(*name) && *name != '_' && (!hash_ok || *name != '-')) return false; while (*++name != '\0') { - if (!isalpha(*name) && !isdigit(*name) && *name != '_') + if (!isalpha(*name) && !isdigit(*name) && *name != '_' && + (!hash_ok || *name != '-')) return false; } return true; } +/* Check the name is good for event/group/fields */ +static inline bool is_good_name(const char *name) +{ + return __is_good_name(name, false); +} + +/* Check the name is good for system */ +static inline bool is_good_system_name(const char *name) +{ + return __is_good_name(name, true); +} + /* Convert certain expected symbols into '_' when generating event names */ static inline void sanitize_event_name(char *name) { diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 36dff277de46..bb2f95d7175c 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -246,7 +246,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, return -EINVAL; } strlcpy(buf, event, slash - event + 1); - if (!is_good_name(buf)) { + if (!is_good_system_name(buf)) { trace_probe_log_err(offset, BAD_GROUP_NAME); return -EINVAL; } -- cgit v1.2.3 From e25e43a4e5d8cb2323553d8b6a7ba08d2ebab21f Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 6 Dec 2022 23:18:01 +0900 Subject: tracing: Fix complicated dependency of CONFIG_TRACER_MAX_TRACE Both CONFIG_OSNOISE_TRACER and CONFIG_HWLAT_TRACER partially enables the CONFIG_TRACER_MAX_TRACE code, but that is complicated and has introduced a bug; It declares tracing_max_lat_fops data structure outside of #ifdefs, but since it is defined only when CONFIG_TRACER_MAX_TRACE=y or CONFIG_HWLAT_TRACER=y, if only CONFIG_OSNOISE_TRACER=y, that declaration comes to a definition(!). To fix this issue, and do not repeat the similar problem, makes CONFIG_OSNOISE_TRACER and CONFIG_HWLAT_TRACER enables the CONFIG_TRACER_MAX_TRACE always. It has there benefits; - Fix the tracing_max_lat_fops bug - Simplify the #ifdefs - CONFIG_TRACER_MAX_TRACE code is fully enabled, or not. Link: https://lore.kernel.org/linux-trace-kernel/167033628155.4111793.12185405690820208159.stgit@devnote3 Fixes: 424b650f35c7 ("tracing: Fix missing osnoise tracer on max_latency") Cc: Daniel Bristot de Oliveira Cc: stable@vger.kernel.org Reported-by: David Howells Reported-by: kernel test robot Signed-off-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/all/166992525941.1716618.13740663757583361463.stgit@warthog.procyon.org.uk/ (original thread and v1) Link: https://lore.kernel.org/all/202212052253.VuhZ2ulJ-lkp@intel.com/T/#u (v1 error report) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/Kconfig | 2 ++ kernel/trace/trace.c | 23 +++++++++++++---------- kernel/trace/trace.h | 8 +++----- 3 files changed, 18 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e9e95c790b8e..93d724996283 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -375,6 +375,7 @@ config SCHED_TRACER config HWLAT_TRACER bool "Tracer to detect hardware latencies (like SMIs)" select GENERIC_TRACER + select TRACER_MAX_TRACE help This tracer, when enabled will create one or more kernel threads, depending on what the cpumask file is set to, which each thread @@ -410,6 +411,7 @@ config HWLAT_TRACER config OSNOISE_TRACER bool "OS Noise tracer" select GENERIC_TRACER + select TRACER_MAX_TRACE help In the context of high-performance computing (HPC), the Operating System Noise (osnoise) refers to the interference experienced by an diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 948f321b9df1..664619b3f1e1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1421,6 +1421,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr) return false; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); +#define free_snapshot(tr) do { } while (0) #endif /* CONFIG_TRACER_SNAPSHOT */ void tracer_tracing_off(struct trace_array *tr) @@ -1692,6 +1693,8 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) } unsigned long __read_mostly tracing_thresh; + +#ifdef CONFIG_TRACER_MAX_TRACE static const struct file_operations tracing_max_lat_fops; #ifdef LATENCY_FS_NOTIFY @@ -1748,18 +1751,14 @@ void latency_fsnotify(struct trace_array *tr) irq_work_queue(&tr->fsnotify_irqwork); } -#elif defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \ - || defined(CONFIG_OSNOISE_TRACER) +#else /* !LATENCY_FS_NOTIFY */ #define trace_create_maxlat_file(tr, d_tracer) \ trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \ d_tracer, &tr->max_latency, &tracing_max_lat_fops) -#else -#define trace_create_maxlat_file(tr, d_tracer) do { } while (0) #endif -#ifdef CONFIG_TRACER_MAX_TRACE /* * Copy the new maximum trace into the separate maximum-trace * structure. (this way the maximum trace is permanently saved, @@ -1834,14 +1833,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, ring_buffer_record_off(tr->max_buffer.buffer); #ifdef CONFIG_TRACER_SNAPSHOT - if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) - goto out_unlock; + if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) { + arch_spin_unlock(&tr->max_lock); + return; + } #endif swap(tr->array_buffer.buffer, tr->max_buffer.buffer); __update_max_tr(tr, tsk, cpu); - out_unlock: arch_spin_unlock(&tr->max_lock); } @@ -1888,6 +1888,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) __update_max_tr(tr, tsk, cpu); arch_spin_unlock(&tr->max_lock); } + #endif /* CONFIG_TRACER_MAX_TRACE */ static int wait_on_pipe(struct trace_iterator *iter, int full) @@ -6577,7 +6578,7 @@ out: return ret; } -#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) +#ifdef CONFIG_TRACER_MAX_TRACE static ssize_t tracing_max_lat_read(struct file *filp, char __user *ubuf, @@ -7592,7 +7593,7 @@ static const struct file_operations tracing_thresh_fops = { .llseek = generic_file_llseek, }; -#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) +#ifdef CONFIG_TRACER_MAX_TRACE static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -9606,7 +9607,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) create_trace_options_dir(tr); +#ifdef CONFIG_TRACER_MAX_TRACE trace_create_maxlat_file(tr, d_tracer); +#endif if (ftrace_create_function_files(tr, d_tracer)) MEM_FAIL(1, "Could not allocate function filter files"); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8f37ff032b4f..9dc920b01c17 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -308,8 +308,7 @@ struct trace_array { struct array_buffer max_buffer; bool allocated_snapshot; #endif -#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \ - || defined(CONFIG_OSNOISE_TRACER) +#ifdef CONFIG_TRACER_MAX_TRACE unsigned long max_latency; #ifdef CONFIG_FSNOTIFY struct dentry *d_max_latency; @@ -688,12 +687,11 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, void *cond_data); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); -#endif /* CONFIG_TRACER_MAX_TRACE */ -#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \ - || defined(CONFIG_OSNOISE_TRACER)) && defined(CONFIG_FSNOTIFY) +#ifdef CONFIG_FSNOTIFY #define LATENCY_FS_NOTIFY #endif +#endif /* CONFIG_TRACER_MAX_TRACE */ #ifdef LATENCY_FS_NOTIFY void latency_fsnotify(struct trace_array *tr); -- cgit v1.2.3 From 2cc6a528882d0e0ccbc1bca5f95b8c963cedac54 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Wed, 7 Dec 2022 11:46:35 +0800 Subject: tracing/hist: Fix wrong return value in parse_action_params() When number of synth fields is more than SYNTH_FIELDS_MAX, parse_action_params() should return -EINVAL. Link: https://lore.kernel.org/linux-trace-kernel/20221207034635.2253990-1-zhengyejian1@huawei.com Cc: Cc: Cc: stable@vger.kernel.org Fixes: c282a386a397 ("tracing: Add 'onmatch' hist trigger action support") Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a0cd118af527..b4ad86c22b43 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3609,6 +3609,7 @@ static int parse_action_params(struct trace_array *tr, char *params, while (params) { if (data->n_params >= SYNTH_FIELDS_MAX) { hist_err(tr, HIST_ERR_TOO_MANY_PARAMS, 0); + ret = -EINVAL; goto out; } -- cgit v1.2.3 From 82470f7d9044842618c847a7166de2b7458157a7 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Wed, 7 Dec 2022 11:51:43 +0800 Subject: tracing/hist: Fix out-of-bound write on 'action_data.var_ref_idx' When generate a synthetic event with many params and then create a trace action for it [1], kernel panic happened [2]. It is because that in trace_action_create() 'data->n_params' is up to SYNTH_FIELDS_MAX (current value is 64), and array 'data->var_ref_idx' keeps indices into array 'hist_data->var_refs' for each synthetic event param, but the length of 'data->var_ref_idx' is TRACING_MAP_VARS_MAX (current value is 16), so out-of-bound write happened when 'data->n_params' more than 16. In this case, 'data->match_data.event' is overwritten and eventually cause the panic. To solve the issue, adjust the length of 'data->var_ref_idx' to be SYNTH_FIELDS_MAX and add sanity checks to avoid out-of-bound write. [1] # cd /sys/kernel/tracing/ # echo "my_synth_event int v1; int v2; int v3; int v4; int v5; int v6;\ int v7; int v8; int v9; int v10; int v11; int v12; int v13; int v14;\ int v15; int v16; int v17; int v18; int v19; int v20; int v21; int v22;\ int v23; int v24; int v25; int v26; int v27; int v28; int v29; int v30;\ int v31; int v32; int v33; int v34; int v35; int v36; int v37; int v38;\ int v39; int v40; int v41; int v42; int v43; int v44; int v45; int v46;\ int v47; int v48; int v49; int v50; int v51; int v52; int v53; int v54;\ int v55; int v56; int v57; int v58; int v59; int v60; int v61; int v62;\ int v63" >> synthetic_events # echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="bash"' >> \ events/sched/sched_waking/trigger # echo "hist:keys=next_pid:onmatch(sched.sched_waking).my_synth_event(\ pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,\ pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,\ pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,\ pid,pid,pid,pid,pid,pid,pid,pid,pid)" >> events/sched/sched_switch/trigger [2] BUG: unable to handle page fault for address: ffff91c900000000 PGD 61001067 P4D 61001067 PUD 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 2 PID: 322 Comm: bash Tainted: G W 6.1.0-rc8+ #229 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 RIP: 0010:strcmp+0xc/0x30 Code: 75 f7 31 d2 44 0f b6 04 16 44 88 04 11 48 83 c2 01 45 84 c0 75 ee c3 cc cc cc cc 0f 1f 00 31 c0 eb 08 48 83 c0 01 84 d2 74 13 <0f> b6 14 07 3a 14 06 74 ef 19 c0 83 c8 01 c3 cc cc cc cc 31 c3 RSP: 0018:ffff9b3b00f53c48 EFLAGS: 00000246 RAX: 0000000000000000 RBX: ffffffffba958a68 RCX: 0000000000000000 RDX: 0000000000000010 RSI: ffff91c943d33a90 RDI: ffff91c900000000 RBP: ffff91c900000000 R08: 00000018d604b529 R09: 0000000000000000 R10: ffff91c9483eddb1 R11: ffff91ca483eddab R12: ffff91c946171580 R13: ffff91c9479f0538 R14: ffff91c9457c2848 R15: ffff91c9479f0538 FS: 00007f1d1cfbe740(0000) GS:ffff91c9bdc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff91c900000000 CR3: 0000000006316000 CR4: 00000000000006e0 Call Trace: __find_event_file+0x55/0x90 action_create+0x76c/0x1060 event_hist_trigger_parse+0x146d/0x2060 ? event_trigger_write+0x31/0xd0 trigger_process_regex+0xbb/0x110 event_trigger_write+0x6b/0xd0 vfs_write+0xc8/0x3e0 ? alloc_fd+0xc0/0x160 ? preempt_count_add+0x4d/0xa0 ? preempt_count_add+0x70/0xa0 ksys_write+0x5f/0xe0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f1d1d0cf077 Code: 64 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 RSP: 002b:00007ffcebb0e568 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000143 RCX: 00007f1d1d0cf077 RDX: 0000000000000143 RSI: 00005639265aa7e0 RDI: 0000000000000001 RBP: 00005639265aa7e0 R08: 000000000000000a R09: 0000000000000142 R10: 000056392639c017 R11: 0000000000000246 R12: 0000000000000143 R13: 00007f1d1d1ae6a0 R14: 00007f1d1d1aa4a0 R15: 00007f1d1d1a98a0 Modules linked in: CR2: ffff91c900000000 ---[ end trace 0000000000000000 ]--- RIP: 0010:strcmp+0xc/0x30 Code: 75 f7 31 d2 44 0f b6 04 16 44 88 04 11 48 83 c2 01 45 84 c0 75 ee c3 cc cc cc cc 0f 1f 00 31 c0 eb 08 48 83 c0 01 84 d2 74 13 <0f> b6 14 07 3a 14 06 74 ef 19 c0 83 c8 01 c3 cc cc cc cc 31 c3 RSP: 0018:ffff9b3b00f53c48 EFLAGS: 00000246 RAX: 0000000000000000 RBX: ffffffffba958a68 RCX: 0000000000000000 RDX: 0000000000000010 RSI: ffff91c943d33a90 RDI: ffff91c900000000 RBP: ffff91c900000000 R08: 00000018d604b529 R09: 0000000000000000 R10: ffff91c9483eddb1 R11: ffff91ca483eddab R12: ffff91c946171580 R13: ffff91c9479f0538 R14: ffff91c9457c2848 R15: ffff91c9479f0538 FS: 00007f1d1cfbe740(0000) GS:ffff91c9bdc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff91c900000000 CR3: 0000000006316000 CR4: 00000000000006e0 Link: https://lore.kernel.org/linux-trace-kernel/20221207035143.2278781-1-zhengyejian1@huawei.com Cc: Cc: Cc: stable@vger.kernel.org Fixes: d380dcde9a07 ("tracing: Fix now invalid var_ref_vals assumption in trace action") Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index b4ad86c22b43..8264b28d5a57 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -621,7 +621,7 @@ struct action_data { * event param, and is passed to the synthetic event * invocation. */ - unsigned int var_ref_idx[TRACING_MAP_VARS_MAX]; + unsigned int var_ref_idx[SYNTH_FIELDS_MAX]; struct synth_event *synth_event; bool use_trace_keyword; char *synth_event_name; @@ -2186,7 +2186,9 @@ static struct hist_field *create_var_ref(struct hist_trigger_data *hist_data, return ref_field; } } - + /* Sanity check to avoid out-of-bound write on 'hist_data->var_refs' */ + if (hist_data->n_var_refs >= TRACING_MAP_VARS_MAX) + return NULL; ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL); if (ref_field) { if (init_var_ref(ref_field, var_field, system, event_name)) { @@ -3946,6 +3948,10 @@ static int trace_action_create(struct hist_trigger_data *hist_data, lockdep_assert_held(&event_mutex); + /* Sanity check to avoid out-of-bound write on 'data->var_ref_idx' */ + if (data->n_params > SYNTH_FIELDS_MAX) + return -EINVAL; + if (data->use_trace_keyword) synth_event_name = data->synth_event_name; else -- cgit v1.2.3 From ff4837f7fe59ff018eca4705a70eca5e0b486b97 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Wed, 7 Dec 2022 17:15:57 +0800 Subject: tracing: Fix issue of missing one synthetic field The maximum number of synthetic fields supported is defined as SYNTH_FIELDS_MAX which value currently is 64, but it actually fails when try to generate a synthetic event with 64 fields by executing like: # echo "my_synth_event int v1; int v2; int v3; int v4; int v5; int v6;\ int v7; int v8; int v9; int v10; int v11; int v12; int v13; int v14;\ int v15; int v16; int v17; int v18; int v19; int v20; int v21; int v22;\ int v23; int v24; int v25; int v26; int v27; int v28; int v29; int v30;\ int v31; int v32; int v33; int v34; int v35; int v36; int v37; int v38;\ int v39; int v40; int v41; int v42; int v43; int v44; int v45; int v46;\ int v47; int v48; int v49; int v50; int v51; int v52; int v53; int v54;\ int v55; int v56; int v57; int v58; int v59; int v60; int v61; int v62;\ int v63; int v64" >> /sys/kernel/tracing/synthetic_events Correct the field counting to fix it. Link: https://lore.kernel.org/linux-trace-kernel/20221207091557.3137904-1-zhengyejian1@huawei.com Cc: Cc: Cc: stable@vger.kernel.org Fixes: c9e759b1e845 ("tracing: Rework synthetic event command parsing") Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_synth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index c3b582d19b62..67592eed0be8 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1282,12 +1282,12 @@ static int __create_synth_event(const char *name, const char *raw_fields) goto err_free_arg; } - fields[n_fields++] = field; if (n_fields == SYNTH_FIELDS_MAX) { synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0); ret = -EINVAL; goto err_free_arg; } + fields[n_fields++] = field; n_fields_this_loop++; } -- cgit v1.2.3 From 608c6ed3337850c767ab0dd6c583477922233e29 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Wed, 7 Dec 2022 21:53:26 +0800 Subject: tracing/hist: Fix issue of losting command info in error_log When input some constructed invalid 'trigger' command, command info in 'error_log' are lost [1]. The root cause is that there is a path that event_hist_trigger_parse() is recursely called once and 'last_cmd' which save origin command is cleared, then later calling of hist_err() will no longer record origin command info: event_hist_trigger_parse() { last_cmd_set() // <1> 'last_cmd' save origin command here at first create_actions() { onmatch_create() { action_create() { trace_action_create() { trace_action_create_field_var() { create_field_var_hist() { event_hist_trigger_parse() { // <2> recursely called once hist_err_clear() // <3> 'last_cmd' is cleared here } hist_err() // <4> No longer find origin command!!! Since 'glob' is empty string while running into the recurse call, we can trickly check it and bypass the call of hist_err_clear() to solve it. [1] # cd /sys/kernel/tracing # echo "my_synth_event int v1; int v2; int v3;" >> synthetic_events # echo 'hist:keys=pid' >> events/sched/sched_waking/trigger # echo "hist:keys=next_pid:onmatch(sched.sched_waking).my_synth_event(\ pid,pid1)" >> events/sched/sched_switch/trigger # cat error_log [ 8.405018] hist:sched:sched_switch: error: Couldn't find synthetic event Command: hist:keys=next_pid:onmatch(sched.sched_waking).my_synth_event(pid,pid1) ^ [ 8.816902] hist:sched:sched_switch: error: Couldn't find field Command: hist:keys=next_pid:onmatch(sched.sched_waking).my_synth_event(pid,pid1) ^ [ 8.816902] hist:sched:sched_switch: error: Couldn't parse field variable Command: hist:keys=next_pid:onmatch(sched.sched_waking).my_synth_event(pid,pid1) ^ [ 8.999880] : error: Couldn't find field Command: ^ [ 8.999880] : error: Couldn't parse field variable Command: ^ [ 8.999880] : error: Couldn't find field Command: ^ [ 8.999880] : error: Couldn't create histogram for field Command: ^ Link: https://lore.kernel.org/linux-trace-kernel/20221207135326.3483216-1-zhengyejian1@huawei.com Cc: Cc: Fixes: f404da6e1d46 ("tracing: Add 'last error' error facility for hist triggers") Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 8264b28d5a57..fcaf226b7744 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6576,7 +6576,7 @@ enable: if (se) se->ref++; out: - if (ret == 0) + if (ret == 0 && glob[0]) hist_err_clear(); return ret; -- cgit v1.2.3 From 88ca6a71dcab4a4ba6e6e2ff66415a5c4f86e874 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 9 Dec 2022 10:11:51 -0500 Subject: ring-buffer: Handle resize in early boot up With the new command line option that allows trace event triggers to be added at boot, the "snapshot" trigger will allocate the snapshot buffer very early, when interrupts can not be enabled. Allocating the ring buffer is not the problem, but it also resizes it, which is, as the resize code does synchronization that can not be preformed at early boot. To handle this, first change the raw_spin_lock_irq() in rb_insert_pages() to raw_spin_lock_irqsave(), such that the unlocking of that spin lock will not enable interrupts. Next, where it calls schedule_work_on(), disable migration and check if the CPU to update is the current CPU, and if so, perform the work directly, otherwise re-enable migration and call the schedule_work_on() to the CPU that is being updated. The rb_insert_pages() just needs to be run on the CPU that it is updating, and does not need preemption nor interrupts disabled when calling it. Link: https://lore.kernel.org/lkml/Y5J%2FCajlNh1gexvo@google.com/ Link: https://lore.kernel.org/linux-trace-kernel/20221209101151.1fec1167@gandalf.local.home Cc: Masami Hiramatsu Cc: Andrew Morton Fixes: a01fdc897fa5 ("tracing: Add trace_trigger kernel command line option") Reported-by: Ross Zwisler Signed-off-by: Steven Rostedt Tested-by: Ross Zwisler Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 843818ee4814..c366a0a9ddba 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2062,8 +2062,10 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *pages = &cpu_buffer->new_pages; int retries, success; + unsigned long flags; - raw_spin_lock_irq(&cpu_buffer->reader_lock); + /* Can be called at early boot up, where interrupts must not been enabled */ + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* * We are holding the reader lock, so the reader page won't be swapped * in the ring buffer. Now we are racing with the writer trying to @@ -2120,7 +2122,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) * tracing */ RB_WARN_ON(cpu_buffer, !success); - raw_spin_unlock_irq(&cpu_buffer->reader_lock); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); /* free pages if they weren't inserted */ if (!success) { @@ -2248,8 +2250,16 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, rb_update_pages(cpu_buffer); cpu_buffer->nr_pages_to_update = 0; } else { - schedule_work_on(cpu, - &cpu_buffer->update_pages_work); + /* Run directly if possible. */ + migrate_disable(); + if (cpu != smp_processor_id()) { + migrate_enable(); + schedule_work_on(cpu, + &cpu_buffer->update_pages_work); + } else { + update_pages_handler(&cpu_buffer->update_pages_work); + migrate_enable(); + } } } @@ -2298,9 +2308,17 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, if (!cpu_online(cpu_id)) rb_update_pages(cpu_buffer); else { - schedule_work_on(cpu_id, - &cpu_buffer->update_pages_work); - wait_for_completion(&cpu_buffer->update_done); + /* Run directly if possible. */ + migrate_disable(); + if (cpu_id == smp_processor_id()) { + rb_update_pages(cpu_buffer); + migrate_enable(); + } else { + migrate_enable(); + schedule_work_on(cpu_id, + &cpu_buffer->update_pages_work); + wait_for_completion(&cpu_buffer->update_done); + } } cpu_buffer->nr_pages_to_update = 0; -- cgit v1.2.3 From 98629dadcd020cba690ce0c49a564a1166444646 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Thu, 8 Dec 2022 17:33:10 -0700 Subject: tracing: remove unnecessary trace_trigger ifdef The trace_trigger command line option introduced by commit a01fdc897fa5 ("tracing: Add trace_trigger kernel command line option") doesn't need to depend on the CONFIG_HIST_TRIGGERS kernel config option. This code doesn't depend on the histogram code, and the run-time selection of triggers is usable without CONFIG_HIST_TRIGGERS. Link: https://lore.kernel.org/linux-trace-kernel/20221209003310.1737039-1-zwisler@google.com Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Mathieu Desnoyers Cc: Joel Fernandes Cc: Tom Zanussi Fixes: a01fdc897fa5 ("tracing: Add trace_trigger kernel command line option") Signed-off-by: Ross Zwisler Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3bfaf560ecc4..33e0b4f8ebe6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2796,7 +2796,6 @@ trace_create_new_event(struct trace_event_call *call, return file; } -#ifdef CONFIG_HIST_TRIGGERS #define MAX_BOOT_TRIGGERS 32 static struct boot_triggers { @@ -2832,7 +2831,6 @@ static __init int setup_trace_triggers(char *str) return 1; } __setup("trace_trigger=", setup_trace_triggers); -#endif /* Add an event to a trace directory */ static int @@ -2850,7 +2848,6 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) return event_define_fields(call); } -#ifdef CONFIG_HIST_TRIGGERS static void trace_early_triggers(struct trace_event_file *file, const char *name) { int ret; @@ -2868,9 +2865,6 @@ static void trace_early_triggers(struct trace_event_file *file, const char *name bootup_triggers[i].event); } } -#else -static inline void trace_early_triggers(struct trace_event_file *file, const char *name) { } -#endif /* * Just create a descriptor for early init. A descriptor is required -- cgit v1.2.3 From ec370890f92ba8ad5476a34068655b06ba48def7 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Mon, 5 Dec 2022 09:27:38 +0100 Subject: tracing/osnoise: Make osnoise_options static Make osnoise_options static, as reported by the kernel test robot. Link: https://lkml.kernel.org/r/63255826485400d7a2270e9c5e66111079671e7a.1670228712.git.bristot@kernel.org Reported-by: kernel test robot Cc: Daniel Bristot de Oliveira Cc: Steven Rostedt Cc: Masami Hiramatsu Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 3f10dd1f2f1c..8ba82c71268f 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -59,8 +59,8 @@ enum osnoise_options_index { static const char * const osnoise_options_str[OSN_MAX] = { "DEFAULTS", "OSNOISE_WORKLOAD" }; -#define OSN_DEFAULT_OPTIONS 0x2 -unsigned long osnoise_options = OSN_DEFAULT_OPTIONS; +#define OSN_DEFAULT_OPTIONS 0x2 +static unsigned long osnoise_options = OSN_DEFAULT_OPTIONS; /* * trace_array of the enabled osnoise/timerlat instances. -- cgit v1.2.3 From bfd5a5e82d22da43afa0e2bb9fb72339aa79c6cc Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 5 Dec 2022 10:21:52 +0000 Subject: tracing: Fix some checker warnings Fix some checker warnings in the trace code by adding __printf attributes to a number of trace functions and their declarations. Changes: ======== ver #2) - Dropped the fix for the unconditional tracing_max_lat_fops decl[1]. Link: https://lore.kernel.org/r/20221205180617.9b9d3971cbe06ee536603523@kernel.org/ [1] Link: https://lore.kernel.org/r/166992525941.1716618.13740663757583361463.stgit@warthog.procyon.org.uk/ # v1 Link: https://lkml.kernel.org/r/167023571258.382307.15314866482834835192.stgit@warthog.procyon.org.uk Signed-off-by: David Howells Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 3 ++- include/linux/trace_seq.h | 3 ++- kernel/trace/trace.h | 2 +- kernel/trace/trace_output.c | 5 +++-- 4 files changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index bb2053246d6a..4342e996bcdb 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -234,7 +234,8 @@ void tracing_record_taskinfo_sched_switch(struct task_struct *prev, void tracing_record_cmdline(struct task_struct *task); void tracing_record_tgid(struct task_struct *task); -int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...); +int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) + __printf(3, 4); struct event_filter; diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 5a2c650d9e1c..0c4c7587d6c3 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -97,7 +97,8 @@ extern int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, const void *buf, size_t len, bool ascii); #else /* CONFIG_TRACING */ -static inline void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) +static inline __printf(2, 3) +void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) { } static inline void diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9dc920b01c17..e46a49269be2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -614,7 +614,7 @@ void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, bool trace_is_tracepoint_string(const char *str); const char *trace_event_format(struct trace_iterator *iter, const char *fmt); void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, - va_list ap); + va_list ap) __printf(2, 0); int trace_empty(struct trace_iterator *iter); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index f0ba97121345..57a13b61f186 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -322,8 +322,9 @@ void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...) } EXPORT_SYMBOL(trace_event_printf); -static int trace_output_raw(struct trace_iterator *iter, char *name, - char *fmt, va_list ap) +static __printf(3, 0) +int trace_output_raw(struct trace_iterator *iter, char *name, + char *fmt, va_list ap) { struct trace_seq *s = &iter->seq; -- cgit v1.2.3 From 1603dda47714cebe8a29b2154407da7a929d13f4 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 9 Dec 2022 23:05:52 +0100 Subject: tracing/osnoise: Add PANIC_ON_STOP option Often the latency observed in a CPU is not caused by the work being done in the CPU itself, but by work done on another CPU that causes the hardware to stall all CPUs. In this case, it is interesting to know what is happening on ALL CPUs, and the best way to do this is via crash dump analysis. Add the PANIC_ON_STOP option to osnoise/timerlat tracers. The default behavior is having this option off. When enabled by the user, the system will panic after hitting a stop tracing condition. This option was motivated by a real scenario that Juri Lelli and I were debugging. Link: https://lkml.kernel.org/r/249ce4287c6725543e6db845a6e0df621dc67db5.1670623111.git.bristot@kernel.org Cc: Juri Lelli Cc: Clark Williams Cc: Bagas Sanjaya Cc: Daniel Bristot de Oliveira Cc: Masami Hiramatsu Cc: Jonathan Corbet Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 8ba82c71268f..5a7613942223 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -54,10 +54,11 @@ enum osnoise_options_index { OSN_DEFAULTS = 0, OSN_WORKLOAD, + OSN_PANIC_ON_STOP, OSN_MAX }; -static const char * const osnoise_options_str[OSN_MAX] = { "DEFAULTS", "OSNOISE_WORKLOAD" }; +static const char * const osnoise_options_str[OSN_MAX] = { "DEFAULTS", "OSNOISE_WORKLOAD", "PANIC_ON_STOP" }; #define OSN_DEFAULT_OPTIONS 0x2 static unsigned long osnoise_options = OSN_DEFAULT_OPTIONS; @@ -1270,6 +1271,9 @@ static __always_inline void osnoise_stop_tracing(void) trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, "stop tracing hit on cpu %d\n", smp_processor_id()); + if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options)) + panic("tracer hit stop condition on CPU %d\n", smp_processor_id()); + tracer_tracing_off(tr); } rcu_read_unlock(); -- cgit v1.2.3 From b5dce2002567a9b1a83ef3e3a8678d8c32be2a78 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 9 Dec 2022 23:05:53 +0100 Subject: tracing/osnoise: Add preempt and/or irq disabled options The osnoise workload runs with preemption and IRQs enabled in such a way as to allow all sorts of noise to disturb osnoise's execution. hwlat tracer has a similar workload but works with irq disabled, allowing only NMIs and the hardware to generate noise. While thinking about adding an options file to hwlat tracer to allow the system to panic, and other features I was thinking to add, like having a tracepoint at each noise detection, it came to my mind that is easier to make osnoise and also do hardware latency detection than making hwlat "feature compatible" with osnoise. Other points are: - osnoise already has an independent cpu file. - osnoise has a more intuitive interface, e.g., runtime/period vs. window/width (and people often need help remembering what it is). - osnoise: tracepoints - osnoise stop options - osnoise options file itself Moreover, the user-space side (in rtla) is simplified by reusing the existing osnoise code. Finally, people have been asking me about using osnoise for hw latency detection, and I have to explain that it was sufficient but not necessary. These options make it sufficient and necessary. Adding a Suggested-by Clark, as he often asked me about this possibility. Link: https://lkml.kernel.org/r/d9c6c19135497054986900f94c8e47410b15316a.1670623111.git.bristot@kernel.org Cc: Suggested-by: Clark Williams Cc: Juri Lelli Cc: Bagas Sanjaya Cc: Daniel Bristot de Oliveira Cc: Masami Hiramatsu Cc: Jonathan Corbet Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 48 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 5a7613942223..94c1b5eb1dc0 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -55,10 +55,17 @@ enum osnoise_options_index { OSN_DEFAULTS = 0, OSN_WORKLOAD, OSN_PANIC_ON_STOP, + OSN_PREEMPT_DISABLE, + OSN_IRQ_DISABLE, OSN_MAX }; -static const char * const osnoise_options_str[OSN_MAX] = { "DEFAULTS", "OSNOISE_WORKLOAD", "PANIC_ON_STOP" }; +static const char * const osnoise_options_str[OSN_MAX] = { + "DEFAULTS", + "OSNOISE_WORKLOAD", + "PANIC_ON_STOP", + "OSNOISE_PREEMPT_DISABLE", + "OSNOISE_IRQ_DISABLE" }; #define OSN_DEFAULT_OPTIONS 0x2 static unsigned long osnoise_options = OSN_DEFAULT_OPTIONS; @@ -1308,18 +1315,26 @@ static void notify_new_max_latency(u64 latency) */ static int run_osnoise(void) { + bool disable_irq = test_bit(OSN_IRQ_DISABLE, &osnoise_options); struct osnoise_variables *osn_var = this_cpu_osn_var(); u64 start, sample, last_sample; u64 last_int_count, int_count; s64 noise = 0, max_noise = 0; s64 total, last_total = 0; struct osnoise_sample s; + bool disable_preemption; unsigned int threshold; u64 runtime, stop_in; u64 sum_noise = 0; int hw_count = 0; int ret = -1; + /* + * Disabling preemption is only required if IRQs are enabled, + * and the options is set on. + */ + disable_preemption = !disable_irq && test_bit(OSN_PREEMPT_DISABLE, &osnoise_options); + /* * Considers the current thread as the workload. */ @@ -1335,6 +1350,15 @@ static int run_osnoise(void) */ threshold = tracing_thresh ? : 5000; + /* + * Apply PREEMPT and IRQ disabled options. + */ + if (disable_irq) + local_irq_disable(); + + if (disable_preemption) + preempt_disable(); + /* * Make sure NMIs see sampling first */ @@ -1422,16 +1446,21 @@ static int run_osnoise(void) * cond_resched() */ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { - local_irq_disable(); + if (!disable_irq) + local_irq_disable(); + rcu_momentary_dyntick_idle(); - local_irq_enable(); + + if (!disable_irq) + local_irq_enable(); } /* * For the non-preemptive kernel config: let threads runs, if - * they so wish. + * they so wish, unless set not do to so. */ - cond_resched(); + if (!disable_irq && !disable_preemption) + cond_resched(); last_sample = sample; last_int_count = int_count; @@ -1450,6 +1479,15 @@ static int run_osnoise(void) */ barrier(); + /* + * Return to the preemptive state. + */ + if (disable_preemption) + preempt_enable(); + + if (disable_irq) + local_irq_enable(); + /* * Save noise info. */ -- cgit v1.2.3 From c1ac03af6ed45d05786c219d102f37eb44880f28 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Tue, 29 Nov 2022 19:30:09 +0800 Subject: tracing: Fix infinite loop in tracing_read_pipe on overflowed print_trace_line print_trace_line may overflow seq_file buffer. If the event is not consumed, the while loop keeps peeking this event, causing a infinite loop. Link: https://lkml.kernel.org/r/20221129113009.182425-1-yangjihong1@huawei.com Cc: Masami Hiramatsu Cc: stable@vger.kernel.org Fixes: 088b1e427dbba ("ftrace: pipe fixes") Signed-off-by: Yang Jihong Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 664619b3f1e1..548890c7c0f5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6802,7 +6802,20 @@ waitagain: ret = print_trace_line(iter); if (ret == TRACE_TYPE_PARTIAL_LINE) { - /* don't print partial lines */ + /* + * If one print_trace_line() fills entire trace_seq in one shot, + * trace_seq_to_user() will returns -EBUSY because save_len == 0, + * In this case, we need to consume it, otherwise, loop will peek + * this event next time, resulting in an infinite loop. + */ + if (save_len == 0) { + iter->seq.full = 0; + trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n"); + trace_consume(iter); + break; + } + + /* In other cases, don't print partial lines */ iter->seq.seq.len = save_len; break; } -- cgit v1.2.3 From a785736d7e587abbfd59df44e9a815d8d109c28c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 13 Dec 2022 09:56:02 -0500 Subject: tracing: Have trigger filter parsing errors show up in error_log It is annoying that the filter parsing of triggers do not show up in the error_log. Trying to figure out what is incorrect in the input is difficult when it fails for a typo. Have the errors of filter parsing show up in error_log as well. Link: https://lore.kernel.org/linux-trace-kernel/20221213095602.083fa9fd@gandalf.local.home Cc: Masami Hiramatsu Cc: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_trigger.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 918730d74932..19ce9d22bfd7 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1067,7 +1067,14 @@ int set_trigger_filter(char *filter_str, /* The filter is for the 'trigger' event, not the triggered event */ ret = create_event_filter(file->tr, file->event_call, - filter_str, false, &filter); + filter_str, true, &filter); + + /* Only enabled set_str for error handling */ + if (filter) { + kfree(filter->filter_string); + filter->filter_string = NULL; + } + /* * If create_event_filter() fails, filter still needs to be freed. * Which the calling code will do with data->filter. -- cgit v1.2.3 From fb9f5ee9bf5cf7ebc8731a7033e57d98832dc650 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 13 Dec 2022 17:24:29 -0500 Subject: tracing: Do not synchronize freeing of trigger filter on boot up If a trigger filter on the kernel command line fails to apply (due to syntax error), it will be freed. The freeing will call tracepoint_synchronize_unregister(), but this is not needed during early boot up, and will even trigger a lockdep splat. Avoid calling the synchronization function when system_state is SYSTEM_BOOTING. Link: https://lore.kernel.org/linux-trace-kernel/20221213172429.7774f4ba@gandalf.local.home Cc: Andrew Morton Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_trigger.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 19ce9d22bfd7..e535959939d3 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1085,8 +1085,14 @@ int set_trigger_filter(char *filter_str, rcu_assign_pointer(data->filter, filter); if (tmp) { - /* Make sure the call is done with the filter */ - tracepoint_synchronize_unregister(); + /* + * Make sure the call is done with the filter. + * It is possible that a filter could fail at boot up, + * and then this path will be called. Avoid the synchronization + * in that case. + */ + if (system_state != SYSTEM_BOOTING) + tracepoint_synchronize_unregister(); free_event_filter(tmp); } -- cgit v1.2.3 From d0b24b4e91fcb8408c4979888547f86be514e337 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Tue, 15 Nov 2022 17:48:47 -0300 Subject: ftrace: Prevent RCU stall on PREEMPT_VOLUNTARY kernels The function match_records() may take a while due to a large number of string comparisons, so when in PREEMPT_VOLUNTARY kernels we could face RCU stalls due to that. Add a cond_resched() to prevent that. Link: https://lkml.kernel.org/r/20221115204847.593616-1-gpiccoli@igalia.com Cc: Mark Rutland Suggested-by: Steven Rostedt Acked-by: Paul E. McKenney # from RCU CPU stall warning perspective Signed-off-by: Guilherme G. Piccoli Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d04552c0c275..b8e374a372e5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4204,6 +4204,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) } found = 1; } + cond_resched(); } while_for_each_ftrace_rec(); out_unlock: mutex_unlock(&ftrace_lock); -- cgit v1.2.3 From ea47666ca4288becf82c51eb3da6d3ff891470a4 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Fri, 19 Aug 2022 19:17:26 -0300 Subject: tracing: Improve panic/die notifiers Currently the tracing dump_on_oops feature is implemented through separate notifiers, one for die/oops and the other for panic; given they have the same functionality, let's unify them. Also improve the function comment and change the priority of the notifier to make it execute earlier, avoiding showing useless trace data (like the callback names for the other notifiers); finally, we also removed an unnecessary header inclusion. Link: https://lkml.kernel.org/r/20220819221731.480795-7-gpiccoli@igalia.com Cc: Petr Mladek Cc: Sergei Shtylyov Signed-off-by: Guilherme G. Piccoli Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 55 ++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 548890c7c0f5..6d7ef130f57e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -9876,40 +9875,40 @@ static __init int tracer_init_tracefs(void) fs_initcall(tracer_init_tracefs); -static int trace_panic_handler(struct notifier_block *this, - unsigned long event, void *unused) -{ - if (ftrace_dump_on_oops) - ftrace_dump(ftrace_dump_on_oops); - return NOTIFY_OK; -} +static int trace_die_panic_handler(struct notifier_block *self, + unsigned long ev, void *unused); static struct notifier_block trace_panic_notifier = { - .notifier_call = trace_panic_handler, - .next = NULL, - .priority = 150 /* priority: INT_MAX >= x >= 0 */ + .notifier_call = trace_die_panic_handler, + .priority = INT_MAX - 1, }; -static int trace_die_handler(struct notifier_block *self, - unsigned long val, - void *data) -{ - switch (val) { - case DIE_OOPS: - if (ftrace_dump_on_oops) - ftrace_dump(ftrace_dump_on_oops); - break; - default: - break; - } - return NOTIFY_OK; -} - static struct notifier_block trace_die_notifier = { - .notifier_call = trace_die_handler, - .priority = 200 + .notifier_call = trace_die_panic_handler, + .priority = INT_MAX - 1, }; +/* + * The idea is to execute the following die/panic callback early, in order + * to avoid showing irrelevant information in the trace (like other panic + * notifier functions); we are the 2nd to run, after hung_task/rcu_stall + * warnings get disabled (to prevent potential log flooding). + */ +static int trace_die_panic_handler(struct notifier_block *self, + unsigned long ev, void *unused) +{ + if (!ftrace_dump_on_oops) + return NOTIFY_DONE; + + /* The die notifier requires DIE_OOPS to trigger */ + if (self == &trace_die_notifier && ev != DIE_OOPS) + return NOTIFY_DONE; + + ftrace_dump(ftrace_dump_on_oops); + + return NOTIFY_DONE; +} + /* * printk is set to max of 1024, we really don't need it that big. * Nothing should be printing 1000 characters anyway. -- cgit v1.2.3