Diffstat (limited to 'kernel/trace')
34 files changed, 650 insertions, 347 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2f8d9cc42c40..d5a19413d4f8 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -31,6 +31,15 @@ config HAVE_DYNAMIC_FTRACE_WITH_REGS config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS bool +config HAVE_DYNAMIC_FTRACE_WITH_ARGS + bool + help + If this is set, then arguments and stack can be found from + the pt_regs passed into the function callback regs parameter + by default, even without setting the REGS flag in the ftrace_ops. + This allows for use of regs_get_kernel_argument() and + kernel_stack_pointer(). + config HAVE_FTRACE_MCOUNT_RECORD bool help @@ -725,6 +734,45 @@ config TRACE_EVAL_MAP_FILE If unsure, say N. +config FTRACE_RECORD_RECURSION + bool "Record functions that recurse in function tracing" + depends on FUNCTION_TRACER + help + All callbacks that attach to the function tracing have some sort + of protection against recursion. Even though the protection exists, + it adds overhead. This option will create a file in the tracefs + file system called "recursed_functions" that will list the functions + that triggered a recursion. + + This will add more overhead to cases that have recursion. + + If unsure, say N + +config FTRACE_RECORD_RECURSION_SIZE + int "Max number of recursed functions to record" + default 128 + depends on FTRACE_RECORD_RECURSION + help + This defines the limit of number of functions that can be + listed in the "recursed_functions" file, that lists all + the functions that caused a recursion to happen. + This file can be reset, but the limit can not change in + size at runtime. + +config RING_BUFFER_RECORD_RECURSION + bool "Record functions that recurse in the ring buffer" + depends on FTRACE_RECORD_RECURSION + # default y, because it is coupled with FTRACE_RECORD_RECURSION + default y + help + The ring buffer has its own internal recursion. Although when + recursion happens it wont cause harm because of the protection, + but it does cause an unwanted overhead. Enabling this option will + place where recursion was detected into the ftrace "recursed_functions" + file. + + This will add more overhead to cases that have recursion. + config GCOV_PROFILE_FTRACE bool "Enable GCOV profiling on ftrace subsystem" depends on GCOV_KERNEL @@ -795,6 +843,26 @@ config RING_BUFFER_STARTUP_TEST If unsure, say N +config RING_BUFFER_VALIDATE_TIME_DELTAS + bool "Verify ring buffer time stamp deltas" + depends on RING_BUFFER + help + This will audit the time stamps on the ring buffer sub + buffer to make sure that all the time deltas for the + events on a sub buffer matches the current time stamp. + This audit is performed for every event that is not + interrupted, or interrupting another event. A check + is also made when traversing sub buffers to make sure + that all the deltas on the previous sub buffer do not + add up to be greater than the current time stamp. + + NOTE: This adds significant overhead to recording of events, + and should only be used to test the logic of the ring buffer. + Do not use it on production systems. + + Only say Y if you understand what this does, and you + still want it enabled. 
Otherwise say N + config MMIOTRACE_TEST tristate "Test module for mmiotrace" depends on MMIOTRACE && m diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index e153be351548..7e44cea89fdc 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -92,6 +92,7 @@ obj-$(CONFIG_DYNAMIC_EVENTS) += trace_dynevent.o obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o +obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 456fe4ce6942..fb0fe4c66b84 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1279,7 +1279,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, * ones now use the 64bit ino as the whole ID and * no longer use generation. * - * Regarldess of the content, always output + * Regardless of the content, always output * "LOW32,HIGH32" so that FILEID_INO32_GEN fid can * be mapped back to @id on both 64 and 32bit ino * setups. See __kernfs_fh_to_dentry(). @@ -1321,7 +1321,7 @@ static void blk_log_dump_pdu(struct trace_seq *s, i == 0 ? "" : " ", pdu_buf[i]); /* - * stop when the rest is just zeroes and indicate so + * stop when the rest is just zeros and indicate so * with a ".." appended */ if (i == end && end != pdu_len - 1) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ebadaa83502c..6c0018abe68a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -116,7 +116,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock * to all call sites, we did a bpf_prog_array_valid() there to check * whether call->prog_array is empty or not, which is - * a heurisitc to speed up execution. + * a heuristic to speed up execution. 
* * If bpf_prog_array_valid() fetched prog_array was * non-NULL, we go into trace_call_bpf() and do the actual diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 5658f13037b3..73edb9e4f354 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -334,8 +334,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, static struct ftrace_ops graph_ops = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | - FTRACE_OPS_FL_INITIALIZED | + .flags = FTRACE_OPS_FL_INITIALIZED | FTRACE_OPS_FL_PID | FTRACE_OPS_FL_STUB, #ifdef FTRACE_GRAPH_TRAMP_ADDR diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9c1bba8cc51b..4d8e35575549 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -80,7 +80,7 @@ enum { struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, + .flags = FTRACE_OPS_FL_STUB, INIT_OPS_HASH(ftrace_list_end) }; @@ -121,7 +121,7 @@ struct ftrace_ops global_ops; #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs); + struct ftrace_ops *op, struct ftrace_regs *fregs); #else /* See comment below, where ftrace_ops_list_func is defined */ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); @@ -140,7 +140,7 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops) } static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; int pid; @@ -154,7 +154,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, return; } - op->saved_func(ip, parent_ip, op, regs); + op->saved_func(ip, parent_ip, op, fregs); } static void ftrace_sync_ipi(void *data) @@ -754,7 +754,7 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) static void function_profile_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct ftrace_profile_stat *stat; struct ftrace_profile *rec; @@ -866,7 +866,7 @@ static void unregister_ftrace_profiler(void) #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, + .flags = FTRACE_OPS_FL_INITIALIZED, INIT_OPS_HASH(ftrace_profile_ops) }; @@ -1040,8 +1040,7 @@ struct ftrace_ops global_ops = { .local_hash.notrace_hash = EMPTY_HASH, .local_hash.filter_hash = EMPTY_HASH, INIT_OPS_HASH(global_ops) - .flags = FTRACE_OPS_FL_RECURSION_SAFE | - FTRACE_OPS_FL_INITIALIZED | + .flags = FTRACE_OPS_FL_INITIALIZED | FTRACE_OPS_FL_PID, }; @@ -2146,6 +2145,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) else rec->flags &= ~FTRACE_FL_TRAMP_EN; } + if (flag & FTRACE_FL_DIRECT) { /* * If there's only one user (direct_ops helper) @@ -2389,8 +2389,9 @@ unsigned long ftrace_find_rec_direct(unsigned long ip) } static void call_direct_funcs(unsigned long ip, unsigned long pip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { + struct pt_regs *regs = ftrace_get_regs(fregs); unsigned long addr; addr = ftrace_find_rec_direct(ip); @@ -2402,7 +2403,7 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip, struct ftrace_ops direct_ops = { .func = 
call_direct_funcs, - .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE + .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_PERMANENT, /* @@ -4183,7 +4184,6 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, struct ftrace_hash **orig_hash, *new_hash; LIST_HEAD(process_mods); char *func; - int ret; mutex_lock(&ops->func_hash->regex_lock); @@ -4236,7 +4236,7 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, mutex_lock(&ftrace_lock); - ret = ftrace_hash_move_and_update_ops(ops, orig_hash, + ftrace_hash_move_and_update_ops(ops, orig_hash, new_hash, enable); mutex_unlock(&ftrace_lock); @@ -4314,7 +4314,7 @@ static int __init ftrace_mod_cmd_init(void) core_initcall(ftrace_mod_cmd_init); static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct ftrace_probe_ops *probe_ops; struct ftrace_func_probe *probe; @@ -5588,7 +5588,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file) struct ftrace_hash **orig_hash; struct trace_parser *parser; int filter_hash; - int ret; if (file->f_mode & FMODE_READ) { iter = m->private; @@ -5616,7 +5615,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file) orig_hash = &iter->ops->func_hash->notrace_hash; mutex_lock(&ftrace_lock); - ret = ftrace_hash_move_and_update_ops(iter->ops, orig_hash, + ftrace_hash_move_and_update_ops(iter->ops, orig_hash, iter->hash, filter_hash); mutex_unlock(&ftrace_lock); } else { @@ -6884,8 +6883,7 @@ void ftrace_init_trace_array(struct trace_array *tr) struct ftrace_ops global_ops = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | - FTRACE_OPS_FL_INITIALIZED | + .flags = FTRACE_OPS_FL_INITIALIZED | FTRACE_OPS_FL_PID, }; @@ -6935,12 +6933,13 @@ void ftrace_reset_array_ops(struct trace_array *tr) static nokprobe_inline void __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ignored, struct pt_regs *regs) + struct ftrace_ops *ignored, struct ftrace_regs *fregs) { + struct pt_regs *regs = ftrace_get_regs(fregs); struct ftrace_ops *op; int bit; - bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; @@ -6969,7 +6968,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, pr_warn("op=%p %pS\n", op, op); goto out; } - op->func(ip, parent_ip, op, regs); + op->func(ip, parent_ip, op, fregs); } } while_for_each_ftrace_op(op); out: @@ -6992,9 +6991,9 @@ out: */ #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { - __ftrace_ops_list_func(ip, parent_ip, NULL, regs); + __ftrace_ops_list_func(ip, parent_ip, NULL, fregs); } NOKPROBE_SYMBOL(ftrace_ops_list_func); #else @@ -7011,18 +7010,18 @@ NOKPROBE_SYMBOL(ftrace_ops_no_ops); * this function will be called by the mcount trampoline. 
*/ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { int bit; - bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; preempt_disable_notrace(); if (!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) - op->func(ip, parent_ip, op, regs); + op->func(ip, parent_ip, op, fregs); preempt_enable_notrace(); trace_clear_recursion(bit); @@ -7043,11 +7042,11 @@ NOKPROBE_SYMBOL(ftrace_ops_assist_func); ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) { /* - * If the function does not handle recursion, needs to be RCU safe, - * or does per cpu logic, then we need to call the assist handler. + * If the function does not handle recursion or needs to be RCU safe, + * then we need to call the assist handler. */ - if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) || - ops->flags & FTRACE_OPS_FL_RCU) + if (ops->flags & (FTRACE_OPS_FL_RECURSION | + FTRACE_OPS_FL_RCU)) return ftrace_ops_assist_func; return ops->func; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a6268e09160a..ec08f948dd80 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,6 +4,7 @@ * * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> */ +#include <linux/trace_recursion.h> #include <linux/trace_events.h> #include <linux/ring_buffer.h> #include <linux/trace_clock.h> @@ -129,7 +130,16 @@ int ring_buffer_print_entry_header(struct trace_seq *s) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) #define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ -#define RB_ALIGN_DATA __aligned(RB_ALIGNMENT) + +#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS +# define RB_FORCE_8BYTE_ALIGNMENT 0 +# define RB_ARCH_ALIGNMENT RB_ALIGNMENT +#else +# define RB_FORCE_8BYTE_ALIGNMENT 1 +# define RB_ARCH_ALIGNMENT 8U +#endif + +#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... 
RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -1422,7 +1432,8 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) return 0; } -static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) +static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, + long nr_pages, struct list_head *pages) { struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; @@ -1462,13 +1473,15 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) struct page *page; bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - mflags, cpu_to_node(cpu)); + mflags, cpu_to_node(cpu_buffer->cpu)); if (!bpage) goto free_pages; + rb_check_bpage(cpu_buffer, bpage); + list_add(&bpage->list, pages); - page = alloc_pages_node(cpu_to_node(cpu), mflags, 0); + page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0); if (!page) goto free_pages; bpage->page = page_address(page); @@ -1500,7 +1513,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, WARN_ON(!nr_pages); - if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu)) + if (__rb_allocate_pages(cpu_buffer, nr_pages, &pages)) return -ENOMEM; /* @@ -1973,8 +1986,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, if (nr_pages < 2) nr_pages = 2; - size = nr_pages * BUF_PAGE_SIZE; - /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); @@ -2009,8 +2020,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, * allocated without receiving ENOMEM */ INIT_LIST_HEAD(&cpu_buffer->new_pages); - if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update, - &cpu_buffer->new_pages, cpu)) { + if (__rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update, + &cpu_buffer->new_pages)) { /* not enough memory for new pages */ err = -ENOMEM; goto out_err; @@ -2075,8 +2086,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, INIT_LIST_HEAD(&cpu_buffer->new_pages); if (cpu_buffer->nr_pages_to_update > 0 && - __rb_allocate_pages(cpu_buffer->nr_pages_to_update, - &cpu_buffer->new_pages, cpu_id)) { + __rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update, + &cpu_buffer->new_pages)) { err = -ENOMEM; goto out_err; } @@ -2628,9 +2639,6 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) return skip_time_extend(event); } -static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event); - #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK static inline bool sched_clock_stable(void) { @@ -2719,7 +2727,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, event->time_delta = delta; length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA) { + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { event->type_len = 0; event->array[0] = length; } else @@ -2734,11 +2742,11 @@ static unsigned rb_calculate_event_length(unsigned length) if (!length) length++; - if (length > RB_MAX_SMALL_DATA) + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) length += sizeof(event.array[0]); length += RB_EVNT_HDR_SIZE; - length = ALIGN(length, RB_ALIGNMENT); + length = ALIGN(length, RB_ARCH_ALIGNMENT); /* * In case the time delta is larger than the 27 bits for it @@ -2758,20 +2766,6 @@ static unsigned rb_calculate_event_length(unsigned length) return length; } -static __always_inline bool -rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long addr = (unsigned 
long)event; - unsigned long index; - - index = rb_event_index(event); - addr &= PAGE_MASK; - - return cpu_buffer->commit_page->page == (void *)addr && - rb_commit_index(cpu_buffer) == index; -} - static u64 rb_time_delta(struct ring_buffer_event *event) { switch (event->type_len) { @@ -3006,6 +3000,13 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) irq_work_queue(&cpu_buffer->irq_work.work); } +#ifdef CONFIG_RING_BUFFER_RECORD_RECURSION +# define do_ring_buffer_record_recursion() \ + do_ftrace_record_recursion(_THIS_IP_, _RET_IP_) +#else +# define do_ring_buffer_record_recursion() do { } while (0) +#endif + /* * The lock and unlock are done within a preempt disable section. * The current_context per_cpu variable can only be modified @@ -3088,8 +3089,10 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) * been updated yet. In this case, use the TRANSITION bit. */ bit = RB_CTX_TRANSITION; - if (val & (1 << (bit + cpu_buffer->nest))) + if (val & (1 << (bit + cpu_buffer->nest))) { + do_ring_buffer_record_recursion(); return 1; + } } val |= (1 << (bit + cpu_buffer->nest)); @@ -3183,6 +3186,153 @@ int ring_buffer_unlock_commit(struct trace_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); +/* Special value to validate all deltas on a page. */ +#define CHECK_FULL_PAGE 1L + +#ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS +static void dump_buffer_page(struct buffer_data_page *bpage, + struct rb_event_info *info, + unsigned long tail) +{ + struct ring_buffer_event *event; + u64 ts, delta; + int e; + + ts = bpage->time_stamp; + pr_warn(" [%lld] PAGE TIME STAMP\n", ts); + + for (e = 0; e < tail; e += rb_event_length(event)) { + + event = (struct ring_buffer_event *)(bpage->data + e); + + switch (event->type_len) { + + case RINGBUF_TYPE_TIME_EXTEND: + delta = ring_buffer_event_time_stamp(event); + ts += delta; + pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta); + break; + + case RINGBUF_TYPE_TIME_STAMP: + delta = ring_buffer_event_time_stamp(event); + ts = delta; + pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta); + break; + + case RINGBUF_TYPE_PADDING: + ts += event->time_delta; + pr_warn(" [%lld] delta:%d PADDING\n", ts, event->time_delta); + break; + + case RINGBUF_TYPE_DATA: + ts += event->time_delta; + pr_warn(" [%lld] delta:%d\n", ts, event->time_delta); + break; + + default: + break; + } + } +} + +static DEFINE_PER_CPU(atomic_t, checking); +static atomic_t ts_dump; + +/* + * Check if the current event time stamp matches the deltas on + * the buffer page. + */ +static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info, + unsigned long tail) +{ + struct ring_buffer_event *event; + struct buffer_data_page *bpage; + u64 ts, delta; + bool full = false; + int e; + + bpage = info->tail_page->page; + + if (tail == CHECK_FULL_PAGE) { + full = true; + tail = local_read(&bpage->commit); + } else if (info->add_timestamp & + (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)) { + /* Ignore events with absolute time stamps */ + return; + } + + /* + * Do not check the first event (skip possible extends too). + * Also do not check if previous events have not been committed. 
+ */ + if (tail <= 8 || tail > local_read(&bpage->commit)) + return; + + /* + * If this interrupted another event, + */ + if (atomic_inc_return(this_cpu_ptr(&checking)) != 1) + goto out; + + ts = bpage->time_stamp; + + for (e = 0; e < tail; e += rb_event_length(event)) { + + event = (struct ring_buffer_event *)(bpage->data + e); + + switch (event->type_len) { + + case RINGBUF_TYPE_TIME_EXTEND: + delta = ring_buffer_event_time_stamp(event); + ts += delta; + break; + + case RINGBUF_TYPE_TIME_STAMP: + delta = ring_buffer_event_time_stamp(event); + ts = delta; + break; + + case RINGBUF_TYPE_PADDING: + if (event->time_delta == 1) + break; + /* fall through */ + case RINGBUF_TYPE_DATA: + ts += event->time_delta; + break; + + default: + RB_WARN_ON(cpu_buffer, 1); + } + } + if ((full && ts > info->ts) || + (!full && ts + info->delta != info->ts)) { + /* If another report is happening, ignore this one */ + if (atomic_inc_return(&ts_dump) != 1) { + atomic_dec(&ts_dump); + goto out; + } + atomic_inc(&cpu_buffer->record_disabled); + pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld after:%lld\n", + cpu_buffer->cpu, + ts + info->delta, info->ts, info->delta, info->after); + dump_buffer_page(bpage, info, tail); + atomic_dec(&ts_dump); + /* Do not re-enable checking */ + return; + } +out: + atomic_dec(this_cpu_ptr(&checking)); +} +#else +static inline void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info, + unsigned long tail) +{ +} +#endif /* CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS */ + static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) @@ -3240,6 +3390,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (a_ok && b_ok && info->before != info->after) (void)rb_time_cmpxchg(&cpu_buffer->before_stamp, info->before, info->after); + if (a_ok && b_ok) + check_buffer(cpu_buffer, info, CHECK_FULL_PAGE); return rb_move_tail(cpu_buffer, tail, info); } @@ -3257,9 +3409,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* This did not interrupt any time update */ info->delta = info->ts - info->after; else - /* Just use full timestamp for inerrupting event */ + /* Just use full timestamp for interrupting event */ info->delta = info->ts; barrier(); + check_buffer(cpu_buffer, info, tail); if (unlikely(info->ts != save_before)) { /* SLOW PATH - Interrupted between C and E */ @@ -3293,7 +3446,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, info->ts = ts; } else { /* - * Interrupted beween C and E: + * Interrupted between C and E: * Lost the previous events time stamp. Just set the * delta to zero, and this will be the same time as * the event this event interrupted. 
And the events that @@ -3500,7 +3653,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, } /** - * ring_buffer_commit_discard - discard an event that has not been committed + * ring_buffer_discard_commit - discard an event that has not been committed * @buffer: the ring buffer * @event: non committed event to discard * diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index edd912cd14aa..a4b4bbf8c3bf 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -307,7 +307,7 @@ static int __init test_create_synth_event(void) return ret; delete: /* We got an error after creating the event, delete it */ - ret = synth_event_delete("create_synth_test"); + synth_event_delete("create_synth_test"); goto out; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 06134189e9a7..b8a2d786b503 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -68,10 +68,21 @@ bool ring_buffer_expanded; static bool __read_mostly tracing_selftest_running; /* - * If a tracer is running, we do not want to run SELFTEST. + * If boot-time tracing including tracers/events via kernel cmdline + * is running, we do not want to run SELFTEST. */ bool __read_mostly tracing_selftest_disabled; +#ifdef CONFIG_FTRACE_STARTUP_TEST +void __init disable_tracing_selftest(const char *reason) +{ + if (!tracing_selftest_disabled) { + tracing_selftest_disabled = true; + pr_info("Ftrace startup test is disabled due to %s\n", reason); + } +} +#endif + /* Pipe tracepoints to printk */ struct trace_iterator *tracepoint_print_iter; int tracepoint_printk; @@ -2113,11 +2124,7 @@ int __init register_tracer(struct tracer *type) apply_trace_boot_options(); /* disable other selftests, since this will break it. */ - tracing_selftest_disabled = true; -#ifdef CONFIG_FTRACE_STARTUP_TEST - printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n", - type->name); -#endif + disable_tracing_selftest("running a tracer"); out_unlock: return ret; @@ -3121,7 +3128,7 @@ struct trace_buffer_struct { static struct trace_buffer_struct *trace_percpu_buffer; /* - * Thise allows for lockless recording. If we're nested too deeply, then + * This allows for lockless recording. If we're nested too deeply, then * this returns NULL. 
*/ static char *get_trace_buf(void) @@ -9062,7 +9069,10 @@ int tracing_init_dentry(void) extern struct trace_eval_map *__start_ftrace_eval_maps[]; extern struct trace_eval_map *__stop_ftrace_eval_maps[]; -static void __init trace_eval_init(void) +static struct workqueue_struct *eval_map_wq __initdata; +static struct work_struct eval_map_work __initdata; + +static void __init eval_map_work_func(struct work_struct *work) { int len; @@ -9070,6 +9080,33 @@ static void __init trace_eval_init(void) trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len); } +static int __init trace_eval_init(void) +{ + INIT_WORK(&eval_map_work, eval_map_work_func); + + eval_map_wq = alloc_workqueue("eval_map_wq", WQ_UNBOUND, 0); + if (!eval_map_wq) { + pr_err("Unable to allocate eval_map_wq\n"); + /* Do work here */ + eval_map_work_func(&eval_map_work); + return -ENOMEM; + } + + queue_work(eval_map_wq, &eval_map_work); + return 0; +} + +static int __init trace_eval_sync(void) +{ + /* Make sure the eval map updates are finished */ + if (eval_map_wq) + destroy_workqueue(eval_map_wq); + return 0; +} + +late_initcall_sync(trace_eval_sync); + + #ifdef CONFIG_MODULES static void trace_module_add_evals(struct module *mod) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1dadef445cd1..e448d2da0b99 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -558,183 +558,6 @@ struct tracer { bool noboot; }; - -/* Only current can touch trace_recursion */ - -/* - * For function tracing recursion: - * The order of these bits are important. - * - * When function tracing occurs, the following steps are made: - * If arch does not support a ftrace feature: - * call internal function (uses INTERNAL bits) which calls... - * If callback is registered to the "global" list, the list - * function is called and recursion checks the GLOBAL bits. - * then this function calls... - * The function callback, which can use the FTRACE bits to - * check for recursion. - * - * Now if the arch does not support a feature, and it calls - * the global list function which calls the ftrace callback - * all three of these steps will do a recursion protection. - * There's no reason to do one if the previous caller already - * did. The recursion that we are protecting against will - * go through the same steps again. - * - * To prevent the multiple recursion checks, if a recursion - * bit is set that is higher than the MAX bit of the current - * check, then we know that the check was made by the previous - * caller, and we can skip the current check. - */ -enum { - /* Function recursion bits */ - TRACE_FTRACE_BIT, - TRACE_FTRACE_NMI_BIT, - TRACE_FTRACE_IRQ_BIT, - TRACE_FTRACE_SIRQ_BIT, - - /* INTERNAL_BITs must be greater than FTRACE_BITs */ - TRACE_INTERNAL_BIT, - TRACE_INTERNAL_NMI_BIT, - TRACE_INTERNAL_IRQ_BIT, - TRACE_INTERNAL_SIRQ_BIT, - - TRACE_BRANCH_BIT, -/* - * Abuse of the trace_recursion. - * As we need a way to maintain state if we are tracing the function - * graph in irq because we want to trace a particular function that - * was called in irq context but we have irq tracing off. Since this - * can only be modified by current, we can reuse trace_recursion. - */ - TRACE_IRQ_BIT, - - /* Set if the function is in the set_graph_function file */ - TRACE_GRAPH_BIT, - - /* - * In the very unlikely case that an interrupt came in - * at a start of graph tracing, and we want to trace - * the function in that interrupt, the depth can be greater - * than zero, because of the preempted start of a previous - * trace. 
In an even more unlikely case, depth could be 2 - * if a softirq interrupted the start of graph tracing, - * followed by an interrupt preempting a start of graph - * tracing in the softirq, and depth can even be 3 - * if an NMI came in at the start of an interrupt function - * that preempted a softirq start of a function that - * preempted normal context!!!! Luckily, it can't be - * greater than 3, so the next two bits are a mask - * of what the depth is when we set TRACE_GRAPH_BIT - */ - - TRACE_GRAPH_DEPTH_START_BIT, - TRACE_GRAPH_DEPTH_END_BIT, - - /* - * To implement set_graph_notrace, if this bit is set, we ignore - * function graph tracing of called functions, until the return - * function is called to clear it. - */ - TRACE_GRAPH_NOTRACE_BIT, - - /* - * When transitioning between context, the preempt_count() may - * not be correct. Allow for a single recursion to cover this case. - */ - TRACE_TRANSITION_BIT, -}; - -#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) -#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) -#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) - -#define trace_recursion_depth() \ - (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3) -#define trace_recursion_set_depth(depth) \ - do { \ - current->trace_recursion &= \ - ~(3 << TRACE_GRAPH_DEPTH_START_BIT); \ - current->trace_recursion |= \ - ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT; \ - } while (0) - -#define TRACE_CONTEXT_BITS 4 - -#define TRACE_FTRACE_START TRACE_FTRACE_BIT -#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) - -#define TRACE_LIST_START TRACE_INTERNAL_BIT -#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) - -#define TRACE_CONTEXT_MASK TRACE_LIST_MAX - -static __always_inline int trace_get_context_bit(void) -{ - int bit; - - if (in_interrupt()) { - if (in_nmi()) - bit = 0; - - else if (in_irq()) - bit = 1; - else - bit = 2; - } else - bit = 3; - - return bit; -} - -static __always_inline int trace_test_and_set_recursion(int start, int max) -{ - unsigned int val = current->trace_recursion; - int bit; - - /* A previous recursion check was made */ - if ((val & TRACE_CONTEXT_MASK) > max) - return 0; - - bit = trace_get_context_bit() + start; - if (unlikely(val & (1 << bit))) { - /* - * It could be that preempt_count has not been updated during - * a switch between contexts. Allow for a single recursion. 
- */ - bit = TRACE_TRANSITION_BIT; - if (trace_recursion_test(bit)) - return -1; - trace_recursion_set(bit); - barrier(); - return bit + 1; - } - - /* Normal check passed, clear the transition to allow it again */ - trace_recursion_clear(TRACE_TRANSITION_BIT); - - val |= 1 << bit; - current->trace_recursion = val; - barrier(); - - return bit + 1; -} - -static __always_inline void trace_clear_recursion(int bit) -{ - unsigned int val = current->trace_recursion; - - if (!bit) - return; - - bit--; - bit = 1 << bit; - val &= ~bit; - - barrier(); - current->trace_recursion = val; -} - static inline struct ring_buffer_iter * trace_buffer_iter(struct trace_iterator *iter, int cpu) { @@ -896,6 +719,8 @@ extern bool ring_buffer_expanded; extern bool tracing_selftest_disabled; #ifdef CONFIG_FTRACE_STARTUP_TEST +extern void __init disable_tracing_selftest(const char *reason); + extern int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_function_graph(struct tracer *trace, @@ -919,6 +744,9 @@ extern int trace_selftest_startup_branch(struct tracer *trace, */ #define __tracer_data __refdata #else +static inline void __init disable_tracing_selftest(const char *reason) +{ +} /* Tracers are seldom changed. Optimize when selftests are disabled. */ #define __tracer_data __read_mostly #endif /* CONFIG_FTRACE_STARTUP_TEST */ diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 2e9a4746ea85..801c2a7f7605 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -31,7 +31,7 @@ static bool ok_to_run; * it simply writes "START". As the first write is cold cache and * the rest is hot, we save off that time in bm_first and it is * reported as "first", which is shown in the second write to the - * tracepoint. The "first" field is writen within the statics from + * tracepoint. The "first" field is written within the statics from * then on but never changes. */ static void trace_do_benchmark(void) @@ -112,7 +112,7 @@ static void trace_do_benchmark(void) int i = 0; /* * stddev is the square of standard deviation but - * we want the actualy number. Use the average + * we want the actually number. Use the average * as our seed to find the std. * * The next try is: @@ -155,7 +155,7 @@ static int benchmark_event_kthread(void *arg) /* * We don't go to sleep, but let others run as well. - * This is bascially a "yield()" to let any task that + * This is basically a "yield()" to let any task that * wants to run, schedule in, but if the CPU is idle, * we'll keep burning cycles. * diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index c22a152ef0b4..a82f03f385f8 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -344,6 +344,8 @@ static int __init trace_boot_init(void) trace_boot_init_one_instance(tr, trace_node); trace_boot_init_instances(trace_node); + disable_tracing_selftest("running boot-time tracing"); + return 0; } /* diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 5fa49cfd2bb6..4f967d5cd917 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -276,7 +276,7 @@ int dynevent_arg_add(struct dynevent_cmd *cmd, * arguments of the form 'type variable_name;' or 'x+y'. 
* * The lhs argument string will be appended to the current cmd string, - * followed by an operator, if applicable, followd by the rhs string, + * followed by an operator, if applicable, followed by the rhs string, * followed finally by a separator, if applicable. Before the * argument is added, the @check_arg function, if present, will be * used to check the sanity of the current arg strings. diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index d6857a254ede..d6f72dcb7269 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -29,10 +29,10 @@ struct dyn_event; * @show: Showing method. This is invoked when user reads the event definitions * via dynamic_events interface. * @is_busy: Check whether given event is busy so that it can not be deleted. - * Return true if it is busy, otherwides false. - * @free: Delete the given event. Return 0 if success, otherwides error. + * Return true if it is busy, otherwise false. + * @free: Delete the given event. Return 0 if success, otherwise error. * @match: Check whether given event and system name match this event. The argc - * and argv is used for exact match. Return true if it matches, otherwides + * and argv is used for exact match. Return true if it matches, otherwise * false. * * Except for @create, these methods are called under holding event_mutex. diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 18c4a58aff79..4547ac59da61 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -32,7 +32,7 @@ * to be deciphered for the format file. Although these macros * may become out of sync with the internal structure, they * will create a compile error if it happens. Since the - * internel structures are just tracing helpers, this is not + * internal structures are just tracing helpers, this is not * an issue. 
* * When an internal structure is used, it should use: @@ -93,10 +93,10 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, F_STRUCT( __field_struct( struct ftrace_graph_ret, ret ) __field_packed( unsigned long, ret, func ) - __field_packed( unsigned long, ret, overrun ) + __field_packed( int, ret, depth ) + __field_packed( unsigned int, ret, overrun ) __field_packed( unsigned long long, ret, calltime) __field_packed( unsigned long long, ret, rettime ) - __field_packed( int, ret, depth ) ), F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d", diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 643e0b19920d..a71181655958 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -432,17 +432,25 @@ NOKPROBE_SYMBOL(perf_trace_buf_update); #ifdef CONFIG_FUNCTION_TRACER static void perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *pt_regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct ftrace_entry *entry; struct perf_event *event; struct hlist_head head; struct pt_regs regs; int rctx; + int bit; + + if (!rcu_is_watching()) + return; if ((unsigned long)ops->private != smp_processor_id()) return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + event = container_of(ops, struct perf_event, ftrace_ops); /* @@ -463,13 +471,15 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx); if (!entry) - return; + goto out; entry->ip = ip; entry->parent_ip = parent_ip; perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, 1, &regs, &head, NULL); +out: + ftrace_test_recursion_unlock(bit); #undef ENTRY_SIZE } @@ -477,7 +487,6 @@ static int perf_ftrace_function_register(struct perf_event *event) { struct ftrace_ops *ops = &event->ftrace_ops; - ops->flags = FTRACE_OPS_FL_RCU; ops->func = perf_ftrace_function_call; ops->private = (void *)(unsigned long)nr_cpu_ids; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index adf65b502453..e9d28eeccb7e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2436,7 +2436,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) /* * Since calls are grouped by systems, the likelyhood that the * next call in the iteration belongs to the same system as the - * previous call is high. As an optimization, we skip seaching + * previous call is high. As an optimization, we skip searching * for a map[] that matches the call's system if the last call * was from the same system. That's what last_i is for. If the * call has the same system as the previous call, then last_i @@ -3201,7 +3201,7 @@ static __init int setup_trace_event(char *str) { strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); ring_buffer_expanded = true; - tracing_selftest_disabled = true; + disable_tracing_selftest("running event tracing"); return 1; } @@ -3271,7 +3271,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) * * When a new instance is created, it needs to set up its events * directory, as well as other files associated with events. It also - * creates the event hierachry in the @parent/events directory. + * creates the event hierarchy in the @parent/events directory. * * Returns 0 on success.
* @@ -3673,7 +3673,7 @@ static struct trace_event_file event_trace_file __initdata; static void __init function_test_events_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *regs) { struct trace_buffer *buffer; struct ring_buffer_event *event; @@ -3712,7 +3712,6 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops trace_ops __initdata = { .func = function_test_events_call, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static __init void event_trace_self_test_with_function(void) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 78a678eeb140..e91259f6a722 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1561,27 +1561,6 @@ static inline void event_clear_filter(struct trace_event_file *file) RCU_INIT_POINTER(file->filter, NULL); } -static inline void -event_set_no_set_filter_flag(struct trace_event_file *file) -{ - file->flags |= EVENT_FILE_FL_NO_SET_FILTER; -} - -static inline void -event_clear_no_set_filter_flag(struct trace_event_file *file) -{ - file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER; -} - -static inline bool -event_no_set_filter_flag(struct trace_event_file *file) -{ - if (file->flags & EVENT_FILE_FL_NO_SET_FILTER) - return true; - - return false; -} - struct filter_list { struct list_head list; struct event_filter *filter; @@ -1950,7 +1929,7 @@ static int __ftrace_function_set_filter(int filter, char *buf, int len, /* * The 'ip' field could have multiple filters set, separated * either by space or comma. We first cut the filter and apply - * all pieces separatelly. + * all pieces separately. */ re = ftrace_function_filter_re(buf, len, &re_cnt); if (!re) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 96c3f86b81c5..39ebe1826fc3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3355,7 +3355,7 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data, } else { field_var = NULL; /* - * If no explicit system.event is specfied, default to + * If no explicit system.event is specified, default to * looking for fields on the onmatch(system.event.xxx) * event. */ diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 881df991742a..5a8bc0b421f1 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1276,7 +1276,7 @@ static int __create_synth_event(int argc, const char *name, const char **argv) /** * synth_event_create - Create a new synthetic event - * @name: The name of the new sythetic event + * @name: The name of the new synthetic event * @fields: An array of type/name field descriptions * @n_fields: The number of field descriptions contained in the fields array * @mod: The module creating the event, NULL if not created from a module @@ -1446,7 +1446,7 @@ __synth_event_trace_init(struct trace_event_file *file, * this code to be called, etc). Because this is called * directly by the user, we don't have that but we still need * to honor not logging when disabled. For the iterated - * trace case, we save the enabed state upon start and just + * trace case, we save the enabled state upon start and just * ignore the following data calls. 
*/ if (!(file->flags & EVENT_FILE_FL_ENABLED) || diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 90f81d33fa3f..d960f6b11b5e 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -26,7 +26,7 @@ static int ftrace_event_register(struct trace_event_call *call, /* * The FTRACE_ENTRY_REG macro allows ftrace entry to define register - * function and thus become accesible via perf. + * function and thus become accessible via perf. */ #undef FTRACE_ENTRY_REG #define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 2c2126e1871d..c5095dd28e20 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -23,10 +23,10 @@ static void tracing_start_function_trace(struct trace_array *tr); static void tracing_stop_function_trace(struct trace_array *tr); static void function_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs); + struct ftrace_ops *op, struct ftrace_regs *fregs); static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs); + struct ftrace_ops *op, struct ftrace_regs *fregs); static struct tracer_flags func_flags; /* Our option */ @@ -48,7 +48,7 @@ int ftrace_allocate_ftrace_ops(struct trace_array *tr) /* Currently only the non stack version is supported */ ops->func = function_trace_call; - ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID; + ops->flags = FTRACE_OPS_FL_PID; tr->ops = ops; ops->private = tr; @@ -89,7 +89,6 @@ void ftrace_destroy_function_files(struct trace_array *tr) static int function_trace_init(struct trace_array *tr) { ftrace_func_t func; - /* * Instance trace_arrays get their ops allocated * at instance creation. 
Unless it failed @@ -129,7 +128,7 @@ static void function_trace_start(struct trace_array *tr) static void function_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; struct trace_array_cpu *data; @@ -141,22 +140,20 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, if (unlikely(!tr->function_enabled)) return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + pc = preempt_count(); preempt_disable_notrace(); - bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX); - if (bit < 0) - goto out; - cpu = smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); if (!atomic_read(&data->disabled)) { local_save_flags(flags); trace_function(tr, ip, parent_ip, flags, pc); } - trace_clear_recursion(bit); - - out: + ftrace_test_recursion_unlock(bit); preempt_enable_notrace(); } @@ -180,7 +177,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 60d66278aa0d..d874dec87131 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -957,7 +957,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, /* Overrun */ if (flags & TRACE_GRAPH_PRINT_OVERRUN) - trace_seq_printf(s, " (Overruns: %lu)\n", + trace_seq_printf(s, " (Overruns: %u)\n", trace->overrun); print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index d071fc271eef..c0df9b97f147 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -485,11 +485,11 @@ hwlat_width_write(struct file *filp, const char __user *ubuf, * @ppos: The current position in @file * * This function provides a write implementation for the "window" interface - * to the hardware latency detetector. The window is the total time + * to the hardware latency detector. The window is the total time * in us that will be considered one sample period. Conceptually, windows * occur back-to-back and contain a sample width period during which * actual sampling occurs. Can be used to write a new total window size. It - * is enfoced that any value written must be greater than the sample width + * is enforced that any value written must be greater than the sample width * size, or an error results. 
*/ static ssize_t diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 10bbb0f381d5..d06aab4dcbb8 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -138,7 +138,7 @@ static int func_prolog_dec(struct trace_array *tr, */ static void irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 97c7a7782db7..9c31f42245e9 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -25,11 +25,12 @@ /* Kprobe early definition from command line */ static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata; -static bool kprobe_boot_events_enabled __initdata; static int __init set_kprobe_boot_events(char *str) { strlcpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE); + disable_tracing_selftest("running kprobe events"); + return 0; } __setup("kprobe_event=", set_kprobe_boot_events); @@ -1888,8 +1889,6 @@ static __init void setup_boot_kprobe_events(void) ret = trace_run_command(cmd, create_or_delete_trace_kprobe); if (ret) pr_warn("Failed to add event(%d): %s\n", ret, cmd); - else - kprobe_boot_events_enabled = true; cmd = p; } @@ -1974,10 +1973,8 @@ static __init int kprobe_trace_self_tests_init(void) if (tracing_is_disabled()) return -ENODEV; - if (kprobe_boot_events_enabled) { - pr_info("Skipping kprobe tests due to kprobe_event on cmdline\n"); + if (tracing_selftest_disabled) return 0; - } target = kprobe_trace_selftest_target; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 000e9dc224c6..92b1575ae0ca 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -353,8 +353,8 @@ static inline const char *kretprobed(const char *name) } #endif /* CONFIG_KRETPROBES */ -static void -seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) +void +trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) { #ifdef CONFIG_KALLSYMS char str[KSYM_SYMBOL_LEN]; @@ -420,7 +420,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) goto out; } - seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET); + trace_seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET); if (sym_flags & TRACE_ITER_SYM_ADDR) trace_seq_printf(s, " <" IP_FMT ">", ip); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 2f742b74e7e6..4c954636caf0 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -16,6 +16,7 @@ extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); +extern void trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset); extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator *iter); diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c new file mode 100644 index 000000000000..b2edac1fe156 --- /dev/null +++ b/kernel/trace/trace_recursion_record.c @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/seq_file.h> +#include <linux/kallsyms.h> +#include <linux/module.h> +#include <linux/ftrace.h> +#include <linux/fs.h> + +#include "trace_output.h" + +struct recursed_functions { + unsigned long ip; + unsigned long parent_ip; +}; + +static struct recursed_functions 
recursed_functions[CONFIG_FTRACE_RECORD_RECURSION_SIZE]; +static atomic_t nr_records; + +/* + * Cache the last found function. Yes, updates to this is racey, but + * so is memory cache ;-) + */ +static unsigned long cached_function; + +void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip) +{ + int index = 0; + int i; + unsigned long old; + + again: + /* First check the last one recorded */ + if (ip == cached_function) + return; + + i = atomic_read(&nr_records); + /* nr_records is -1 when clearing records */ + smp_mb__after_atomic(); + if (i < 0) + return; + + /* + * If there's two writers and this writer comes in second, + * the cmpxchg() below to update the ip will fail. Then this + * writer will try again. It is possible that index will now + * be greater than nr_records. This is because the writer + * that succeeded has not updated the nr_records yet. + * This writer could keep trying again until the other writer + * updates nr_records. But if the other writer takes an + * interrupt, and that interrupt locks up that CPU, we do + * not want this CPU to lock up due to the recursion protection, + * and have a bug report showing this CPU as the cause of + * locking up the computer. To not lose this record, this + * writer will simply use the next position to update the + * recursed_functions, and it will update the nr_records + * accordingly. + */ + if (index < i) + index = i; + if (index >= CONFIG_FTRACE_RECORD_RECURSION_SIZE) + return; + + for (i = index - 1; i >= 0; i--) { + if (recursed_functions[i].ip == ip) { + cached_function = ip; + return; + } + } + + cached_function = ip; + + /* + * We only want to add a function if it hasn't been added before. + * Add to the current location before incrementing the count. + * If it fails to add, then increment the index (save in i) + * and try again. + */ + old = cmpxchg(&recursed_functions[index].ip, 0, ip); + if (old != 0) { + /* Did something else already added this for us? */ + if (old == ip) + return; + /* Try the next location (use i for the next index) */ + index++; + goto again; + } + + recursed_functions[index].parent_ip = parent_ip; + + /* + * It's still possible that we could race with the clearing + * CPU0 CPU1 + * ---- ---- + * ip = func + * nr_records = -1; + * recursed_functions[0] = 0; + * i = -1 + * if (i < 0) + * nr_records = 0; + * (new recursion detected) + * recursed_functions[0] = func + * cmpxchg(recursed_functions[0], + * func, 0) + * + * But the worse that could happen is that we get a zero in + * the recursed_functions array, and it's likely that "func" will + * be recorded again. + */ + i = atomic_read(&nr_records); + smp_mb__after_atomic(); + if (i < 0) + cmpxchg(&recursed_functions[index].ip, ip, 0); + else if (i <= index) + atomic_cmpxchg(&nr_records, i, index + 1); +} +EXPORT_SYMBOL_GPL(ftrace_record_recursion); + +static DEFINE_MUTEX(recursed_function_lock); +static struct trace_seq *tseq; + +static void *recursed_function_seq_start(struct seq_file *m, loff_t *pos) +{ + void *ret = NULL; + int index; + + mutex_lock(&recursed_function_lock); + index = atomic_read(&nr_records); + if (*pos < index) { + ret = &recursed_functions[*pos]; + } + + tseq = kzalloc(sizeof(*tseq), GFP_KERNEL); + if (!tseq) + return ERR_PTR(-ENOMEM); + + trace_seq_init(tseq); + + return ret; +} + +static void *recursed_function_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + int index; + int p; + + index = atomic_read(&nr_records); + p = ++(*pos); + + return p < index ? 
&recursed_functions[p] : NULL; +} + +static void recursed_function_seq_stop(struct seq_file *m, void *v) +{ + kfree(tseq); + mutex_unlock(&recursed_function_lock); +} + +static int recursed_function_seq_show(struct seq_file *m, void *v) +{ + struct recursed_functions *record = v; + int ret = 0; + + if (record) { + trace_seq_print_sym(tseq, record->parent_ip, true); + trace_seq_puts(tseq, ":\t"); + trace_seq_print_sym(tseq, record->ip, true); + trace_seq_putc(tseq, '\n'); + ret = trace_print_seq(m, tseq); + } + + return ret; +} + +static const struct seq_operations recursed_function_seq_ops = { + .start = recursed_function_seq_start, + .next = recursed_function_seq_next, + .stop = recursed_function_seq_stop, + .show = recursed_function_seq_show +}; + +static int recursed_function_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + mutex_lock(&recursed_function_lock); + /* If this file was opened for write, then erase contents */ + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + /* disable updating records */ + atomic_set(&nr_records, -1); + smp_mb__after_atomic(); + memset(recursed_functions, 0, sizeof(recursed_functions)); + smp_wmb(); + /* enable them again */ + atomic_set(&nr_records, 0); + } + if (file->f_mode & FMODE_READ) + ret = seq_open(file, &recursed_function_seq_ops); + mutex_unlock(&recursed_function_lock); + + return ret; +} + +static ssize_t recursed_function_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + return count; +} + +static int recursed_function_release(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + return 0; +} + +static const struct file_operations recursed_functions_fops = { + .open = recursed_function_open, + .write = recursed_function_write, + .read = seq_read, + .llseek = seq_lseek, + .release = recursed_function_release, +}; + +__init static int create_recursed_functions(void) +{ + struct dentry *dentry; + + dentry = trace_create_file("recursed_functions", 0644, NULL, NULL, + &recursed_functions_fops); + if (!dentry) + pr_warn("WARNING: Failed to create recursed_functions\n"); + return 0; +} + +fs_initcall(create_recursed_functions); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 97b10bb31a1f..c0181066dbe9 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -212,7 +212,7 @@ static void wakeup_print_header(struct seq_file *s) */ static void wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 4738ad48a667..73ef12092250 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -107,7 +107,7 @@ static int trace_selftest_test_probe1_cnt; static void trace_selftest_test_probe1_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_probe1_cnt++; } @@ -116,7 +116,7 @@ static int trace_selftest_test_probe2_cnt; static void trace_selftest_test_probe2_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_probe2_cnt++; } @@ -125,7 +125,7 @@ static int trace_selftest_test_probe3_cnt; static void 
trace_selftest_test_probe3_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_probe3_cnt++; } @@ -134,7 +134,7 @@ static int trace_selftest_test_global_cnt; static void trace_selftest_test_global_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_global_cnt++; } @@ -143,24 +143,21 @@ static int trace_selftest_test_dyn_cnt; static void trace_selftest_test_dyn_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_dyn_cnt++; } static struct ftrace_ops test_probe1 = { .func = trace_selftest_test_probe1_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_probe2 = { .func = trace_selftest_test_probe2_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_probe3 = { .func = trace_selftest_test_probe3_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static void print_counts(void) @@ -417,7 +414,7 @@ static int trace_selftest_recursion_cnt; static void trace_selftest_test_recursion_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { /* * This function is registered without the recursion safe flag. @@ -432,7 +429,7 @@ static void trace_selftest_test_recursion_func(unsigned long ip, static void trace_selftest_test_recursion_safe_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { /* * We said we would provide our own recursion. By calling @@ -448,11 +445,11 @@ static void trace_selftest_test_recursion_safe_func(unsigned long ip, static struct ftrace_ops test_rec_probe = { .func = trace_selftest_test_recursion_func, + .flags = FTRACE_OPS_FL_RECURSION, }; static struct ftrace_ops test_recsafe_probe = { .func = trace_selftest_test_recursion_safe_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static int @@ -551,9 +548,11 @@ static enum { static void trace_selftest_test_regs_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { - if (pt_regs) + struct pt_regs *regs = ftrace_get_regs(fregs); + + if (regs) trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND; else trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND; @@ -561,7 +560,7 @@ static void trace_selftest_test_regs_func(unsigned long ip, static struct ftrace_ops test_regs_probe = { .func = trace_selftest_test_regs_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS, + .flags = FTRACE_OPS_FL_SAVE_REGS, }; static int @@ -787,7 +786,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, /* Have we just recovered from a hang? 
*/ if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) { - tracing_selftest_disabled = true; + disable_tracing_selftest("recovering from a hang"); ret = -1; goto out; } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c408423e5d65..63c285042051 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -290,7 +290,7 @@ static void check_stack(unsigned long ip, unsigned long *stack) static void stack_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { unsigned long stack; @@ -318,7 +318,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops trace_ops __read_mostly = { .func = stack_trace_call, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static ssize_t diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 4b50fc0cb12c..d6bddb157ef2 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -609,7 +609,7 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) * signal that state. There are two user-visible tracing_map * variables, 'hits' and 'drops', which are updated by this function. * Every time an element is either successfully inserted or retrieved, - * the 'hits' value is incrememented. Every time an element insertion + * the 'hits' value is incremented. Every time an element insertion * fails, the 'drops' value is incremented. * * This is a lock-free tracing map insertion function implementing a @@ -642,9 +642,9 @@ struct tracing_map_elt *tracing_map_insert(struct tracing_map *map, void *key) * tracing_map_elt. This is a lock-free lookup; see * tracing_map_insert() for details on tracing_map and how it works. * Every time an element is retrieved, the 'hits' value is - * incrememented. There is one user-visible tracing_map variable, + * incremented. There is one user-visible tracing_map variable, * 'hits', which is updated by this function. Every time an element - * is successfully retrieved, the 'hits' value is incrememented. The + * is successfully retrieved, the 'hits' value is incremented. The * 'drops' value is never updated by this function. * * Return: the tracing_map_elt pointer val associated with the key. diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h index a6de61fc22de..2c765ee2a4d4 100644 --- a/kernel/trace/tracing_map.h +++ b/kernel/trace/tracing_map.h @@ -50,7 +50,7 @@ typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b); * an instance of tracing_map_elt, where 'elt' in the latter part of * that variable name is short for 'element'. The purpose of a * tracing_map_elt is to hold values specific to the particular - * 32-bit hashed key it's assocated with. Things such as the unique + * 32-bit hashed key it's associated with. Things such as the unique * set of aggregated sums associated with the 32-bit hashed key, along * with a copy of the full key associated with the entry, and which * was used to produce the 32-bit hashed key. |
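
Editor's note on the callback conversion in the hunks above: every ftrace_ops callback now takes a struct ftrace_regs pointer instead of pt_regs, a full pt_regs is only reachable through ftrace_get_regs() when the ops asked for FTRACE_OPS_FL_SAVE_REGS, and recursion protection has become opt-in via FTRACE_OPS_FL_RECURSION (see the test_rec_probe and trace_selftest_test_regs_func changes). A minimal, hypothetical out-of-tree module written against the new prototype could look like the sketch below; the module name, the regs_seen counter, and the choice of "schedule" as a filter are illustrative only and not part of this series.

#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/string.h>

static unsigned long regs_seen;

static void my_callback(unsigned long ip, unsigned long parent_ip,
			struct ftrace_ops *op, struct ftrace_regs *fregs)
{
	/* NULL unless FTRACE_OPS_FL_SAVE_REGS was requested (or the
	 * architecture always saves a full pt_regs). */
	struct pt_regs *regs = ftrace_get_regs(fregs);

	if (regs)
		regs_seen++;
}

static struct ftrace_ops my_ops = {
	.func	= my_callback,
	/* Recursion protection is now opt-in through FTRACE_OPS_FL_RECURSION;
	 * leaving it clear selects the new, cheaper default where the
	 * callback is expected to cope with recursion itself. */
	.flags	= FTRACE_OPS_FL_SAVE_REGS,
};

static int __init regs_demo_init(void)
{
	static char filter[] = "schedule";
	int ret;

	ret = ftrace_set_filter(&my_ops, filter, strlen(filter), 0);
	if (ret)
		return ret;
	return register_ftrace_function(&my_ops);
}

static void __exit regs_demo_exit(void)
{
	unregister_ftrace_function(&my_ops);
	pr_info("saw full regs %lu times\n", regs_seen);
}

module_init(regs_demo_init);
module_exit(regs_demo_exit);
MODULE_LICENSE("GPL");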
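
Editor's note on ftrace_record_recursion() above: its long comments describe a small lock-free protocol in which a writer first claims a free slot in recursed_functions[] with cmpxchg() and only then raises nr_records, so readers never see a count that covers an unclaimed slot; a writer that loses the race tries the next slot, and a concurrent clear (nr_records set to -1) at worst leaves one zeroed entry behind. The following is a rough user-space model of that protocol, using C11 atomics rather than the kernel's cmpxchg()/atomic_t primitives; record_once(), NR_SLOTS, and the values in main() are made up for illustration.

#include <stdatomic.h>
#include <stdio.h>

#define NR_SLOTS 128

static _Atomic unsigned long slots[NR_SLOTS];
static atomic_int nr_records;		/* -1 while a clear is in progress */

static void record_once(unsigned long ip)
{
	int index = 0;
	int count;

	for (;;) {
		count = atomic_load(&nr_records);
		if (count < 0)			/* records are being cleared */
			return;
		if (index < count)
			index = count;
		if (index >= NR_SLOTS)
			return;

		/* Already recorded? */
		for (int i = index - 1; i >= 0; i--)
			if (atomic_load(&slots[i]) == ip)
				return;

		/* Claim the slot before publishing the new count. */
		unsigned long expected = 0;
		if (atomic_compare_exchange_strong(&slots[index], &expected, ip))
			break;
		if (expected == ip)		/* someone else stored it */
			return;
		index++;			/* slot taken, try the next one */
	}

	/* Publish, unless a clear raced with us, in which case give the slot back. */
	count = atomic_load(&nr_records);
	if (count < 0) {
		unsigned long mine = ip;
		atomic_compare_exchange_strong(&slots[index], &mine, 0);
	} else if (count <= index) {
		atomic_compare_exchange_strong(&nr_records, &count, index + 1);
	}
}

int main(void)
{
	record_once(0xc0ffee);
	record_once(0xc0ffee);		/* duplicate, not recorded twice */
	record_once(0xbadcafe);
	printf("%d record(s)\n", atomic_load(&nr_records));
	return 0;
}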