From e20044f7e9ae2b5395ca3ae9bd0907fdf43357a7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 16 Mar 2021 12:41:01 -0400 Subject: ring-buffer: Separate out internal use of ring_buffer_event_time_stamp() The exported use of ring_buffer_event_time_stamp() is going to become different than how it is used internally. Move the internal logic out into a static function called rb_event_time_stamp(), and have the internal callers call that instead. Link: https://lkml.kernel.org/r/20210316164113.257790481@goodmis.org Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 68744c51517e..941ac2021b97 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -287,6 +287,17 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) +static u64 rb_event_time_stamp(struct ring_buffer_event *event) +{ + u64 ts; + + ts = event->array[0]; + ts <<= TS_SHIFT; + ts += event->time_delta; + + return ts; +} + /** * ring_buffer_event_time_stamp - return the event's extended timestamp * @event: the event to get the timestamp of @@ -299,13 +310,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); */ u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event) { - u64 ts; - - ts = event->array[0]; - ts <<= TS_SHIFT; - ts += event->time_delta; - - return ts; + return rb_event_time_stamp(event); } /* Flag when events were overwritten */ @@ -2766,7 +2771,7 @@ static u64 rb_time_delta(struct ring_buffer_event *event) return 0; case RINGBUF_TYPE_TIME_EXTEND: - return ring_buffer_event_time_stamp(event); + return rb_event_time_stamp(event); case RINGBUF_TYPE_TIME_STAMP: return 0; @@ -3212,13 +3217,13 @@ static void dump_buffer_page(struct buffer_data_page *bpage, switch (event->type_len) { case RINGBUF_TYPE_TIME_EXTEND: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); ts += delta; pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta); break; case RINGBUF_TYPE_TIME_STAMP: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); ts = delta; pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta); break; @@ -3289,12 +3294,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, switch (event->type_len) { case RINGBUF_TYPE_TIME_EXTEND: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); ts += delta; break; case RINGBUF_TYPE_TIME_STAMP: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); ts = delta; break; @@ -4256,12 +4261,12 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, return; case RINGBUF_TYPE_TIME_EXTEND: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); cpu_buffer->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); cpu_buffer->read_stamp = delta; return; @@ -4286,12 +4291,12 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, return; case RINGBUF_TYPE_TIME_EXTEND: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); iter->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); iter->read_stamp = delta; return; @@ -4544,7 +4549,7 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, case RINGBUF_TYPE_TIME_STAMP: if (ts) { - *ts = ring_buffer_event_time_stamp(event); + *ts = rb_event_time_stamp(event); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } @@ -4635,7 +4640,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) case RINGBUF_TYPE_TIME_STAMP: if (ts) { - *ts = ring_buffer_event_time_stamp(event); + *ts = rb_event_time_stamp(event); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } -- cgit v1.2.3 From 8672e4948d0c44272cc05f8ff563dbf6b6c1289f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 16 Mar 2021 12:41:02 -0400 Subject: ring-buffer: Add a event_stamp to cpu_buffer for each level of nesting Add a place to save the current event time stamp for each level of nesting. This will be used to retrieve the time stamp of the current event before it is committed. Link: https://lkml.kernel.org/r/20210316164113.399089673@goodmis.org Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 941ac2021b97..470d97169081 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -492,6 +492,8 @@ struct rb_time_struct { #endif typedef struct rb_time_struct rb_time_t; +#define MAX_NEST 5 + /* * head_page == tail_page && head == tail then buffer is empty. */ @@ -529,6 +531,7 @@ struct ring_buffer_per_cpu { unsigned long read_bytes; rb_time_t write_stamp; rb_time_t before_stamp; + u64 event_stamp[MAX_NEST]; u64 read_stamp; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; @@ -2715,6 +2718,10 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, { unsigned length = info->length; u64 delta = info->delta; + unsigned int nest = local_read(&cpu_buffer->committing) - 1; + + if (nest < MAX_NEST) + cpu_buffer->event_stamp[nest] = info->ts; /* * If we need to add a timestamp, then we @@ -3456,7 +3463,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, info->after, ts)) { /* Nothing came after this event between C and E */ info->delta = ts - info->after; - info->ts = ts; } else { /* * Interrupted between C and E: @@ -3468,6 +3474,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, */ info->delta = 0; } + info->ts = ts; info->add_timestamp &= ~RB_ADD_STAMP_FORCE; } @@ -5026,6 +5033,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) rb_time_set(&cpu_buffer->write_stamp, 0); rb_time_set(&cpu_buffer->before_stamp, 0); + memset(cpu_buffer->event_stamp, 0, sizeof(cpu_buffer->event_stamp)); + cpu_buffer->lost_events = 0; cpu_buffer->last_overrun = 0; -- cgit v1.2.3 From b47e330231acbf4506b049643145cc64268a1940 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 16 Mar 2021 12:41:03 -0400 Subject: tracing: Pass buffer of event to trigger operations The ring_buffer_event_time_stamp() is going to be updated to extract the time stamp for the event without needing it to be set to have absolute values for all events. But to do so, it needs the buffer that the event is on as the buffer saves information for the event before it is committed to the buffer. If the trace buffer is disabled, a temporary buffer is used, and there's no access to this buffer from the current histogram triggers, even though it is passed to the trace event code. Pass the buffer that the event is on all the way down to the histogram triggers. Link: https://lkml.kernel.org/r/20210316164113.542448131@goodmis.org Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 5 +- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 4 +- kernel/trace/trace_events_hist.c | 92 ++++++++++++++++++++++++------------- kernel/trace/trace_events_trigger.c | 45 +++++++++++------- 5 files changed, 95 insertions(+), 53 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 28e7af1406f2..8cba64ce23a4 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -640,7 +640,8 @@ enum event_trigger_type { extern int filter_match_preds(struct event_filter *filter, void *rec); extern enum event_trigger_type -event_triggers_call(struct trace_event_file *file, void *rec, +event_triggers_call(struct trace_event_file *file, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event); extern void event_triggers_post_call(struct trace_event_file *file, @@ -664,7 +665,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file) if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) { if (eflags & EVENT_FILE_FL_TRIGGER_MODE) - event_triggers_call(file, NULL, NULL); + event_triggers_call(file, NULL, NULL, NULL); if (eflags & EVENT_FILE_FL_SOFT_DISABLED) return true; if (eflags & EVENT_FILE_FL_PID_FILTER) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index eccb4e1187cc..f979220238a5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6763,7 +6763,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) { /* do not add \n before testing triggers, but add \0 */ entry->buf[cnt] = '\0'; - tt = event_triggers_call(tr->trace_marker_file, entry, event); + tt = event_triggers_call(tr->trace_marker_file, buffer, entry, event); } if (entry->buf[cnt - 1] != '\n') { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a6446c03cfbc..798773178d7e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1329,7 +1329,7 @@ __event_trigger_test_discard(struct trace_event_file *file, unsigned long eflags = file->flags; if (eflags & EVENT_FILE_FL_TRIGGER_COND) - *tt = event_triggers_call(file, entry, event); + *tt = event_triggers_call(file, buffer, entry, event); if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && @@ -1626,7 +1626,7 @@ extern int register_trigger_hist_enable_disable_cmds(void); */ struct event_trigger_ops { void (*func)(struct event_trigger_data *data, - void *rec, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe); int (*init)(struct event_trigger_ops *ops, struct event_trigger_data *data); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 39ebe1826fc3..6978aa3ee4c5 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -81,6 +81,7 @@ struct hist_field; typedef u64 (*hist_field_fn_t) (struct hist_field *field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event); @@ -153,6 +154,7 @@ struct hist_field { static u64 hist_field_none(struct hist_field *field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -161,6 +163,7 @@ static u64 hist_field_none(struct hist_field *field, static u64 hist_field_counter(struct hist_field *field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -169,6 +172,7 @@ static u64 hist_field_counter(struct hist_field *field, static u64 hist_field_string(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -179,6 +183,7 @@ static u64 hist_field_string(struct hist_field *hist_field, static u64 hist_field_dynstring(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -191,6 +196,7 @@ static u64 hist_field_dynstring(struct hist_field *hist_field, static u64 hist_field_pstring(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -201,52 +207,56 @@ static u64 hist_field_pstring(struct hist_field *hist_field, static u64 hist_field_log2(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { struct hist_field *operand = hist_field->operands[0]; - u64 val = operand->fn(operand, elt, rbe, event); + u64 val = operand->fn(operand, elt, buffer, rbe, event); return (u64) ilog2(roundup_pow_of_two(val)); } static u64 hist_field_plus(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, rbe, event); - u64 val2 = operand2->fn(operand2, elt, rbe, event); + u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); return val1 + val2; } static u64 hist_field_minus(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, rbe, event); - u64 val2 = operand2->fn(operand2, elt, rbe, event); + u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); return val1 - val2; } static u64 hist_field_unary_minus(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { struct hist_field *operand = hist_field->operands[0]; - s64 sval = (s64)operand->fn(operand, elt, rbe, event); + s64 sval = (s64)operand->fn(operand, elt, buffer, rbe, event); u64 val = (u64)-sval; return val; @@ -255,6 +265,7 @@ static u64 hist_field_unary_minus(struct hist_field *hist_field, #define DEFINE_HIST_FIELD_FN(type) \ static u64 hist_field_##type(struct hist_field *hist_field, \ struct tracing_map_elt *elt, \ + struct trace_buffer *buffer, \ struct ring_buffer_event *rbe, \ void *event) \ { \ @@ -380,7 +391,8 @@ struct hist_trigger_data { struct action_data; typedef void (*action_fn_t) (struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals); @@ -608,7 +620,8 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals, } static void action_trace(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) { @@ -624,6 +637,7 @@ struct hist_var_data { static u64 hist_field_timestamp(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -640,6 +654,7 @@ static u64 hist_field_timestamp(struct hist_field *hist_field, static u64 hist_field_cpu(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -1020,6 +1035,7 @@ static struct hist_field *find_event_var(struct hist_trigger_data *hist_data, static u64 hist_field_var_ref(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -2561,6 +2577,7 @@ find_target_event_var(struct hist_trigger_data *hist_data, } static inline void __update_field_vars(struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *rec, struct field_var **field_vars, @@ -2576,7 +2593,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, struct hist_field *var = field_var->var; struct hist_field *val = field_var->val; - var_val = val->fn(val, elt, rbe, rec); + var_val = val->fn(val, elt, buffer, rbe, rec); var_idx = var->var.idx; if (val->flags & HIST_FIELD_FL_STRING) { @@ -2592,19 +2609,21 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, static void update_field_vars(struct hist_trigger_data *hist_data, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *rec) { - __update_field_vars(elt, rbe, rec, hist_data->field_vars, + __update_field_vars(elt, buffer, rbe, rec, hist_data->field_vars, hist_data->n_field_vars, 0); } static void save_track_data_vars(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) { - __update_field_vars(elt, rbe, rec, hist_data->save_vars, + __update_field_vars(elt, buffer, rbe, rec, hist_data->save_vars, hist_data->n_save_vars, hist_data->n_field_var_str); } @@ -2780,12 +2799,14 @@ static void save_track_val(struct hist_trigger_data *hist_data, } static void save_track_data(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) { if (data->track_data.save_data) - data->track_data.save_data(hist_data, elt, rec, rbe, key, data, var_ref_vals); + data->track_data.save_data(hist_data, elt, buffer, rec, rbe, + key, data, var_ref_vals); } static bool check_track_val(struct tracing_map_elt *elt, @@ -2836,7 +2857,8 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) } static void save_track_data_snapshot(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) @@ -2905,7 +2927,8 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) return false; } static void save_track_data_snapshot(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) {} @@ -2947,7 +2970,8 @@ static void track_data_print(struct seq_file *m, } static void ontrack_action(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) { @@ -2955,7 +2979,8 @@ static void ontrack_action(struct hist_trigger_data *hist_data, if (check_track_val(elt, data, var_val)) { save_track_val(hist_data, elt, data, var_val); - save_track_data(hist_data, elt, rec, rbe, key, data, var_ref_vals); + save_track_data(hist_data, elt, buffer, rec, rbe, + key, data, var_ref_vals); } } @@ -4400,7 +4425,8 @@ create_hist_data(unsigned int map_bits, } static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, u64 *var_ref_vals) { @@ -4414,7 +4440,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, for_each_hist_val_field(i, hist_data) { hist_field = hist_data->fields[i]; - hist_val = hist_field->fn(hist_field, elt, rbe, rec); + hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec); if (hist_field->flags & HIST_FIELD_FL_VAR) { var_idx = hist_field->var.idx; @@ -4442,13 +4468,13 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, for_each_hist_key_field(i, hist_data) { hist_field = hist_data->fields[i]; if (hist_field->flags & HIST_FIELD_FL_VAR) { - hist_val = hist_field->fn(hist_field, elt, rbe, rec); + hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec); var_idx = hist_field->var.idx; tracing_map_set_var(elt, var_idx, hist_val); } } - update_field_vars(hist_data, elt, rbe, rec); + update_field_vars(hist_data, elt, buffer, rbe, rec); } static inline void add_to_key(char *compound_key, void *key, @@ -4478,7 +4504,8 @@ static inline void add_to_key(char *compound_key, void *key, static void hist_trigger_actions(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, u64 *var_ref_vals) { @@ -4487,11 +4514,12 @@ hist_trigger_actions(struct hist_trigger_data *hist_data, for (i = 0; i < hist_data->n_actions; i++) { data = hist_data->actions[i]; - data->fn(hist_data, elt, rec, rbe, key, data, var_ref_vals); + data->fn(hist_data, elt, buffer, rec, rbe, key, data, var_ref_vals); } } -static void event_hist_trigger(struct event_trigger_data *data, void *rec, +static void event_hist_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe) { struct hist_trigger_data *hist_data = data->private_data; @@ -4516,7 +4544,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, HIST_STACKTRACE_SKIP); key = entries; } else { - field_contents = key_field->fn(key_field, elt, rbe, rec); + field_contents = key_field->fn(key_field, elt, buffer, rbe, rec); if (key_field->flags & HIST_FIELD_FL_STRING) { key = (void *)(unsigned long)field_contents; use_compound_key = true; @@ -4539,10 +4567,10 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, if (!elt) return; - hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals); + hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, var_ref_vals); if (resolve_var_refs(hist_data, key, var_ref_vals, true)) - hist_trigger_actions(hist_data, elt, rec, rbe, key, var_ref_vals); + hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals); } static void hist_trigger_stacktrace_print(struct seq_file *m, @@ -5812,7 +5840,8 @@ __init int register_trigger_hist_cmd(void) } static void -hist_enable_trigger(struct event_trigger_data *data, void *rec, +hist_enable_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; @@ -5830,7 +5859,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec, } static void -hist_enable_count_trigger(struct event_trigger_data *data, void *rec, +hist_enable_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (!data->count) @@ -5839,7 +5869,7 @@ hist_enable_count_trigger(struct event_trigger_data *data, void *rec, if (data->count != -1) (data->count)--; - hist_enable_trigger(data, rec, event); + hist_enable_trigger(data, buffer, rec, event); } static struct event_trigger_ops hist_enable_trigger_ops = { diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index f725802160c0..b8bfa8505b7b 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -53,7 +53,8 @@ void trigger_data_free(struct event_trigger_data *data) * any trigger that should be deferred, ETT_NONE if nothing to defer. */ enum event_trigger_type -event_triggers_call(struct trace_event_file *file, void *rec, +event_triggers_call(struct trace_event_file *file, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { struct event_trigger_data *data; @@ -67,7 +68,7 @@ event_triggers_call(struct trace_event_file *file, void *rec, if (data->paused) continue; if (!rec) { - data->ops->func(data, rec, event); + data->ops->func(data, buffer, rec, event); continue; } filter = rcu_dereference_sched(data->filter); @@ -77,7 +78,7 @@ event_triggers_call(struct trace_event_file *file, void *rec, tt |= data->cmd_ops->trigger_type; continue; } - data->ops->func(data, rec, event); + data->ops->func(data, buffer, rec, event); } return tt; } @@ -105,7 +106,7 @@ event_triggers_post_call(struct trace_event_file *file, if (data->paused) continue; if (data->cmd_ops->trigger_type & tt) - data->ops->func(data, NULL, NULL); + data->ops->func(data, NULL, NULL, NULL); } } EXPORT_SYMBOL_GPL(event_triggers_post_call); @@ -937,7 +938,8 @@ get_named_trigger_data(struct event_trigger_data *data) } static void -traceon_trigger(struct event_trigger_data *data, void *rec, +traceon_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (tracing_is_on()) @@ -947,7 +949,8 @@ traceon_trigger(struct event_trigger_data *data, void *rec, } static void -traceon_count_trigger(struct event_trigger_data *data, void *rec, +traceon_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (tracing_is_on()) @@ -963,7 +966,8 @@ traceon_count_trigger(struct event_trigger_data *data, void *rec, } static void -traceoff_trigger(struct event_trigger_data *data, void *rec, +traceoff_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (!tracing_is_on()) @@ -973,7 +977,8 @@ traceoff_trigger(struct event_trigger_data *data, void *rec, } static void -traceoff_count_trigger(struct event_trigger_data *data, void *rec, +traceoff_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (!tracing_is_on()) @@ -1071,7 +1076,8 @@ static struct event_command trigger_traceoff_cmd = { #ifdef CONFIG_TRACER_SNAPSHOT static void -snapshot_trigger(struct event_trigger_data *data, void *rec, +snapshot_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { struct trace_event_file *file = data->private_data; @@ -1083,7 +1089,8 @@ snapshot_trigger(struct event_trigger_data *data, void *rec, } static void -snapshot_count_trigger(struct event_trigger_data *data, void *rec, +snapshot_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (!data->count) @@ -1092,7 +1099,7 @@ snapshot_count_trigger(struct event_trigger_data *data, void *rec, if (data->count != -1) (data->count)--; - snapshot_trigger(data, rec, event); + snapshot_trigger(data, buffer, rec, event); } static int @@ -1176,14 +1183,16 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; } #endif static void -stacktrace_trigger(struct event_trigger_data *data, void *rec, +stacktrace_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { trace_dump_stack(STACK_SKIP); } static void -stacktrace_count_trigger(struct event_trigger_data *data, void *rec, +stacktrace_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (!data->count) @@ -1192,7 +1201,7 @@ stacktrace_count_trigger(struct event_trigger_data *data, void *rec, if (data->count != -1) (data->count)--; - stacktrace_trigger(data, rec, event); + stacktrace_trigger(data, buffer, rec, event); } static int @@ -1254,7 +1263,8 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void) } static void -event_enable_trigger(struct event_trigger_data *data, void *rec, +event_enable_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; @@ -1266,7 +1276,8 @@ event_enable_trigger(struct event_trigger_data *data, void *rec, } static void -event_enable_count_trigger(struct event_trigger_data *data, void *rec, +event_enable_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; @@ -1281,7 +1292,7 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec, if (data->count != -1) (data->count)--; - event_enable_trigger(data, rec, event); + event_enable_trigger(data, buffer, rec, event); } int event_enable_trigger_print(struct seq_file *m, -- cgit v1.2.3 From efe6196a6bc5bbc84b856316c4687fd24566a95c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 16 Mar 2021 12:41:04 -0400 Subject: ring-buffer: Allow ring_buffer_event_time_stamp() to return time stamp of all events Currently, ring_buffer_event_time_stamp() only returns an accurate time stamp of the event if it has an absolute extended time stamp attached to it. To make it more robust, use the event_stamp() in case the event does not have an absolute value attached to it. This will allow ring_buffer_event_time_stamp() to be used in more cases than just histograms, and it will also allow histograms to not require including absolute values all the time. Link: https://lkml.kernel.org/r/20210316164113.704830885@goodmis.org Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- include/linux/ring_buffer.h | 3 +- kernel/trace/ring_buffer.c | 60 ++++++++++++++++++++++++++++++---------- kernel/trace/trace_events_hist.c | 2 +- 3 files changed, 48 insertions(+), 17 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 136ea0997e6d..057b7ed4fe24 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -61,7 +61,8 @@ enum ring_buffer_type { unsigned ring_buffer_event_length(struct ring_buffer_event *event); void *ring_buffer_event_data(struct ring_buffer_event *event); -u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event); +u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, + struct ring_buffer_event *event); /* * ring_buffer_discard_commit will remove an event that has not diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 470d97169081..8fa2a84f714f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -298,21 +298,6 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event) return ts; } -/** - * ring_buffer_event_time_stamp - return the event's extended timestamp - * @event: the event to get the timestamp of - * - * Returns the extended timestamp associated with a data event. - * An extended time_stamp is a 64-bit timestamp represented - * internally in a special way that makes the best use of space - * contained within a ring buffer event. This function decodes - * it and maps it to a straight u64 value. - */ -u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event) -{ - return rb_event_time_stamp(event); -} - /* Flag when events were overwritten */ #define RB_MISSED_EVENTS (1 << 31) /* Missed count stored at end */ @@ -757,6 +742,51 @@ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) } #endif +static inline u64 rb_time_stamp(struct trace_buffer *buffer); + +/** + * ring_buffer_event_time_stamp - return the event's current time stamp + * @buffer: The buffer that the event is on + * @event: the event to get the time stamp of + * + * Note, this must be called after @event is reserved, and before it is + * committed to the ring buffer. And must be called from the same + * context where the event was reserved (normal, softirq, irq, etc). + * + * Returns the time stamp associated with the current event. + * If the event has an extended time stamp, then that is used as + * the time stamp to return. + * In the highly unlikely case that the event was nested more than + * the max nesting, then the write_stamp of the buffer is returned, + * otherwise current time is returned, but that really neither of + * the last two cases should ever happen. + */ +u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[smp_processor_id()]; + unsigned int nest; + u64 ts; + + /* If the event includes an absolute time, then just use that */ + if (event->type_len == RINGBUF_TYPE_TIME_STAMP) + return rb_event_time_stamp(event); + + /* Read the current saved nesting level time stamp */ + nest = local_read(&cpu_buffer->committing) - 1; + if (likely(nest < MAX_NEST)) + return cpu_buffer->event_stamp[nest]; + + WARN_ON_ONCE(1); + + /* Can only fail on 32 bit */ + if (!rb_time_read(&cpu_buffer->write_stamp, &ts)) + /* Screw it, just read the current time */ + ts = rb_time_stamp(cpu_buffer->buffer); + + return ts; +} + /** * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer * @buffer: The ring_buffer to get the number of pages from diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 6978aa3ee4c5..45986cb4637e 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -644,7 +644,7 @@ static u64 hist_field_timestamp(struct hist_field *hist_field, struct hist_trigger_data *hist_data = hist_field->hist_data; struct trace_array *tr = hist_data->event_file->tr; - u64 ts = ring_buffer_event_time_stamp(rbe); + u64 ts = ring_buffer_event_time_stamp(buffer, rbe); if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr)) ts = ns2usecs(ts); -- cgit v1.2.3 From b94bc80df64823e676b506f8de7dcf6a688d681e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 16 Mar 2021 12:41:05 -0400 Subject: tracing: Use a no_filter_buffering_ref to stop using the filter buffer Currently, the trace histograms relies on it using absolute time stamps to trigger the tracing to not use the temp buffer if filters are set. That's because the histograms need the full timestamp that is saved in the ring buffer. That is no longer the case, as the ring_buffer_event_time_stamp() can now return the time stamp for all events without all triggering a full absolute time stamp. Now that the absolute time stamp is an unrelated dependency to not using the filters. There's nothing about having absolute timestamps to keep from using the filter buffer. Instead, change the interface to explicitly state to disable filter buffering that the histogram logic can use. Link: https://lkml.kernel.org/r/20210316164113.847886563@goodmis.org Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 28 ++++++++++++---------------- kernel/trace/trace.h | 4 ++-- kernel/trace/trace_events_hist.c | 6 +++--- 3 files changed, 17 insertions(+), 21 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f979220238a5..b15436ff85e8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2737,12 +2737,13 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, unsigned int trace_ctx) { struct ring_buffer_event *entry; + struct trace_array *tr = trace_file->tr; int val; - *current_rb = trace_file->tr->array_buffer.buffer; + *current_rb = tr->array_buffer.buffer; - if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags & - (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && + if (!tr->no_filter_buffering_ref && + (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && (entry = this_cpu_read(trace_buffered_event))) { /* Try to use the per cpu buffer first */ val = this_cpu_inc_return(trace_buffered_event_cnt); @@ -6971,31 +6972,26 @@ static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file) return ret; } -int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs) +/* + * Set or disable using the per CPU trace_buffer_event when possible. + */ +int tracing_set_filter_buffering(struct trace_array *tr, bool set) { int ret = 0; mutex_lock(&trace_types_lock); - if (abs && tr->time_stamp_abs_ref++) + if (set && tr->no_filter_buffering_ref++) goto out; - if (!abs) { - if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) { + if (!set) { + if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) { ret = -EINVAL; goto out; } - if (--tr->time_stamp_abs_ref) - goto out; + --tr->no_filter_buffering_ref; } - - ring_buffer_set_time_stamp_abs(tr->array_buffer.buffer, abs); - -#ifdef CONFIG_TRACER_MAX_TRACE - if (tr->max_buffer.buffer) - ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs); -#endif out: mutex_unlock(&trace_types_lock); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 798773178d7e..f2a7a72825c7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -352,7 +352,7 @@ struct trace_array { /* function tracing enabled */ int function_enabled; #endif - int time_stamp_abs_ref; + int no_filter_buffering_ref; struct list_head hist_vars; #ifdef CONFIG_TRACER_SNAPSHOT struct cond_snapshot *cond_snapshot; @@ -372,7 +372,7 @@ extern int tracing_check_open_get_tr(struct trace_array *tr); extern struct trace_array *trace_array_find(const char *instance); extern struct trace_array *trace_array_find_get(const char *instance); -extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs); +extern int tracing_set_filter_buffering(struct trace_array *tr, bool set); extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); extern bool trace_clock_in_ns(struct trace_array *tr); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 45986cb4637e..c1abd63f1d6c 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5484,7 +5484,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, goto out; } - tracing_set_time_stamp_abs(file->tr, true); + tracing_set_filter_buffering(file->tr, true); } if (named_data) @@ -5592,7 +5592,7 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, if (hist_data->enable_timestamps) { if (!hist_data->remove || unregistered) - tracing_set_time_stamp_abs(file->tr, false); + tracing_set_filter_buffering(file->tr, false); } } @@ -5639,7 +5639,7 @@ static void hist_unreg_all(struct trace_event_file *file) update_cond_flag(file); if (hist_data->enable_timestamps) - tracing_set_time_stamp_abs(file->tr, false); + tracing_set_filter_buffering(file->tr, false); if (test->ops->free) test->ops->free(test->ops, test); } -- cgit v1.2.3 From a948c69d6fb1ba749a958a8a87d4eecdda28989d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 16 Mar 2021 12:41:06 -0400 Subject: ring-buffer: Add verifier for using ring_buffer_event_time_stamp() The ring_buffer_event_time_stamp() must be only called by an event that has not been committed yet, and is on the buffer that is passed in. This was used to help debug converting the histogram logic over to using the new time stamp code, and was proven to be very useful. Add a verifier that can check that this is the case, and extra WARN_ONs to catch unexpected use cases. Link: https://lkml.kernel.org/r/20210316164113.987294354@goodmis.org Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 56 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8fa2a84f714f..1c61a8cd7b99 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -742,6 +742,48 @@ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) } #endif +/* + * Enable this to make sure that the event passed to + * ring_buffer_event_time_stamp() is not committed and also + * is on the buffer that it passed in. + */ +//#define RB_VERIFY_EVENT +#ifdef RB_VERIFY_EVENT +static struct list_head *rb_list_head(struct list_head *list); +static void verify_event(struct ring_buffer_per_cpu *cpu_buffer, + void *event) +{ + struct buffer_page *page = cpu_buffer->commit_page; + struct buffer_page *tail_page = READ_ONCE(cpu_buffer->tail_page); + struct list_head *next; + long commit, write; + unsigned long addr = (unsigned long)event; + bool done = false; + int stop = 0; + + /* Make sure the event exists and is not committed yet */ + do { + if (page == tail_page || WARN_ON_ONCE(stop++ > 100)) + done = true; + commit = local_read(&page->page->commit); + write = local_read(&page->write); + if (addr >= (unsigned long)&page->page->data[commit] && + addr < (unsigned long)&page->page->data[write]) + return; + + next = rb_list_head(page->list.next); + page = list_entry(next, struct buffer_page, list); + } while (!done); + WARN_ON_ONCE(1); +} +#else +static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer, + void *event) +{ +} +#endif + + static inline u64 rb_time_stamp(struct trace_buffer *buffer); /** @@ -772,13 +814,19 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, if (event->type_len == RINGBUF_TYPE_TIME_STAMP) return rb_event_time_stamp(event); + nest = local_read(&cpu_buffer->committing); + verify_event(cpu_buffer, event); + if (WARN_ON_ONCE(!nest)) + goto fail; + /* Read the current saved nesting level time stamp */ - nest = local_read(&cpu_buffer->committing) - 1; - if (likely(nest < MAX_NEST)) + if (likely(--nest < MAX_NEST)) return cpu_buffer->event_stamp[nest]; - WARN_ON_ONCE(1); + /* Shouldn't happen, warn if it does */ + WARN_ONCE(1, "nest (%d) greater than max", nest); + fail: /* Can only fail on 32 bit */ if (!rb_time_read(&cpu_buffer->write_stamp, &ts)) /* Screw it, just read the current time */ @@ -2750,7 +2798,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, u64 delta = info->delta; unsigned int nest = local_read(&cpu_buffer->committing) - 1; - if (nest < MAX_NEST) + if (!WARN_ON_ONCE(nest >= MAX_NEST)) cpu_buffer->event_stamp[nest] = info->ts; /* -- cgit v1.2.3 From d8279bfc5e9598682f657606d3830ab65932cfe4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 16 Mar 2021 12:41:07 -0400 Subject: tracing: Add tracing_event_time_stamp() API Add a tracing_event_time_stamp() API that checks if the event passed in is not on the ring buffer but a pointer to the per CPU trace_buffered_event which does not have its time stamp set yet. If it is a pointer to the trace_buffered_event, then just return the current time stamp that the ring buffer would produce. Otherwise, return the time stamp from the event. Link: https://lkml.kernel.org/r/20210316164114.131996180@goodmis.org Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 8 ++++++++ kernel/trace/trace.h | 1 + 2 files changed, 9 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b15436ff85e8..90ae3140756e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6972,6 +6972,14 @@ static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file) return ret; } +u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe) +{ + if (rbe == this_cpu_read(trace_buffered_event)) + return ring_buffer_time_stamp(buffer, smp_processor_id()); + + return ring_buffer_event_time_stamp(buffer, rbe); +} + /* * Set or disable using the per CPU trace_buffer_event when possible. */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f2a7a72825c7..0d8f54f49a3a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -372,6 +372,7 @@ extern int tracing_check_open_get_tr(struct trace_array *tr); extern struct trace_array *trace_array_find(const char *instance); extern struct trace_array *trace_array_find_get(const char *instance); +extern u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe); extern int tracing_set_filter_buffering(struct trace_array *tr, bool set); extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); -- cgit v1.2.3 From 5013f454a352cce8e62162976026a9c472595e42 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 25 Feb 2021 16:51:23 -0500 Subject: tracing: Add check of trace event print fmts for dereferencing pointers Trace events record data into the ring buffer at the time of the event. The trace event has a printf logic to display the recorded data at a much later time when the user reads the trace file. This makes using dereferencing pointers unsafe if the dereferenced pointer points to the original source. The safe way to handle this is to create an array within the trace event and copy the source into the array. Then the dereference pointer may point to that array. As this is a easy mistake to make, a check is added to examine all trace event print fmts to make sure that they are safe to use. This only checks the various %p* dereferenced pointers like %pB, %pR, etc. It does not handle dereferencing of strings, as there are some use cases that are OK to dereference the source. That will be dealt with differently. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 210 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 210 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a3563afd412d..f58106eaf8cb 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -217,6 +217,214 @@ int trace_event_get_offsets(struct trace_event_call *call) return tail->offset + tail->size; } +/* + * Check if the referenced field is an array and return true, + * as arrays are OK to dereference. + */ +static bool test_field(const char *fmt, struct trace_event_call *call) +{ + struct trace_event_fields *field = call->class->fields_array; + const char *array_descriptor; + const char *p = fmt; + int len; + + if (!(len = str_has_prefix(fmt, "REC->"))) + return false; + fmt += len; + for (p = fmt; *p; p++) { + if (!isalnum(*p) && *p != '_') + break; + } + len = p - fmt; + + for (; field->type; field++) { + if (strncmp(field->name, fmt, len) || + field->name[len]) + continue; + array_descriptor = strchr(field->type, '['); + /* This is an array and is OK to dereference. */ + return array_descriptor != NULL; + } + return false; +} + +/* + * Examine the print fmt of the event looking for unsafe dereference + * pointers using %p* that could be recorded in the trace event and + * much later referenced after the pointer was freed. Dereferencing + * pointers are OK, if it is dereferenced into the event itself. + */ +static void test_event_printk(struct trace_event_call *call) +{ + u64 dereference_flags = 0; + bool first = true; + const char *fmt, *c, *r, *a; + int parens = 0; + char in_quote = 0; + int start_arg = 0; + int arg = 0; + int i; + + fmt = call->print_fmt; + + if (!fmt) + return; + + for (i = 0; fmt[i]; i++) { + switch (fmt[i]) { + case '\\': + i++; + if (!fmt[i]) + return; + continue; + case '"': + case '\'': + /* + * The print fmt starts with a string that + * is processed first to find %p* usage, + * then after the first string, the print fmt + * contains arguments that are used to check + * if the dereferenced %p* usage is safe. + */ + if (first) { + if (fmt[i] == '\'') + continue; + if (in_quote) { + arg = 0; + first = false; + /* + * If there was no %p* uses + * the fmt is OK. + */ + if (!dereference_flags) + return; + } + } + if (in_quote) { + if (in_quote == fmt[i]) + in_quote = 0; + } else { + in_quote = fmt[i]; + } + continue; + case '%': + if (!first || !in_quote) + continue; + i++; + if (!fmt[i]) + return; + switch (fmt[i]) { + case '%': + continue; + case 'p': + /* Find dereferencing fields */ + switch (fmt[i + 1]) { + case 'B': case 'R': case 'r': + case 'b': case 'M': case 'm': + case 'I': case 'i': case 'E': + case 'U': case 'V': case 'N': + case 'a': case 'd': case 'D': + case 'g': case 't': case 'C': + case 'O': case 'f': + if (WARN_ONCE(arg == 63, + "Too many args for event: %s", + trace_event_name(call))) + return; + dereference_flags |= 1ULL << arg; + } + break; + default: + { + bool star = false; + int j; + + /* Increment arg if %*s exists. */ + for (j = 0; fmt[i + j]; j++) { + if (isdigit(fmt[i + j]) || + fmt[i + j] == '.') + continue; + if (fmt[i + j] == '*') { + star = true; + continue; + } + if ((fmt[i + j] == 's') && star) + arg++; + break; + } + break; + } /* default */ + + } /* switch */ + arg++; + continue; + case '(': + if (in_quote) + continue; + parens++; + continue; + case ')': + if (in_quote) + continue; + parens--; + if (WARN_ONCE(parens < 0, + "Paren mismatch for event: %s\narg='%s'\n%*s", + trace_event_name(call), + fmt + start_arg, + (i - start_arg) + 5, "^")) + return; + continue; + case ',': + if (in_quote || parens) + continue; + i++; + while (isspace(fmt[i])) + i++; + start_arg = i; + if (!(dereference_flags & (1ULL << arg))) + goto next_arg; + + /* Find the REC-> in the argument */ + c = strchr(fmt + i, ','); + r = strstr(fmt + i, "REC->"); + if (r && (!c || r < c)) { + /* + * Addresses of events on the buffer, + * or an array on the buffer is + * OK to dereference. + * There's ways to fool this, but + * this is to catch common mistakes, + * not malicious code. + */ + a = strchr(fmt + i, '&'); + if ((a && (a < r)) || test_field(r, call)) + dereference_flags &= ~(1ULL << arg); + } + next_arg: + i--; + arg++; + } + } + + /* + * If you triggered the below warning, the trace event reported + * uses an unsafe dereference pointer %p*. As the data stored + * at the trace event time may no longer exist when the trace + * event is printed, dereferencing to the original source is + * unsafe. The source of the dereference must be copied into the + * event itself, and the dereference must access the copy instead. + */ + if (WARN_ON_ONCE(dereference_flags)) { + arg = 1; + while (!(dereference_flags & 1)) { + dereference_flags >>= 1; + arg++; + } + pr_warn("event %s has unsafe dereference of argument %d\n", + trace_event_name(call), arg); + pr_warn("print_fmt: %s\n", fmt); + } +} + int trace_event_raw_init(struct trace_event_call *call) { int id; @@ -225,6 +433,8 @@ int trace_event_raw_init(struct trace_event_call *call) if (!id) return -ENODEV; + test_event_printk(call); + return 0; } EXPORT_SYMBOL_GPL(trace_event_raw_init); -- cgit v1.2.3 From 9a6944fee68e25084130386c608c5ac8db487581 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 25 Feb 2021 22:00:57 -0500 Subject: tracing: Add a verifier to check string pointers for trace events It is a common mistake for someone writing a trace event to save a pointer to a string in the TP_fast_assign() and then display that string pointer in the TP_printk() with %s. The problem is that those two events may happen a long time apart, where the source of the string may no longer exist. The proper way to handle displaying any string that is not guaranteed to be in the kernel core rodata section, is to copy it into the ring buffer via the __string(), __assign_str() and __get_str() helper macros. Add a check at run time while displaying the TP_printk() of events to make sure that every %s referenced is safe to dereference, and if it is not, trigger a warning and only show the address of the pointer, and the dereferenced string if it can be safely retrieved with a strncpy_from_kernel_nofault() call. In order to not have to copy the parsing of vsnprintf() formats, or even exporting its code, the verifier relies on vsnprintf() being able to modify the va_list that is passed to it, and it remains modified after it is called. This is the case for some architectures like x86_64, but other architectures like x86_32 pass the va_list to vsnprintf() as a value not a reference, and the verifier can not use it to parse the non string arguments. Thus, at boot up, it is checked if vsnprintf() modifies the passed in va_list or not, and a static branch will disable the verifier if it's not compatible. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 200 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 3 + kernel/trace/trace_output.c | 2 +- kernel/trace/trace_printk.c | 11 +++ 4 files changed, 215 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 90ae3140756e..e32f5a49f1cf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3558,6 +3558,204 @@ static char *trace_iter_expand_format(struct trace_iterator *iter) return tmp; } +/* Returns true if the string is safe to dereference from an event */ +static bool trace_safe_str(struct trace_iterator *iter, const char *str) +{ + unsigned long addr = (unsigned long)str; + struct trace_event *trace_event; + struct trace_event_call *event; + + /* OK if part of the event data */ + if ((addr >= (unsigned long)iter->ent) && + (addr < (unsigned long)iter->ent + iter->ent_size)) + return true; + + /* OK if part of the temp seq buffer */ + if ((addr >= (unsigned long)iter->tmp_seq.buffer) && + (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE)) + return true; + + /* Core rodata can not be freed */ + if (is_kernel_rodata(addr)) + return true; + + if (trace_is_tracepoint_string(str)) + return true; + + /* + * Now this could be a module event, referencing core module + * data, which is OK. + */ + if (!iter->ent) + return false; + + trace_event = ftrace_find_event(iter->ent->type); + if (!trace_event) + return false; + + event = container_of(trace_event, struct trace_event_call, event); + if (!event->mod) + return false; + + /* Would rather have rodata, but this will suffice */ + if (within_module_core(addr, event->mod)) + return true; + + return false; +} + +static const char *show_buffer(struct trace_seq *s) +{ + struct seq_buf *seq = &s->seq; + + seq_buf_terminate(seq); + + return seq->buffer; +} + +static DEFINE_STATIC_KEY_FALSE(trace_no_verify); + +static int test_can_verify_check(const char *fmt, ...) +{ + char buf[16]; + va_list ap; + int ret; + + /* + * The verifier is dependent on vsnprintf() modifies the va_list + * passed to it, where it is sent as a reference. Some architectures + * (like x86_32) passes it by value, which means that vsnprintf() + * does not modify the va_list passed to it, and the verifier + * would then need to be able to understand all the values that + * vsnprintf can use. If it is passed by value, then the verifier + * is disabled. + */ + va_start(ap, fmt); + vsnprintf(buf, 16, "%d", ap); + ret = va_arg(ap, int); + va_end(ap); + + return ret; +} + +static void test_can_verify(void) +{ + if (!test_can_verify_check("%d %d", 0, 1)) { + pr_info("trace event string verifier disabled\n"); + static_branch_inc(&trace_no_verify); + } +} + +/** + * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer + * @iter: The iterator that holds the seq buffer and the event being printed + * @fmt: The format used to print the event + * @ap: The va_list holding the data to print from @fmt. + * + * This writes the data into the @iter->seq buffer using the data from + * @fmt and @ap. If the format has a %s, then the source of the string + * is examined to make sure it is safe to print, otherwise it will + * warn and print "[UNSAFE MEMORY]" in place of the dereferenced string + * pointer. + */ +void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, + va_list ap) +{ + const char *p = fmt; + const char *str; + int i, j; + + if (WARN_ON_ONCE(!fmt)) + return; + + if (static_branch_unlikely(&trace_no_verify)) + goto print; + + /* Don't bother checking when doing a ftrace_dump() */ + if (iter->fmt == static_fmt_buf) + goto print; + + while (*p) { + j = 0; + + /* We only care about %s and variants */ + for (i = 0; p[i]; i++) { + if (i + 1 >= iter->fmt_size) { + /* + * If we can't expand the copy buffer, + * just print it. + */ + if (!trace_iter_expand_format(iter)) + goto print; + } + + if (p[i] == '\\' && p[i+1]) { + i++; + continue; + } + if (p[i] == '%') { + /* Need to test cases like %08.*s */ + for (j = 1; p[i+j]; j++) { + if (isdigit(p[i+j]) || + p[i+j] == '*' || + p[i+j] == '.') + continue; + break; + } + if (p[i+j] == 's') + break; + } + j = 0; + } + /* If no %s found then just print normally */ + if (!p[i]) + break; + + /* Copy up to the %s, and print that */ + strncpy(iter->fmt, p, i); + iter->fmt[i] = '\0'; + trace_seq_vprintf(&iter->seq, iter->fmt, ap); + + /* The ap now points to the string data of the %s */ + str = va_arg(ap, const char *); + + /* + * If you hit this warning, it is likely that the + * trace event in question used %s on a string that + * was saved at the time of the event, but may not be + * around when the trace is read. Use __string(), + * __assign_str() and __get_str() helpers in the TRACE_EVENT() + * instead. See samples/trace_events/trace-events-sample.h + * for reference. + */ + if (WARN_ONCE(!trace_safe_str(iter, str), + "fmt: '%s' current_buffer: '%s'", + fmt, show_buffer(&iter->seq))) { + int ret; + + /* Try to safely read the string */ + ret = strncpy_from_kernel_nofault(iter->fmt, str, + iter->fmt_size); + if (ret < 0) + trace_seq_printf(&iter->seq, "(0x%px)", str); + else + trace_seq_printf(&iter->seq, "(0x%px:%s)", + str, iter->fmt); + str = "[UNSAFE-MEMORY]"; + strcpy(iter->fmt, "%s"); + } else { + strncpy(iter->fmt, p + i, j + 1); + iter->fmt[j+1] = '\0'; + } + trace_seq_printf(&iter->seq, iter->fmt, str); + + p += i + j + 1; + } + print: + if (*p) + trace_seq_vprintf(&iter->seq, p, ap); +} + const char *trace_event_format(struct trace_iterator *iter, const char *fmt) { const char *p, *new_fmt; @@ -9675,6 +9873,8 @@ __init static int tracer_alloc_buffers(void) register_snapshot_cmd(); + test_can_verify(); + return 0; out_free_savedcmd: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0d8f54f49a3a..2952bd92bc62 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -582,7 +582,10 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, struct ring_buffer_event *event); +bool trace_is_tracepoint_string(const char *str); const char *trace_event_format(struct trace_iterator *iter, const char *fmt); +void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, + va_list ap); int trace_empty(struct trace_iterator *iter); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 61255bad7e01..a0146e1fffdf 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -317,7 +317,7 @@ void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...) va_list ap; va_start(ap, fmt); - trace_seq_vprintf(&iter->seq, trace_event_format(iter, fmt), ap); + trace_check_vprintf(iter, trace_event_format(iter, fmt), ap); va_end(ap); } EXPORT_SYMBOL(trace_event_printf); diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index ff32476df072..4b320fe7df70 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -251,6 +251,17 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) } EXPORT_SYMBOL_GPL(__ftrace_vprintk); +bool trace_is_tracepoint_string(const char *str) +{ + const char **ptr = __start___tracepoint_str; + + for (ptr = __start___tracepoint_str; ptr < __stop___tracepoint_str; ptr++) { + if (str == *ptr) + return true; + } + return false; +} + static const char **find_next(void *v, loff_t *pos) { const char **fmt = v; -- cgit v1.2.3 From f2cc020d7876de7583feb52ec939a32419cf9468 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Mar 2021 18:49:35 +0100 Subject: tracing: Fix various typos in comments Fix ~59 single-word typos in the tracing code comments, and fix the grammar in a handful of places. Link: https://lore.kernel.org/r/20210322224546.GA1981273@gmail.com Link: https://lkml.kernel.org/r/20210323174935.GA4176821@gmail.com Reviewed-by: Randy Dunlap Signed-off-by: Ingo Molnar Signed-off-by: Steven Rostedt (VMware) --- arch/microblaze/include/asm/ftrace.h | 2 +- arch/nds32/kernel/ftrace.c | 2 +- arch/powerpc/include/asm/ftrace.h | 4 ++-- arch/sh/kernel/ftrace.c | 2 +- arch/sparc/include/asm/ftrace.h | 2 +- fs/tracefs/inode.c | 2 +- include/linux/ftrace.h | 4 ++-- include/linux/trace_events.h | 2 +- include/linux/tracepoint.h | 2 +- include/trace/events/io_uring.h | 2 +- include/trace/events/rcu.h | 2 +- include/trace/events/sched.h | 2 +- include/trace/events/timer.h | 2 +- kernel/trace/bpf_trace.c | 5 +++-- kernel/trace/fgraph.c | 4 ++-- kernel/trace/ftrace.c | 8 ++++---- kernel/trace/ring_buffer.c | 2 +- kernel/trace/synth_event_gen_test.c | 2 +- kernel/trace/trace.c | 18 +++++++++--------- kernel/trace/trace.h | 4 ++-- kernel/trace/trace_event_perf.c | 2 +- kernel/trace/trace_events.c | 4 ++-- kernel/trace/trace_events_filter.c | 4 ++-- kernel/trace/trace_events_synth.c | 2 +- kernel/trace/trace_functions_graph.c | 2 +- kernel/trace/trace_hwlat.c | 4 ++-- kernel/trace/trace_kprobe.c | 2 +- kernel/trace/trace_probe.c | 6 +++--- kernel/trace/trace_probe.h | 2 +- kernel/trace/trace_probe_tmpl.h | 2 +- kernel/trace/trace_selftest.c | 4 ++-- kernel/trace/trace_seq.c | 12 ++++++------ 32 files changed, 60 insertions(+), 59 deletions(-) (limited to 'kernel/trace') diff --git a/arch/microblaze/include/asm/ftrace.h b/arch/microblaze/include/asm/ftrace.h index 5db7f4489f05..6a92bed37794 100644 --- a/arch/microblaze/include/asm/ftrace.h +++ b/arch/microblaze/include/asm/ftrace.h @@ -13,7 +13,7 @@ extern void ftrace_call_graph(void); #endif #ifdef CONFIG_DYNAMIC_FTRACE -/* reloction of mcount call site is the same as the address */ +/* relocation of mcount call site is the same as the address */ static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; diff --git a/arch/nds32/kernel/ftrace.c b/arch/nds32/kernel/ftrace.c index 414f8a780cc3..0e23e3a8df6b 100644 --- a/arch/nds32/kernel/ftrace.c +++ b/arch/nds32/kernel/ftrace.c @@ -236,7 +236,7 @@ void __naked return_to_handler(void) "bal ftrace_return_to_handler\n\t" "move $lp, $r0 \n\t" - /* restore state nedded by the ABI */ + /* restore state needed by the ABI */ "lmw.bim $r0,[$sp],$r1,#0x0 \n\t"); } diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index bc76970b6ee5..debe8c4f7062 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -12,7 +12,7 @@ #ifdef __ASSEMBLY__ -/* Based off of objdump optput from glibc */ +/* Based off of objdump output from glibc */ #define MCOUNT_SAVE_FRAME \ stwu r1,-48(r1); \ @@ -52,7 +52,7 @@ extern void _mcount(void); static inline unsigned long ftrace_call_adjust(unsigned long addr) { - /* reloction of mcount call site is the same as the address */ + /* relocation of mcount call site is the same as the address */ return addr; } diff --git a/arch/sh/kernel/ftrace.c b/arch/sh/kernel/ftrace.c index 0646c5961846..295c43315bbe 100644 --- a/arch/sh/kernel/ftrace.c +++ b/arch/sh/kernel/ftrace.c @@ -67,7 +67,7 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) * Modifying code must take extra care. On an SMP machine, if * the code being modified is also being executed on another CPU * that CPU will have undefined results and possibly take a GPF. - * We use kstop_machine to stop other CPUS from exectuing code. + * We use kstop_machine to stop other CPUS from executing code. * But this does not stop NMIs from happening. We still need * to protect against that. We separate out the modification of * the code to take care of this. diff --git a/arch/sparc/include/asm/ftrace.h b/arch/sparc/include/asm/ftrace.h index d3aa1a524431..e284394cb3aa 100644 --- a/arch/sparc/include/asm/ftrace.h +++ b/arch/sparc/include/asm/ftrace.h @@ -17,7 +17,7 @@ void _mcount(void); #endif #ifdef CONFIG_DYNAMIC_FTRACE -/* reloction of mcount call site is the same as the address */ +/* relocation of mcount call site is the same as the address */ static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 4b83cbded559..1261e8b41edb 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -477,7 +477,7 @@ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent) * * The instances directory is special as it allows for mkdir and rmdir to * to be done by userspace. When a mkdir or rmdir is performed, the inode - * locks are released and the methhods passed in (@mkdir and @rmdir) are + * locks are released and the methods passed in (@mkdir and @rmdir) are * called without locks and with the name of the directory being created * within the instances directory. * diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 86e5028bfa20..a69f363b61bf 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -33,7 +33,7 @@ /* * If the arch's mcount caller does not support all of ftrace's * features, then it must call an indirect function that - * does. Or at least does enough to prevent any unwelcomed side effects. + * does. Or at least does enough to prevent any unwelcome side effects. */ #if !ARCH_SUPPORTS_FTRACE_OPS # define FTRACE_FORCE_LIST_FUNC 1 @@ -389,7 +389,7 @@ DECLARE_PER_CPU(int, disable_stack_tracer); */ static inline void stack_tracer_disable(void) { - /* Preemption or interupts must be disabled */ + /* Preemption or interrupts must be disabled */ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); this_cpu_inc(disable_stack_tracer); diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 8cba64ce23a4..36e27c1f42e0 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -206,7 +206,7 @@ static inline unsigned int tracing_gen_ctx_dec(void) trace_ctx = tracing_gen_ctx(); /* - * Subtract one from the preeption counter if preemption is enabled, + * Subtract one from the preemption counter if preemption is enabled, * see trace_event_buffer_reserve()for details. */ if (IS_ENABLED(CONFIG_PREEMPTION)) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 9cfb099da58f..13f65420f188 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -465,7 +465,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * * * * The declared 'local variable' is called '__entry' * * - * * __field(pid_t, prev_prid) is equivalent to a standard declariton: + * * __field(pid_t, prev_prid) is equivalent to a standard declaration: * * * * pid_t prev_pid; * * diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 9f0d3b7d56b0..ba78a5602cd1 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -49,7 +49,7 @@ TRACE_EVENT(io_uring_create, ); /** - * io_uring_register - called after a buffer/file/eventfd was succesfully + * io_uring_register - called after a buffer/file/eventfd was successfully * registered for a ring * * @ctx: pointer to a ring context structure diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 5fc29400e1a2..97177c10bf64 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -48,7 +48,7 @@ TRACE_EVENT(rcu_utilization, * RCU flavor, the grace-period number, and a string identifying the * grace-period-related event as follows: * - * "AccReadyCB": CPU acclerates new callbacks to RCU_NEXT_READY_TAIL. + * "AccReadyCB": CPU accelerates new callbacks to RCU_NEXT_READY_TAIL. * "AccWaitCB": CPU accelerates new callbacks to RCU_WAIT_TAIL. * "newreq": Request a new grace period. * "start": Start a grace period. diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index cbe3e152d24c..1eca2305ca42 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -174,7 +174,7 @@ DEFINE_EVENT(sched_wakeup_template, sched_waking, TP_ARGS(p)); /* - * Tracepoint called when the task is actually woken; p->state == TASK_RUNNNG. + * Tracepoint called when the task is actually woken; p->state == TASK_RUNNING. * It is not always called from the waking context. */ DEFINE_EVENT(sched_wakeup_template, sched_wakeup, diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 19abb6c3eb73..6ad031c71be7 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -119,7 +119,7 @@ TRACE_EVENT(timer_expire_entry, * When used in combination with the timer_expire_entry tracepoint we can * determine the runtime of the timer callback function. * - * NOTE: Do NOT derefernce timer in TP_fast_assign. The pointer might + * NOTE: Do NOT dereference timer in TP_fast_assign. The pointer might * be invalid. We solely track the pointer. */ DEFINE_EVENT(timer_class, timer_expire_exit, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b0c45d923f0f..49f7ddb36149 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -658,7 +658,7 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ i++; - /* skip optional "[0 +-][num]" width formating field */ + /* skip optional "[0 +-][num]" width formatting field */ while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || fmt[i] == ' ') i++; @@ -748,7 +748,8 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, fmt_cnt++; } - /* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give + /* + * The maximum we can have is MAX_SEQ_PRINTF_VARARGS parameters, so just give * all of them to seq_printf(). */ seq_printf(m, fmt, params[0], params[1], params[2], params[3], diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 29a6ebeebc9e..b8a0d1d564fb 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -42,7 +42,7 @@ bool ftrace_graph_is_dead(void) } /** - * ftrace_graph_stop - set to permanently disable function graph tracincg + * ftrace_graph_stop - set to permanently disable function graph tracing * * In case of an error int function graph tracing, this is called * to try to keep function graph tracing from causing any more harm. @@ -117,7 +117,7 @@ int function_graph_enter(unsigned long ret, unsigned long func, /* * Skip graph tracing if the return location is served by direct trampoline, - * since call sequence and return addresses is unpredicatable anymore. + * since call sequence and return addresses are unpredictable anyway. * Ex: BPF trampoline may call original function and may skip frame * depending on type of BPF programs attached. */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4d8e35575549..eb7d677cb466 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1045,7 +1045,7 @@ struct ftrace_ops global_ops = { }; /* - * Used by the stack undwinder to know about dynamic ftrace trampolines. + * Used by the stack unwinder to know about dynamic ftrace trampolines. */ struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr) { @@ -3000,7 +3000,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) * When the kernel is preemptive, tasks can be preempted * while on a ftrace trampoline. Just scheduling a task on * a CPU is not good enough to flush them. Calling - * synchornize_rcu_tasks() will wait for those tasks to + * synchronize_rcu_tasks() will wait for those tasks to * execute and either schedule voluntarily or enter user space. */ if (IS_ENABLED(CONFIG_PREEMPTION)) @@ -5373,7 +5373,7 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct); * @reset - non zero to reset all filters before applying this filter. * * Filters denote which functions should be enabled when tracing is enabled - * If @ip is NULL, it failes to update filter. + * If @ip is NULL, it fails to update filter. */ int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, int remove, int reset) @@ -6325,7 +6325,7 @@ clear_mod_from_hash(struct ftrace_page *pg, struct ftrace_hash *hash) } } -/* Clear any records from hashs */ +/* Clear any records from hashes */ static void clear_mod_from_hashes(struct ftrace_page *pg) { struct trace_array *tr; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1c61a8cd7b99..f4216df58e31 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3154,7 +3154,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * is called before preempt_count() is updated, since the check will * be on the NORMAL bit, the TRANSITION bit will then be set. If an * NMI then comes in, it will set the NMI bit, but when the NMI code - * does the trace_recursive_unlock() it will clear the TRANSTION bit + * does the trace_recursive_unlock() it will clear the TRANSITION bit * and leave the NMI bit set. But this is fine, because the interrupt * code that set the TRANSITION bit will then clear the NMI bit when it * calls trace_recursive_unlock(). If another NMI comes in, it will diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index a4b4bbf8c3bf..0b15e975d2c2 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Test module for in-kernel sythetic event creation and generation. + * Test module for in-kernel synthetic event creation and generation. * * Copyright (C) 2019 Tom Zanussi */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e32f5a49f1cf..c8e54b674d3e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -514,7 +514,7 @@ void trace_free_pid_list(struct trace_pid_list *pid_list) * @filtered_pids: The list of pids to check * @search_pid: The PID to find in @filtered_pids * - * Returns true if @search_pid is fonud in @filtered_pids, and false otherwis. + * Returns true if @search_pid is found in @filtered_pids, and false otherwise. */ bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) @@ -545,7 +545,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) { /* - * If filterd_no_pids is not empty, and the task's pid is listed + * If filtered_no_pids is not empty, and the task's pid is listed * in filtered_no_pids, then return true. * Otherwise, if filtered_pids is empty, that means we can * trace all tasks. If it has content, then only trace pids @@ -612,7 +612,7 @@ void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) (*pos)++; - /* pid already is +1 of the actual prevous bit */ + /* pid already is +1 of the actual previous bit */ pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); /* Return pid + 1 to allow zero to be represented */ @@ -834,7 +834,7 @@ DEFINE_MUTEX(trace_types_lock); * The content of events may become garbage if we allow other process consumes * these events concurrently: * A) the page of the consumed events may become a normal page - * (not reader page) in ring buffer, and this page will be rewrited + * (not reader page) in ring buffer, and this page will be rewritten * by events producer. * B) The page of the consumed events may become a page for splice_read, * and this page will be returned to system. @@ -1520,7 +1520,7 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) #undef C #define C(a, b) b -/* These must match the bit postions in trace_iterator_flags */ +/* These must match the bit positions in trace_iterator_flags */ static const char *trace_options[] = { TRACE_FLAGS NULL @@ -3368,7 +3368,7 @@ int trace_array_vprintk(struct trace_array *tr, * buffer (use trace_printk() for that), as writing into the top level * buffer should only have events that can be individually disabled. * trace_printk() is only used for debugging a kernel, and should not - * be ever encorporated in normal use. + * be ever incorporated in normal use. * * trace_array_printk() can be used, as it will not add noise to the * top level tracing buffer. @@ -7533,11 +7533,11 @@ static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) * @cmd: The tracing command that caused the error * @str: The string to position the caret at within @cmd * - * Finds the position of the first occurence of @str within @cmd. The + * Finds the position of the first occurrence of @str within @cmd. The * return value can be passed to tracing_log_err() for caret placement * within @cmd. * - * Returns the index within @cmd of the first occurence of @str or 0 + * Returns the index within @cmd of the first occurrence of @str or 0 * if @str was not found. */ unsigned int err_pos(char *cmd, const char *str) @@ -9320,7 +9320,7 @@ int tracing_init_dentry(void) * As there may still be users that expect the tracing * files to exist in debugfs/tracing, we must automount * the tracefs file system there, so older tools still - * work with the newer kerenl. + * work with the newer kernel. */ tr->dir = debugfs_create_automount("tracing", NULL, trace_automount, NULL); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2952bd92bc62..5506424eae2a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1347,7 +1347,7 @@ __event_trigger_test_discard(struct trace_event_file *file, /** * event_trigger_unlock_commit - handle triggers and finish event commit - * @file: The file pointer assoctiated to the event + * @file: The file pointer associated with the event * @buffer: The ring buffer that the event is being written to * @event: The event meta data in the ring buffer * @entry: The event itself @@ -1374,7 +1374,7 @@ event_trigger_unlock_commit(struct trace_event_file *file, /** * event_trigger_unlock_commit_regs - handle triggers and finish event commit - * @file: The file pointer assoctiated to the event + * @file: The file pointer associated with the event * @buffer: The ring buffer that the event is being written to * @event: The event meta data in the ring buffer * @entry: The event itself diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 288ad2c274fb..03be4435d103 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -16,7 +16,7 @@ static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; /* * Force it to be aligned to unsigned long to avoid misaligned accesses - * suprises + * surprises */ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) perf_trace_t; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f58106eaf8cb..80e96989770e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2646,7 +2646,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) } /* - * Since calls are grouped by systems, the likelyhood that the + * Since calls are grouped by systems, the likelihood that the * next call in the iteration belongs to the same system as the * previous call is high. As an optimization, we skip searching * for a map[] that matches the call's system if the last call @@ -2706,7 +2706,7 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) } /* - * Just create a decriptor for early init. A descriptor is required + * Just create a descriptor for early init. A descriptor is required * for enabling events at boot. We want to enable events before * the filesystem is initialized. */ diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e91259f6a722..9730acf3c03e 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -296,7 +296,7 @@ enum { * and "FALSE" the program entry after that, we are now done with the first * pass. * - * Making the above "a || b && c" have a progam of: + * Making the above "a || b && c" have a program of: * prog[0] = { "a", 1, 2 } * prog[1] = { "b", 0, 2 } * prog[2] = { "c", 0, 3 } @@ -390,7 +390,7 @@ enum { * F: return FALSE * * As "r = a; if (!r) goto n5;" is obviously the same as - * "if (!a) goto n5;" without doing anything we can interperate the + * "if (!a) goto n5;" without doing anything we can interpret the * program as: * n1: if (!a) goto n5; * n2: if (!b) goto n5; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 8d71e6c83f10..2ac75eb6aa86 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1385,7 +1385,7 @@ static int destroy_synth_event(struct synth_event *se) /** * synth_event_delete - Delete a synthetic event - * @event_name: The name of the new sythetic event + * @event_name: The name of the new synthetic event * * Delete a synthetic event that was created with synth_event_create(). * diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 0aa6e6faa943..0de6837722da 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -764,7 +764,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, * - we are inside irq code * - we just entered irq code * - * retunns 0 if + * returns 0 if * - funcgraph-interrupts option is set * - we are not inside irq code */ diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 34dc1a712dcb..632ef88131a9 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -83,7 +83,7 @@ struct hwlat_sample { u64 nmi_total_ts; /* Total time spent in NMIs */ struct timespec64 timestamp; /* wall time */ int nmi_count; /* # NMIs during this sample */ - int count; /* # of iteratons over threash */ + int count; /* # of iterations over thresh */ }; /* keep the global state somewhere. */ @@ -389,7 +389,7 @@ static int start_kthread(struct trace_array *tr) } /** - * stop_kthread - Inform the hardware latency samping/detector kthread to stop + * stop_kthread - Inform the hardware latency sampling/detector kthread to stop * * This kicks the running hardware latency sampling/detector kernel thread and * tells it to stop sampling now. Use this on unload and at system shutdown. diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6fe770d86dc3..ea6178cb5e33 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1748,7 +1748,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE)) kretprobe_perf_func(tk, ri, regs); #endif - return 0; /* We don't tweek kernel, so just return 0 */ + return 0; /* We don't tweak kernel, so just return 0 */ } NOKPROBE_SYMBOL(kretprobe_dispatcher); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index ec589a4612df..15413ad7cef2 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -168,7 +168,7 @@ void __trace_probe_log_err(int offset, int err_type) if (!trace_probe_log.argv) return; - /* Recalcurate the length and allocate buffer */ + /* Recalculate the length and allocate buffer */ for (i = 0; i < trace_probe_log.argc; i++) { if (i == trace_probe_log.index) pos = len; @@ -182,7 +182,7 @@ void __trace_probe_log_err(int offset, int err_type) /** * Set the error position is next to the last arg + space. * Note that len includes the terminal null and the cursor - * appaers at pos + 1. + * appears at pos + 1. */ pos = len; offset = 0; @@ -592,7 +592,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, } /* - * Since $comm and immediate string can not be dereferred, + * Since $comm and immediate string can not be dereferenced, * we can find those by strcmp. */ if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) { diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 7ce4027089ee..227d518e5ba5 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -134,7 +134,7 @@ struct fetch_type { size_t size; /* Byte size of type */ int is_signed; /* Signed flag */ print_type_func_t print; /* Print functions */ - const char *fmt; /* Fromat string */ + const char *fmt; /* Format string */ const char *fmttype; /* Name in format file */ }; diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index e5282828f4a6..f003c5d02a3a 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -167,7 +167,7 @@ array: return code->op == FETCH_OP_END ? ret : -EILSEQ; } -/* Sum up total data length for dynamic arraies (strings) */ +/* Sum up total data length for dynamic arrays (strings) */ static nokprobe_inline int __get_data_size(struct trace_probe *tp, struct pt_regs *regs) { diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 73ef12092250..adf7ef194005 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -878,7 +878,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) int ret; /* - * Now that the big kernel lock is no longer preemptable, + * Now that the big kernel lock is no longer preemptible, * and this is called with the BKL held, it will always * fail. If preemption is already disabled, simply * pass the test. When the BKL is removed, or becomes @@ -940,7 +940,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * int ret; /* - * Now that the big kernel lock is no longer preemptable, + * Now that the big kernel lock is no longer preemptible, * and this is called with the BKL held, it will always * fail. If preemption is already disabled, simply * pass the test. When the BKL is removed, or becomes diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 1d84fcc78e3e..9c90b3a7dce2 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -16,7 +16,7 @@ * The buffer size is currently PAGE_SIZE, although it may become dynamic * in the future. * - * A write to the buffer will either succed or fail. That is, unlike + * A write to the buffer will either succeed or fail. That is, unlike * sprintf() there will not be a partial write (well it may write into * the buffer but it wont update the pointers). This allows users to * try to write something into the trace_seq buffer and if it fails @@ -73,7 +73,7 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s) * @fmt: printf format string * * The tracer may use either sequence operations or its own - * copy to user routines. To simplify formating of a trace + * copy to user routines. To simplify formatting of a trace * trace_seq_printf() is used to store strings into a special * buffer (@s). Then the output may be either used by * the sequencer or pulled into another buffer. @@ -133,7 +133,7 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask); * @fmt: printf format string * * The tracer may use either sequence operations or its own - * copy to user routines. To simplify formating of a trace + * copy to user routines. To simplify formatting of a trace * trace_seq_printf is used to store strings into a special * buffer (@s). Then the output may be either used by * the sequencer or pulled into another buffer. @@ -226,7 +226,7 @@ EXPORT_SYMBOL_GPL(trace_seq_puts); * @c: simple character to record * * The tracer may use either the sequence operations or its own - * copy to user routines. This function records a simple charater + * copy to user routines. This function records a simple character * into a special buffer (@s) for later retrieval by a sequencer * or other mechanism. */ @@ -348,7 +348,7 @@ int trace_seq_path(struct trace_seq *s, const struct path *path) EXPORT_SYMBOL_GPL(trace_seq_path); /** - * trace_seq_to_user - copy the squence buffer to user space + * trace_seq_to_user - copy the sequence buffer to user space * @s: trace sequence descriptor * @ubuf: The userspace memory location to copy to * @cnt: The amount to copy @@ -363,7 +363,7 @@ EXPORT_SYMBOL_GPL(trace_seq_path); * * On failure it returns -EBUSY if all of the content in the * sequence has been already read, which includes nothing in the - * sequenc (@s->len == @s->readpos). + * sequence (@s->len == @s->readpos). * * Returns -EFAULT if the copy to userspace fails. */ -- cgit v1.2.3 From 4613bdcc122e9e60e0763c5851337470d25d7e40 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Wed, 17 Mar 2021 15:24:01 +0530 Subject: kernel: trace: Mundane typo fixes in the file trace_events_filter.c s/callin/calling/ Link: https://lkml.kernel.org/r/20210317095401.1854544-1-unixbhaskar@gmail.com Acked-by: Randy Dunlap Signed-off-by: Bhaskar Chowdhury [ Other fixes already done by Ingo Molnar ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 9730acf3c03e..49de3e21e9bc 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -256,7 +256,7 @@ enum { * is "&&" we don't call update_preds(). Instead continue to "c". As the * next token after "c" is not "&&" but the end of input, we first process the * "&&" by calling update_preds() for the "&&" then we process the "||" by - * callin updates_preds() with the values for processing "||". + * calling updates_preds() with the values for processing "||". * * What does that mean? What update_preds() does is to first save the "target" * of the program entry indexed by the current program entry's "target" -- cgit v1.2.3 From 30c3d39f7f78f3b232f6a6f6357a545cbe23cc16 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Thu, 25 Mar 2021 16:37:52 +0000 Subject: tracing: A minor cleanup for create_system_filter() The first two parameters should be reduced to one, as @tr is simply @dir->tr. Link: https://lkml.kernel.org/r/20210324205642.65e03248@oasis.local.home Link: https://lkml.kernel.org/r/20210325163752.128407-1-hqjagain@gmail.com Suggested-by: Steven Rostedt (VMware) Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 49de3e21e9bc..c2dd697cc9c0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1750,7 +1750,6 @@ int create_event_filter(struct trace_array *tr, * and always remembers @filter_str. */ static int create_system_filter(struct trace_subsystem_dir *dir, - struct trace_array *tr, char *filter_str, struct event_filter **filterp) { struct filter_parse_error *pe = NULL; @@ -1758,13 +1757,13 @@ static int create_system_filter(struct trace_subsystem_dir *dir, err = create_filter_start(filter_str, true, &pe, filterp); if (!err) { - err = process_system_preds(dir, tr, pe, filter_str); + err = process_system_preds(dir, dir->tr, pe, filter_str); if (!err) { /* System filters just show a default message */ kfree((*filterp)->filter_string); (*filterp)->filter_string = NULL; } else { - append_filter_err(tr, pe, *filterp); + append_filter_err(dir->tr, pe, *filterp); } } create_filter_finish(pe); @@ -1852,7 +1851,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, goto out_unlock; } - err = create_system_filter(dir, tr, filter_string, &filter); + err = create_system_filter(dir, filter_string, &filter); if (filter) { /* * No event actually uses the system filter -- cgit v1.2.3 From 70193038a6ec9bbf10990a126432b0cbf56aa339 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Thu, 25 Mar 2021 16:19:10 +0000 Subject: tracing: Update create_system_filter() kernel-doc comment commit f306cc82a93d ("tracing: Update event filters for multibuffer") added the parameter @tr for create_system_filter(). commit bb9ef1cb7d86 ("tracing: Change apply_subsystem_event_filter() paths to check file->system == dir") changed the parameter from @system to @dir. Link: https://lkml.kernel.org/r/20210325161911.123452-1-hqjagain@gmail.com Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c2dd697cc9c0..c9124038b140 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1693,6 +1693,7 @@ static void create_filter_finish(struct filter_parse_error *pe) /** * create_filter - create a filter for a trace_event_call + * @tr: the trace array associated with these events * @call: trace_event_call to create a filter for * @filter_str: filter string * @set_str: remember @filter_str and enable detailed error in filter @@ -1741,8 +1742,8 @@ int create_event_filter(struct trace_array *tr, } /** - * create_system_filter - create a filter for an event_subsystem - * @system: event_subsystem to create a filter for + * create_system_filter - create a filter for an event subsystem + * @dir: the descriptor for the subsystem directory * @filter_str: filter string * @filterp: out param for created filter (always updated on return) * -- cgit v1.2.3 From f3ef7202ef7c705d640d1aeec3b286a641ac9186 Mon Sep 17 00:00:00 2001 From: "Yordan Karadzhov (VMware)" Date: Mon, 29 Mar 2021 16:03:31 +0300 Subject: tracing: Remove unused argument from "ring_buffer_time_stamp() The "cpu" parameter is not being used by the function. Link: https://lkml.kernel.org/r/20210329130331.199402-1-y.karadz@gmail.com Signed-off-by: Yordan Karadzhov (VMware) Signed-off-by: Steven Rostedt (VMware) --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 2 +- kernel/trace/trace.c | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 057b7ed4fe24..dac53fd3afea 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -181,7 +181,7 @@ unsigned long ring_buffer_commit_overrun_cpu(struct trace_buffer *buffer, int cp unsigned long ring_buffer_dropped_events_cpu(struct trace_buffer *buffer, int cpu); unsigned long ring_buffer_read_events_cpu(struct trace_buffer *buffer, int cpu); -u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu); +u64 ring_buffer_time_stamp(struct trace_buffer *buffer); void ring_buffer_normalize_time_stamp(struct trace_buffer *buffer, int cpu, u64 *ts); void ring_buffer_set_clock(struct trace_buffer *buffer, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f4216df58e31..2c0ee6484990 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1080,7 +1080,7 @@ static inline u64 rb_time_stamp(struct trace_buffer *buffer) return ts << DEBUG_SHIFT; } -u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu) +u64 ring_buffer_time_stamp(struct trace_buffer *buffer) { u64 time; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3834de91fb97..507a30bf26e4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -771,7 +771,7 @@ static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu) if (!buf->buffer) return trace_clock_local(); - ts = ring_buffer_time_stamp(buf->buffer, cpu); + ts = ring_buffer_time_stamp(buf->buffer); ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts); return ts; @@ -7174,7 +7174,7 @@ static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file) u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe) { if (rbe == this_cpu_read(trace_buffered_event)) - return ring_buffer_time_stamp(buffer, smp_processor_id()); + return ring_buffer_time_stamp(buffer); return ring_buffer_event_time_stamp(buffer, rbe); } @@ -8088,7 +8088,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); - t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu)); + t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer)); usec_rem = do_div(t, USEC_PER_SEC); trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); } else { @@ -8097,7 +8097,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); trace_seq_printf(s, "now ts: %llu\n", - ring_buffer_time_stamp(trace_buf->buffer, cpu)); + ring_buffer_time_stamp(trace_buf->buffer)); } cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu); -- cgit v1.2.3 From db42523b4f3e83ff86b53cdda219a9767c8b047f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 1 Apr 2021 16:14:17 -0400 Subject: ftrace: Store the order of pages allocated in ftrace_page Instead of saving the size of the records field of the ftrace_page, store the order it uses to allocate the pages, as that is what is needed to know in order to free the pages. This simplifies the code. Link: https://lore.kernel.org/lkml/CAHk-=whyMxheOqXAORt9a7JK9gc9eHTgCJ55Pgs4p=X3RrQubQ@mail.gmail.com/ Signed-off-by: Linus Torvalds [ change log written by Steven Rostedt ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 22ecaaa13baa..f7bef5a9ada9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1090,7 +1090,7 @@ struct ftrace_page { struct ftrace_page *next; struct dyn_ftrace *records; int index; - int size; + int order; }; #define ENTRY_SIZE sizeof(struct dyn_ftrace) @@ -3181,7 +3181,7 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count) ftrace_number_of_groups++; cnt = (PAGE_SIZE << order) / ENTRY_SIZE; - pg->size = cnt; + pg->order = order; if (cnt > count) cnt = count; @@ -3194,7 +3194,6 @@ ftrace_allocate_pages(unsigned long num_to_init) { struct ftrace_page *start_pg; struct ftrace_page *pg; - int order; int cnt; if (!num_to_init) @@ -3230,13 +3229,13 @@ ftrace_allocate_pages(unsigned long num_to_init) free_pages: pg = start_pg; while (pg) { - order = get_count_order(pg->size / ENTRIES_PER_PAGE); - if (order >= 0) - free_pages((unsigned long)pg->records, order); + if (pg->records) { + free_pages((unsigned long)pg->records, pg->order); + ftrace_number_of_pages -= 1 << pg->order; + } start_pg = pg->next; kfree(pg); pg = start_pg; - ftrace_number_of_pages -= 1 << order; ftrace_number_of_groups--; } pr_info("ftrace: FAILED to allocate memory for functions\n"); @@ -6188,6 +6187,7 @@ static int ftrace_process_locs(struct module *mod, p = start; pg = start_pg; while (p < end) { + unsigned long end_offset; addr = ftrace_call_adjust(*p++); /* * Some architecture linkers will pad between @@ -6198,7 +6198,8 @@ static int ftrace_process_locs(struct module *mod, if (!addr) continue; - if (pg->index == pg->size) { + end_offset = (pg->index+1) * sizeof(pg->records[0]); + if (end_offset > PAGE_SIZE << pg->order) { /* We should have allocated enough */ if (WARN_ON(!pg->next)) break; @@ -6367,7 +6368,6 @@ void ftrace_release_mod(struct module *mod) struct ftrace_page **last_pg; struct ftrace_page *tmp_page = NULL; struct ftrace_page *pg; - int order; mutex_lock(&ftrace_lock); @@ -6418,12 +6418,12 @@ void ftrace_release_mod(struct module *mod) /* Needs to be called outside of ftrace_lock */ clear_mod_from_hashes(pg); - order = get_count_order(pg->size / ENTRIES_PER_PAGE); - if (order >= 0) - free_pages((unsigned long)pg->records, order); + if (pg->records) { + free_pages((unsigned long)pg->records, pg->order); + ftrace_number_of_pages -= 1 << pg->order; + } tmp_page = pg->next; kfree(pg); - ftrace_number_of_pages -= 1 << order; ftrace_number_of_groups--; } } @@ -6741,7 +6741,6 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) struct ftrace_mod_map *mod_map = NULL; struct ftrace_init_func *func, *func_next; struct list_head clear_hash; - int order; INIT_LIST_HEAD(&clear_hash); @@ -6779,10 +6778,10 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) ftrace_update_tot_cnt--; if (!pg->index) { *last_pg = pg->next; - order = get_count_order(pg->size / ENTRIES_PER_PAGE); - if (order >= 0) - free_pages((unsigned long)pg->records, order); - ftrace_number_of_pages -= 1 << order; + if (pg->records) { + free_pages((unsigned long)pg->records, pg->order); + ftrace_number_of_pages -= 1 << pg->order; + } ftrace_number_of_groups--; kfree(pg); pg = container_of(last_pg, struct ftrace_page, next); -- cgit v1.2.3 From ceaaa12904df07d07ea8975abbf04c4d60e46956 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 1 Apr 2021 16:40:32 -0400 Subject: ftrace: Simplify the calculation of page number for ftrace_page->records some more Commit b40c6eabfcd40 ("ftrace: Simplify the calculation of page number for ftrace_page->records") simplified the calculation of the number of pages needed for each page group without having any empty pages, but it can be simplified even further. Link: https://lore.kernel.org/lkml/CAHk-=wjt9b7kxQ2J=aDNKbR1QBMB3Hiqb_hYcZbKsxGRSEb+gQ@mail.gmail.com/ Suggested-by: Linus Torvalds Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f7bef5a9ada9..057e962ca5ce 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3156,15 +3156,9 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count) if (WARN_ON(!count)) return -EINVAL; + /* We want to fill as much as possible, with no empty pages */ pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE); - order = get_count_order(pages); - - /* - * We want to fill as much as possible. No more than a page - * may be empty. - */ - if (!is_power_of_2(pages)) - order--; + order = fls(pages) - 1; again: pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); -- cgit v1.2.3 From eaa7a897206ac5bfa7da3f647686209ada1984d0 Mon Sep 17 00:00:00 2001 From: "Yordan Karadzhov (VMware)" Date: Thu, 15 Apr 2021 21:18:49 +0300 Subject: tracing: Define static void trace_print_time() The part of the code that prints the time of the trace record in "int trace_print_context()" gets extracted in a static function. This is done as a preparation for a following patch, in which we will define a new ftrace event called "func_repeats". The new static method, defined here, will be used by this new event to print the time of the last repeat of a function that is consecutively called number of times. Link: https://lkml.kernel.org/r/20210415181854.147448-2-y.karadz@gmail.com Signed-off-by: Yordan Karadzhov (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index a0146e1fffdf..333233d45596 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -587,13 +587,26 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) return !trace_seq_has_overflowed(s); } +static void trace_print_time(struct trace_seq *s, struct trace_iterator *iter, + unsigned long long ts) +{ + unsigned long secs, usec_rem; + unsigned long long t; + + if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { + t = ns2usecs(ts); + usec_rem = do_div(t, USEC_PER_SEC); + secs = (unsigned long)t; + trace_seq_printf(s, " %5lu.%06lu", secs, usec_rem); + } else + trace_seq_printf(s, " %12llu", ts); +} + int trace_print_context(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; - unsigned long long t; - unsigned long secs, usec_rem; char comm[TASK_COMM_LEN]; trace_find_cmdline(entry->pid, comm); @@ -614,13 +627,8 @@ int trace_print_context(struct trace_iterator *iter) if (tr->trace_flags & TRACE_ITER_IRQ_INFO) trace_print_lat_fmt(s, entry); - if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { - t = ns2usecs(iter->ts); - usec_rem = do_div(t, USEC_PER_SEC); - secs = (unsigned long)t; - trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); - } else - trace_seq_printf(s, " %12llu: ", iter->ts); + trace_print_time(s, iter, iter->ts); + trace_seq_puts(s, ": "); return !trace_seq_has_overflowed(s); } -- cgit v1.2.3 From f689e4f280b69cd7341743c2ecacd1b13528a0d8 Mon Sep 17 00:00:00 2001 From: "Yordan Karadzhov (VMware)" Date: Thu, 15 Apr 2021 21:18:50 +0300 Subject: tracing: Define new ftrace event "func_repeats" The event aims to consolidate the function tracing record in the cases when a single function is called number of times consecutively. while (cond) do_func(); This may happen in various scenarios (busy waiting for example). The new ftrace event can be used to show repeated function events with a single event and save space on the ring buffer Link: https://lkml.kernel.org/r/20210415181854.147448-3-y.karadz@gmail.com Signed-off-by: Yordan Karadzhov (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 3 +++ kernel/trace/trace_entries.h | 22 ++++++++++++++++++++ kernel/trace/trace_output.c | 48 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5506424eae2a..6a5b4c2a0fa7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -45,6 +45,7 @@ enum trace_type { TRACE_BPUTS, TRACE_HWLAT, TRACE_RAW_DATA, + TRACE_FUNC_REPEATS, __TRACE_LAST_TYPE, }; @@ -442,6 +443,8 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ + IF_ASSIGN(var, ent, struct func_repeats_entry, \ + TRACE_FUNC_REPEATS); \ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 4547ac59da61..251c819cf0c5 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -338,3 +338,25 @@ FTRACE_ENTRY(hwlat, hwlat_entry, __entry->nmi_total_ts, __entry->nmi_count) ); + +#define FUNC_REPEATS_GET_DELTA_TS(entry) \ + (((u64)(entry)->top_delta_ts << 32) | (entry)->bottom_delta_ts) \ + +FTRACE_ENTRY(func_repeats, func_repeats_entry, + + TRACE_FUNC_REPEATS, + + F_STRUCT( + __field( unsigned long, ip ) + __field( unsigned long, parent_ip ) + __field( u16 , count ) + __field( u16 , top_delta_ts ) + __field( u32 , bottom_delta_ts ) + ), + + F_printk(" %ps <-%ps\t(repeats:%u delta: -%llu)", + (void *)__entry->ip, + (void *)__entry->parent_ip, + __entry->count, + FUNC_REPEATS_GET_DELTA_TS(__entry)) +); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 333233d45596..3037f0c88f90 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1381,6 +1381,53 @@ static struct trace_event trace_raw_data_event = { .funcs = &trace_raw_data_funcs, }; +static enum print_line_t +trace_func_repeats_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct func_repeats_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(s, "%lu %lu %u %llu\n", + field->ip, + field->parent_ip, + field->count, + FUNC_REPEATS_GET_DELTA_TS(field)); + + return trace_handle_return(s); +} + +static enum print_line_t +trace_func_repeats_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct func_repeats_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + seq_print_ip_sym(s, field->ip, flags); + trace_seq_puts(s, " <-"); + seq_print_ip_sym(s, field->parent_ip, flags); + trace_seq_printf(s, " (repeats: %u, last_ts:", field->count); + trace_print_time(s, iter, + iter->ts - FUNC_REPEATS_GET_DELTA_TS(field)); + trace_seq_puts(s, ")\n"); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_func_repeats_funcs = { + .trace = trace_func_repeats_print, + .raw = trace_func_repeats_raw, +}; + +static struct trace_event trace_func_repeats_event = { + .type = TRACE_FUNC_REPEATS, + .funcs = &trace_func_repeats_funcs, +}; static struct trace_event *events[] __initdata = { &trace_fn_event, @@ -1393,6 +1440,7 @@ static struct trace_event *events[] __initdata = { &trace_print_event, &trace_hwlat_event, &trace_raw_data_event, + &trace_func_repeats_event, NULL }; -- cgit v1.2.3 From 20344c54d1c7ab7428e312bbe9b0097750875002 Mon Sep 17 00:00:00 2001 From: "Yordan Karadzhov (VMware)" Date: Thu, 15 Apr 2021 21:18:51 +0300 Subject: tracing: Add "last_func_repeats" to struct trace_array The field is used to keep track of the consecutive (on the same CPU) calls of a single function. This information is needed in order to consolidate the function tracing record in the cases when a single function is called number of times. Link: https://lkml.kernel.org/r/20210415181854.147448-4-y.karadz@gmail.com Signed-off-by: Yordan Karadzhov (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 + kernel/trace/trace.h | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 507a30bf26e4..82833be07c1e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9104,6 +9104,7 @@ static int __remove_instance(struct trace_array *tr) ftrace_clear_pids(tr); ftrace_destroy_function_files(tr); tracefs_remove(tr->dir); + free_percpu(tr->last_func_repeats); free_trace_buffers(tr); for (i = 0; i < tr->nr_topts; i++) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6a5b4c2a0fa7..a4f1b66049fd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -262,6 +262,17 @@ struct cond_snapshot { cond_update_fn_t update; }; +/* + * struct trace_func_repeats - used to keep track of the consecutive + * (on the same CPU) calls of a single function. + */ +struct trace_func_repeats { + unsigned long ip; + unsigned long parent_ip; + unsigned long count; + u64 ts_last_call; +}; + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. @@ -358,6 +369,7 @@ struct trace_array { #ifdef CONFIG_TRACER_SNAPSHOT struct cond_snapshot *cond_snapshot; #endif + struct trace_func_repeats __percpu *last_func_repeats; }; enum { -- cgit v1.2.3 From c658797f1a70561205a224be0c8be64977ed64e8 Mon Sep 17 00:00:00 2001 From: "Yordan Karadzhov (VMware)" Date: Thu, 15 Apr 2021 21:18:52 +0300 Subject: tracing: Add method for recording "func_repeats" events This patch only provides the implementation of the method. Later we will used it in a combination with a new option for function tracing. Link: https://lkml.kernel.org/r/20210415181854.147448-5-y.karadz@gmail.com Signed-off-by: Yordan Karadzhov (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 34 ++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 4 ++++ 2 files changed, 38 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 82833be07c1e..66a4ad93b5e9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3117,6 +3117,40 @@ static void ftrace_trace_userstack(struct trace_array *tr, #endif /* CONFIG_STACKTRACE */ +static inline void +func_repeats_set_delta_ts(struct func_repeats_entry *entry, + unsigned long long delta) +{ + entry->bottom_delta_ts = delta & U32_MAX; + entry->top_delta_ts = (delta >> 32); +} + +void trace_last_func_repeats(struct trace_array *tr, + struct trace_func_repeats *last_info, + unsigned int trace_ctx) +{ + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct func_repeats_entry *entry; + struct ring_buffer_event *event; + u64 delta; + + event = __trace_buffer_lock_reserve(buffer, TRACE_FUNC_REPEATS, + sizeof(*entry), trace_ctx); + if (!event) + return; + + delta = ring_buffer_event_time_stamp(buffer, event) - + last_info->ts_last_call; + + entry = ring_buffer_event_data(event); + entry->ip = last_info->ip; + entry->parent_ip = last_info->parent_ip; + entry->count = last_info->count; + func_repeats_set_delta_ts(entry, delta); + + __buffer_unlock_commit(buffer, event); +} + /* created for use with alloc_percpu */ struct trace_buffer_struct { int nesting; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a4f1b66049fd..cd80d046c7a5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -695,6 +695,10 @@ static inline void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, } #endif /* CONFIG_STACKTRACE */ +void trace_last_func_repeats(struct trace_array *tr, + struct trace_func_repeats *last_info, + unsigned int trace_ctx); + extern u64 ftrace_now(int cpu); extern void trace_find_cmdline(int pid, char comm[]); -- cgit v1.2.3 From 4994891ebbb89b18903637dc1c8f27b42cb8b8b2 Mon Sep 17 00:00:00 2001 From: "Yordan Karadzhov (VMware)" Date: Thu, 15 Apr 2021 21:18:53 +0300 Subject: tracing: Unify the logic for function tracing options Currently the logic for dealing with the options for function tracing has two different implementations. One is used when we set the flags (in "static int func_set_flag()") and another used when we initialize the tracer (in "static int function_trace_init()"). Those two implementations are meant to do essentially the same thing and they are both not very convenient for adding new options. In this patch we add a helper function that provides a single implementation of the logic for dealing with the options and we make it such that new options can be easily added. Link: https://lkml.kernel.org/r/20210415181854.147448-6-y.karadz@gmail.com Signed-off-by: Yordan Karadzhov (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_functions.c | 65 ++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 27 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index f93723ca66bc..f37f73a9b1b8 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -31,9 +31,12 @@ static struct tracer_flags func_flags; /* Our option */ enum { + TRACE_FUNC_NO_OPTS = 0x0, /* No flags set. */ TRACE_FUNC_OPT_STACK = 0x1, }; +#define TRACE_FUNC_OPT_MASK (TRACE_FUNC_OPT_STACK) + int ftrace_allocate_ftrace_ops(struct trace_array *tr) { struct ftrace_ops *ops; @@ -86,6 +89,18 @@ void ftrace_destroy_function_files(struct trace_array *tr) ftrace_free_ftrace_ops(tr); } +static ftrace_func_t select_trace_function(u32 flags_val) +{ + switch (flags_val & TRACE_FUNC_OPT_MASK) { + case TRACE_FUNC_NO_OPTS: + return function_trace_call; + case TRACE_FUNC_OPT_STACK: + return function_stack_trace_call; + default: + return NULL; + } +} + static int function_trace_init(struct trace_array *tr) { ftrace_func_t func; @@ -97,12 +112,9 @@ static int function_trace_init(struct trace_array *tr) if (!tr->ops) return -ENOMEM; - /* Currently only the global instance can do stack tracing */ - if (tr->flags & TRACE_ARRAY_FL_GLOBAL && - func_flags.val & TRACE_FUNC_OPT_STACK) - func = function_stack_trace_call; - else - func = function_trace_call; + func = select_trace_function(func_flags.val); + if (!func) + return -EINVAL; ftrace_init_array_ops(tr, func); @@ -213,7 +225,7 @@ static struct tracer_opt func_opts[] = { }; static struct tracer_flags func_flags = { - .val = 0, /* By default: all flags disabled */ + .val = TRACE_FUNC_NO_OPTS, /* By default: all flags disabled */ .opts = func_opts }; @@ -235,30 +247,29 @@ static struct tracer function_trace; static int func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { - switch (bit) { - case TRACE_FUNC_OPT_STACK: - /* do nothing if already set */ - if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) - break; - - /* We can change this flag when not running. */ - if (tr->current_trace != &function_trace) - break; + ftrace_func_t func; + u32 new_flags; - unregister_ftrace_function(tr->ops); + /* Do nothing if already set. */ + if (!!set == !!(func_flags.val & bit)) + return 0; - if (set) { - tr->ops->func = function_stack_trace_call; - register_ftrace_function(tr->ops); - } else { - tr->ops->func = function_trace_call; - register_ftrace_function(tr->ops); - } + /* We can change this flag only when not running. */ + if (tr->current_trace != &function_trace) + return 0; - break; - default: + new_flags = (func_flags.val & ~bit) | (set ? bit : 0); + func = select_trace_function(new_flags); + if (!func) return -EINVAL; - } + + /* Check if there's anything to change. */ + if (tr->ops->func == func) + return 0; + + unregister_ftrace_function(tr->ops); + tr->ops->func = func; + register_ftrace_function(tr->ops); return 0; } -- cgit v1.2.3 From 22db095d57b51ff71aaa8ddba515180399f54334 Mon Sep 17 00:00:00 2001 From: "Yordan Karadzhov (VMware)" Date: Thu, 15 Apr 2021 21:18:54 +0300 Subject: tracing: Add "func_no_repeats" option for function tracing If the option is activated the function tracing record gets consolidated in the cases when a single function is called number of times consecutively. Instead of having an identical record for each call of the function we will record only the first call following by event showing the number of repeats. Link: https://lkml.kernel.org/r/20210415181854.147448-7-y.karadz@gmail.com Signed-off-by: Yordan Karadzhov (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_functions.c | 162 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 159 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index f37f73a9b1b8..1f0e63f5d1f9 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -27,15 +27,27 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); +static void +function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); +static void +function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, + struct ftrace_regs *fregs); static struct tracer_flags func_flags; /* Our option */ enum { - TRACE_FUNC_NO_OPTS = 0x0, /* No flags set. */ - TRACE_FUNC_OPT_STACK = 0x1, + + TRACE_FUNC_NO_OPTS = 0x0, /* No flags set. */ + TRACE_FUNC_OPT_STACK = 0x1, + TRACE_FUNC_OPT_NO_REPEATS = 0x2, + + /* Update this to next highest bit. */ + TRACE_FUNC_OPT_HIGHEST_BIT = 0x4 }; -#define TRACE_FUNC_OPT_MASK (TRACE_FUNC_OPT_STACK) +#define TRACE_FUNC_OPT_MASK (TRACE_FUNC_OPT_HIGHEST_BIT - 1) int ftrace_allocate_ftrace_ops(struct trace_array *tr) { @@ -96,11 +108,27 @@ static ftrace_func_t select_trace_function(u32 flags_val) return function_trace_call; case TRACE_FUNC_OPT_STACK: return function_stack_trace_call; + case TRACE_FUNC_OPT_NO_REPEATS: + return function_no_repeats_trace_call; + case TRACE_FUNC_OPT_STACK | TRACE_FUNC_OPT_NO_REPEATS: + return function_stack_no_repeats_trace_call; default: return NULL; } } +static bool handle_func_repeats(struct trace_array *tr, u32 flags_val) +{ + if (!tr->last_func_repeats && + (flags_val & TRACE_FUNC_OPT_NO_REPEATS)) { + tr->last_func_repeats = alloc_percpu(struct trace_func_repeats); + if (!tr->last_func_repeats) + return false; + } + + return true; +} + static int function_trace_init(struct trace_array *tr) { ftrace_func_t func; @@ -116,6 +144,9 @@ static int function_trace_init(struct trace_array *tr) if (!func) return -EINVAL; + if (!handle_func_repeats(tr, func_flags.val)) + return -ENOMEM; + ftrace_init_array_ops(tr, func); tr->array_buffer.cpu = raw_smp_processor_id(); @@ -217,10 +248,132 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, local_irq_restore(flags); } +static inline bool is_repeat_check(struct trace_array *tr, + struct trace_func_repeats *last_info, + unsigned long ip, unsigned long parent_ip) +{ + if (last_info->ip == ip && + last_info->parent_ip == parent_ip && + last_info->count < U16_MAX) { + last_info->ts_last_call = + ring_buffer_time_stamp(tr->array_buffer.buffer); + last_info->count++; + return true; + } + + return false; +} + +static inline void process_repeats(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + struct trace_func_repeats *last_info, + unsigned int trace_ctx) +{ + if (last_info->count) { + trace_last_func_repeats(tr, last_info, trace_ctx); + last_info->count = 0; + } + + last_info->ip = ip; + last_info->parent_ip = parent_ip; +} + +static void +function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + struct trace_func_repeats *last_info; + struct trace_array *tr = op->private; + struct trace_array_cpu *data; + unsigned int trace_ctx; + unsigned long flags; + int bit; + int cpu; + + if (unlikely(!tr->function_enabled)) + return; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + preempt_disable_notrace(); + + cpu = smp_processor_id(); + data = per_cpu_ptr(tr->array_buffer.data, cpu); + if (atomic_read(&data->disabled)) + goto out; + + /* + * An interrupt may happen at any place here. But as far as I can see, + * the only damage that this can cause is to mess up the repetition + * counter without valuable data being lost. + * TODO: think about a solution that is better than just hoping to be + * lucky. + */ + last_info = per_cpu_ptr(tr->last_func_repeats, cpu); + if (is_repeat_check(tr, last_info, ip, parent_ip)) + goto out; + + local_save_flags(flags); + trace_ctx = tracing_gen_ctx_flags(flags); + process_repeats(tr, ip, parent_ip, last_info, trace_ctx); + + trace_function(tr, ip, parent_ip, trace_ctx); + +out: + ftrace_test_recursion_unlock(bit); + preempt_enable_notrace(); +} + +static void +function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + struct trace_func_repeats *last_info; + struct trace_array *tr = op->private; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + unsigned int trace_ctx; + + if (unlikely(!tr->function_enabled)) + return; + + /* + * Need to use raw, since this must be called before the + * recursive protection is performed. + */ + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = per_cpu_ptr(tr->array_buffer.data, cpu); + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + last_info = per_cpu_ptr(tr->last_func_repeats, cpu); + if (is_repeat_check(tr, last_info, ip, parent_ip)) + goto out; + + trace_ctx = tracing_gen_ctx_flags(flags); + process_repeats(tr, ip, parent_ip, last_info, trace_ctx); + + trace_function(tr, ip, parent_ip, trace_ctx); + __trace_stack(tr, trace_ctx, STACK_SKIP); + } + + out: + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + static struct tracer_opt func_opts[] = { #ifdef CONFIG_STACKTRACE { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, #endif + { TRACER_OPT(func-no-repeats, TRACE_FUNC_OPT_NO_REPEATS) }, { } /* Always set a last empty entry */ }; @@ -267,6 +420,9 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (tr->ops->func == func) return 0; + if (!handle_func_repeats(tr, new_flags)) + return -ENOMEM; + unregister_ftrace_function(tr->ops); tr->ops->func = func; register_ftrace_function(tr->ops); -- cgit v1.2.3 From e1db6338d6fa0d409e45cf20ab5aeaca704f68e7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 15 Apr 2021 16:34:26 -0400 Subject: ftrace: Reuse the output of the function tracer for func_repeats The func_repeats event shows the output of the function tracer followed by a count of the number of repeats the previous function had made, as well as the timestamp of the last function that was repeated. The printing of the function should be the same as is for the function it is displaying. Reuse the code in trace_fn_trace() by making a helper function print_fn_trace() and use it for trace_func_repeats_print(). Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 3037f0c88f90..d0368a569bfa 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -845,6 +845,17 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, return trace_handle_return(&iter->seq); } +static void print_fn_trace(struct trace_seq *s, unsigned long ip, + unsigned long parent_ip, int flags) +{ + seq_print_ip_sym(s, ip, flags); + + if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) { + trace_seq_puts(s, " <-"); + seq_print_ip_sym(s, parent_ip, flags); + } +} + /* TRACE_FN */ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -854,13 +865,7 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - seq_print_ip_sym(s, field->ip, flags); - - if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { - trace_seq_puts(s, " <-"); - seq_print_ip_sym(s, field->parent_ip, flags); - } - + print_fn_trace(s, field->ip, field->parent_ip, flags); trace_seq_putc(s, '\n'); return trace_handle_return(s); @@ -1408,9 +1413,7 @@ trace_func_repeats_print(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - seq_print_ip_sym(s, field->ip, flags); - trace_seq_puts(s, " <-"); - seq_print_ip_sym(s, field->parent_ip, flags); + print_fn_trace(s, field->ip, field->parent_ip, flags); trace_seq_printf(s, " (repeats: %u, last_ts:", field->count); trace_print_time(s, iter, iter->ts - FUNC_REPEATS_GET_DELTA_TS(field)); -- cgit v1.2.3 From 785e3c0a3a870e72dc530856136ab4c8dd207128 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 27 Apr 2021 11:32:07 -0400 Subject: tracing: Map all PIDs to command lines The default max PID is set by PID_MAX_DEFAULT, and the tracing infrastructure uses this number to map PIDs to the comm names of the tasks, such output of the trace can show names from the recorded PIDs in the ring buffer. This mapping is also exported to user space via the "saved_cmdlines" file in the tracefs directory. But currently the mapping expects the PIDs to be less than PID_MAX_DEFAULT, which is the default maximum and not the real maximum. Recently, systemd will increases the maximum value of a PID on the system, and when tasks are traced that have a PID higher than PID_MAX_DEFAULT, its comm is not recorded. This leads to the entire trace to have "<...>" as the comm name, which is pretty useless. Instead, keep the array mapping the size of PID_MAX_DEFAULT, but instead of just mapping the index to the comm, map a mask of the PID (PID_MAX_DEFAULT - 1) to the comm, and find the full PID from the map_cmdline_to_pid array (that already exists). This bug goes back to the beginning of ftrace, but hasn't been an issue until user space started increasing the maximum value of PIDs. Link: https://lkml.kernel.org/r/20210427113207.3c601884@gandalf.local.home Cc: stable@vger.kernel.org Fixes: bc0c38d139ec7 ("ftrace: latency tracer infrastructure") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 41 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 66a4ad93b5e9..e28d08905124 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2390,14 +2390,13 @@ static void tracing_stop_tr(struct trace_array *tr) static int trace_save_cmdline(struct task_struct *tsk) { - unsigned pid, idx; + unsigned tpid, idx; /* treat recording of idle task as a success */ if (!tsk->pid) return 1; - if (unlikely(tsk->pid > PID_MAX_DEFAULT)) - return 0; + tpid = tsk->pid & (PID_MAX_DEFAULT - 1); /* * It's not the end of the world if we don't get @@ -2408,26 +2407,15 @@ static int trace_save_cmdline(struct task_struct *tsk) if (!arch_spin_trylock(&trace_cmdline_lock)) return 0; - idx = savedcmd->map_pid_to_cmdline[tsk->pid]; + idx = savedcmd->map_pid_to_cmdline[tpid]; if (idx == NO_CMDLINE_MAP) { idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; - /* - * Check whether the cmdline buffer at idx has a pid - * mapped. We are going to overwrite that entry so we - * need to clear the map_pid_to_cmdline. Otherwise we - * would read the new comm for the old pid. - */ - pid = savedcmd->map_cmdline_to_pid[idx]; - if (pid != NO_CMDLINE_MAP) - savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; - - savedcmd->map_cmdline_to_pid[idx] = tsk->pid; - savedcmd->map_pid_to_cmdline[tsk->pid] = idx; - + savedcmd->map_pid_to_cmdline[tpid] = idx; savedcmd->cmdline_idx = idx; } + savedcmd->map_cmdline_to_pid[idx] = tsk->pid; set_cmdline(idx, tsk->comm); arch_spin_unlock(&trace_cmdline_lock); @@ -2438,6 +2426,7 @@ static int trace_save_cmdline(struct task_struct *tsk) static void __trace_find_cmdline(int pid, char comm[]) { unsigned map; + int tpid; if (!pid) { strcpy(comm, ""); @@ -2449,16 +2438,16 @@ static void __trace_find_cmdline(int pid, char comm[]) return; } - if (pid > PID_MAX_DEFAULT) { - strcpy(comm, "<...>"); - return; + tpid = pid & (PID_MAX_DEFAULT - 1); + map = savedcmd->map_pid_to_cmdline[tpid]; + if (map != NO_CMDLINE_MAP) { + tpid = savedcmd->map_cmdline_to_pid[map]; + if (tpid == pid) { + strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); + return; + } } - - map = savedcmd->map_pid_to_cmdline[pid]; - if (map != NO_CMDLINE_MAP) - strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); - else - strcpy(comm, "<...>"); + strcpy(comm, "<...>"); } void trace_find_cmdline(int pid, char comm[]) -- cgit v1.2.3 From aafe104aa9096827a429bc1358f8260ee565b7cc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 30 Apr 2021 12:17:58 -0400 Subject: tracing: Restructure trace_clock_global() to never block It was reported that a fix to the ring buffer recursion detection would cause a hung machine when performing suspend / resume testing. The following backtrace was extracted from debugging that case: Call Trace: trace_clock_global+0x91/0xa0 __rb_reserve_next+0x237/0x460 ring_buffer_lock_reserve+0x12a/0x3f0 trace_buffer_lock_reserve+0x10/0x50 __trace_graph_return+0x1f/0x80 trace_graph_return+0xb7/0xf0 ? trace_clock_global+0x91/0xa0 ftrace_return_to_handler+0x8b/0xf0 ? pv_hash+0xa0/0xa0 return_to_handler+0x15/0x30 ? ftrace_graph_caller+0xa0/0xa0 ? trace_clock_global+0x91/0xa0 ? __rb_reserve_next+0x237/0x460 ? ring_buffer_lock_reserve+0x12a/0x3f0 ? trace_event_buffer_lock_reserve+0x3c/0x120 ? trace_event_buffer_reserve+0x6b/0xc0 ? trace_event_raw_event_device_pm_callback_start+0x125/0x2d0 ? dpm_run_callback+0x3b/0xc0 ? pm_ops_is_empty+0x50/0x50 ? platform_get_irq_byname_optional+0x90/0x90 ? trace_device_pm_callback_start+0x82/0xd0 ? dpm_run_callback+0x49/0xc0 With the following RIP: RIP: 0010:native_queued_spin_lock_slowpath+0x69/0x200 Since the fix to the recursion detection would allow a single recursion to happen while tracing, this lead to the trace_clock_global() taking a spin lock and then trying to take it again: ring_buffer_lock_reserve() { trace_clock_global() { arch_spin_lock() { queued_spin_lock_slowpath() { /* lock taken */ (something else gets traced by function graph tracer) ring_buffer_lock_reserve() { trace_clock_global() { arch_spin_lock() { queued_spin_lock_slowpath() { /* DEAD LOCK! */ Tracing should *never* block, as it can lead to strange lockups like the above. Restructure the trace_clock_global() code to instead of simply taking a lock to update the recorded "prev_time" simply use it, as two events happening on two different CPUs that calls this at the same time, really doesn't matter which one goes first. Use a trylock to grab the lock for updating the prev_time, and if it fails, simply try again the next time. If it failed to be taken, that means something else is already updating it. Link: https://lkml.kernel.org/r/20210430121758.650b6e8a@gandalf.local.home Cc: stable@vger.kernel.org Tested-by: Konstantin Kharlamov Tested-by: Todd Brandt Fixes: b02414c8f045 ("ring-buffer: Fix recursion protection transitions between interrupt context") # started showing the problem Fixes: 14131f2f98ac3 ("tracing: implement trace_clock_*() APIs") # where the bug happened Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=212761 Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_clock.c | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index aaf6793ededa..c1637f90c8a3 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -95,33 +95,49 @@ u64 notrace trace_clock_global(void) { unsigned long flags; int this_cpu; - u64 now; + u64 now, prev_time; raw_local_irq_save(flags); this_cpu = raw_smp_processor_id(); - now = sched_clock_cpu(this_cpu); + /* - * If in an NMI context then dont risk lockups and return the - * cpu_clock() time: + * The global clock "guarantees" that the events are ordered + * between CPUs. But if two events on two different CPUS call + * trace_clock_global at roughly the same time, it really does + * not matter which one gets the earlier time. Just make sure + * that the same CPU will always show a monotonic clock. + * + * Use a read memory barrier to get the latest written + * time that was recorded. */ - if (unlikely(in_nmi())) - goto out; + smp_rmb(); + prev_time = READ_ONCE(trace_clock_struct.prev_time); + now = sched_clock_cpu(this_cpu); - arch_spin_lock(&trace_clock_struct.lock); + /* Make sure that now is always greater than prev_time */ + if ((s64)(now - prev_time) < 0) + now = prev_time + 1; /* - * TODO: if this happens often then maybe we should reset - * my_scd->clock to prev_time+1, to make sure - * we start ticking with the local clock from now on? + * If in an NMI context then dont risk lockups and simply return + * the current time. */ - if ((s64)(now - trace_clock_struct.prev_time) < 0) - now = trace_clock_struct.prev_time + 1; + if (unlikely(in_nmi())) + goto out; - trace_clock_struct.prev_time = now; + /* Tracing can cause strange recursion, always use a try lock */ + if (arch_spin_trylock(&trace_clock_struct.lock)) { + /* Reread prev_time in case it was already updated */ + prev_time = READ_ONCE(trace_clock_struct.prev_time); + if ((s64)(now - prev_time) < 0) + now = prev_time + 1; - arch_spin_unlock(&trace_clock_struct.lock); + trace_clock_struct.prev_time = now; + /* The unlock acts as the wmb for the above rmb */ + arch_spin_unlock(&trace_clock_struct.lock); + } out: raw_local_irq_restore(flags); -- cgit v1.2.3