From 36009d07b79d2a168d6037947357d96e5d8cebe7 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Tue, 6 Aug 2013 18:08:41 +0200
Subject: tracing/perf: Expand TRACE_EVENT(sched_stat_runtime)

To simplify the review of the next patches:

1. We are going to reimplement __perf_task/__perf_count and embed them
   into TP_ARGS(). Expand TRACE_EVENT(sched_stat_runtime) into
   DECLARE_EVENT_CLASS() + DEFINE_EVENT() so that the two can use
   different TP_ARGS()s.

2. Change the perf_trace_##call() macro to do perf_fetch_caller_regs()
   right before perf_trace_buf_prepare(). This way TP_ARGS() is
   evaluated as early as possible; the next patch exploits this fact.

   Note: after 87f44bbc, perf_trace_buf_prepare() doesn't need
   "struct pt_regs *regs", so perhaps it makes sense to remove this
   argument. And perhaps we can teach perf_trace_buf_submit() to
   accept regs == NULL and do fetch_caller_regs(CALLER_ADDR1) in
   that case.

3. Cosmetic, but the typecast from "void *" buys nothing. It just adds
   noise; remove it.

Link: http://lkml.kernel.org/r/20130806160841.GA2736@redhat.com

Acked-by: Peter Zijlstra
Tested-by: David Ahern
Signed-off-by: Oleg Nesterov
Signed-off-by: Steven Rostedt
---
 include/trace/events/sched.h | 6 +++++-
 include/trace/ftrace.h       | 7 +++----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index e5586caff67a..249c024e67ae 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -372,7 +372,7 @@ DEFINE_EVENT(sched_stat_template, sched_stat_blocked,
  * Tracepoint for accounting runtime (time the task is executing
  * on a CPU).
  */
-TRACE_EVENT(sched_stat_runtime,
+DECLARE_EVENT_CLASS(sched_stat_runtime,
 
 	TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime),
 
@@ -401,6 +401,10 @@ TRACE_EVENT(sched_stat_runtime,
 			(unsigned long long)__entry->vruntime)
 );
 
+DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime,
+	     TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime),
+	     TP_ARGS(tsk, runtime, vruntime));
+
 /*
  * Tracepoint for showing priority inheritance modifying a tasks
  * priority.
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 41a6643e2136..618af05f0be6 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -663,15 +663,14 @@ perf_trace_##call(void *__data, proto)				\
 	int __data_size;						\
 	int rctx;							\
 									\
-	perf_fetch_caller_regs(&__regs);				\
-									\
 	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
 	__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
 			     sizeof(u64));				\
 	__entry_size -= sizeof(u32);					\
 									\
-	entry = (struct ftrace_raw_##call *)perf_trace_buf_prepare(	\
-		__entry_size, event_call->event.type, &__regs, &rctx);	\
+	perf_fetch_caller_regs(&__regs);				\
+	entry = perf_trace_buf_prepare(__entry_size,			\
+			event_call->event.type, &__regs, &rctx);	\
 	if (!entry)							\
 		return;							\
 									\
-- 
cgit v1.2.3
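Aside (annotation, not part of the commit): the expansion above is
semantically a no-op, because include/trace/ftrace.h of this era defines
TRACE_EVENT() as an event class with a single instance, roughly like this
(a simplified sketch; the exact form varies by kernel version):

	#undef TRACE_EVENT
	#define TRACE_EVENT(name, proto, args, tstruct, assign, print)	\
		DECLARE_EVENT_CLASS(name,				\
				    PARAMS(proto),			\
				    PARAMS(args),			\
				    PARAMS(tstruct),			\
				    PARAMS(assign),			\
				    PARAMS(print));			\
		DEFINE_EVENT(name, name, PARAMS(proto), PARAMS(args))

Open-coding this expansion for sched_stat_runtime means the class and its
DEFINE_EVENT() instance no longer have to share one TP_ARGS(), which is
exactly what the next patch needs.

From 12473965c38a527a0c6f7a38d23edce60957f873 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Tue, 6 Aug 2013 18:08:44 +0200
Subject: tracing/perf: Reimplement TP_perf_assign() logic

The next patch tries to avoid the costly perf_trace_buf_* calls when
possible, but there is a problem: we can only do this if __task == NULL,
because perf_tp_event(task != NULL) has additional code for that case.

Unfortunately, TP_perf_assign()/__perf_xxx(), which changes the default
values of the __count/__task variables for perf_trace_buf_submit(), runs
"too late", after we have already done perf_trace_buf_prepare(), so the
optimization above can't work.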
So this patch simply embeds __perf_xxx() into TP_ARGS(); this way
DECLARE_EVENT_CLASS() can use the result of the assignments hidden in
"args" right after ftrace_get_offsets_##call(), which is mostly trivial.
This allows us to have the fast-path "__task != NULL" check at the
start; see the next patch.

Link: http://lkml.kernel.org/r/20130806160844.GA2739@redhat.com

Tested-by: David Ahern
Acked-by: Peter Zijlstra
Signed-off-by: Oleg Nesterov
Signed-off-by: Steven Rostedt
---
 include/trace/events/sched.h | 16 +++-------------
 include/trace/ftrace.h       | 19 +++++++++++--------
 2 files changed, 14 insertions(+), 21 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 249c024e67ae..2e7d9947a10d 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -57,7 +57,7 @@ DECLARE_EVENT_CLASS(sched_wakeup_template,
 
 	TP_PROTO(struct task_struct *p, int success),
 
-	TP_ARGS(p, success),
+	TP_ARGS(__perf_task(p), success),
 
 	TP_STRUCT__entry(
 		__array(	char,	comm,	TASK_COMM_LEN	)
@@ -73,9 +73,6 @@ DECLARE_EVENT_CLASS(sched_wakeup_template,
 		__entry->prio		= p->prio;
 		__entry->success	= success;
 		__entry->target_cpu	= task_cpu(p);
-	)
-	TP_perf_assign(
-		__perf_task(p);
 	),
 
 	TP_printk("comm=%s pid=%d prio=%d success=%d target_cpu=%03d",
@@ -313,7 +310,7 @@ DECLARE_EVENT_CLASS(sched_stat_template,
 
 	TP_PROTO(struct task_struct *tsk, u64 delay),
 
-	TP_ARGS(tsk, delay),
+	TP_ARGS(__perf_task(tsk), __perf_count(delay)),
 
 	TP_STRUCT__entry(
 		__array( char,	comm,	TASK_COMM_LEN	)
@@ -325,10 +322,6 @@ DECLARE_EVENT_CLASS(sched_stat_template,
 		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
 		__entry->pid	= tsk->pid;
 		__entry->delay	= delay;
-	)
-	TP_perf_assign(
-		__perf_count(delay);
-		__perf_task(tsk);
 	),
 
 	TP_printk("comm=%s pid=%d delay=%Lu [ns]",
@@ -376,7 +369,7 @@ DECLARE_EVENT_CLASS(sched_stat_runtime,
 
 	TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime),
 
-	TP_ARGS(tsk, runtime, vruntime),
+	TP_ARGS(tsk, __perf_count(runtime), vruntime),
 
 	TP_STRUCT__entry(
 		__array( char,	comm,	TASK_COMM_LEN	)
@@ -390,9 +383,6 @@ DECLARE_EVENT_CLASS(sched_stat_runtime,
 		__entry->pid		= tsk->pid;
 		__entry->runtime	= runtime;
 		__entry->vruntime	= vruntime;
-	)
-	TP_perf_assign(
-		__perf_count(runtime);
 	),
 
 	TP_printk("comm=%s pid=%d runtime=%Lu [ns] vruntime=%Lu [ns]",
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 618af05f0be6..4163d93ccf38 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -507,8 +507,14 @@ static inline notrace int ftrace_get_offsets_##call(	\
 #undef TP_fast_assign
 #define TP_fast_assign(args...) args
 
-#undef TP_perf_assign
-#define TP_perf_assign(args...)
+#undef __perf_addr
+#define __perf_addr(a)	(a)
+
+#undef __perf_count
+#define __perf_count(c)	(c)
+
+#undef __perf_task
+#define __perf_task(t)	(t)
 
 #undef DECLARE_EVENT_CLASS
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
@@ -636,16 +642,13 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call
 #define __get_str(field) (char *)__get_dynamic_array(field)
 
 #undef __perf_addr
-#define __perf_addr(a)	__addr = (a)
+#define __perf_addr(a)	(__addr = (a))
 
 #undef __perf_count
-#define __perf_count(c)	__count = (c)
+#define __perf_count(c)	(__count = (c))
 
 #undef __perf_task
-#define __perf_task(t)	__task = (t)
-
-#undef TP_perf_assign
-#define TP_perf_assign(args...)	args
+#define __perf_task(t)	(__task = (t))
 
 #undef DECLARE_EVENT_CLASS
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
-- 
cgit v1.2.3
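Aside (annotation, not part of the commit): the trick is that a single
__perf_task()/__perf_count() written once in TP_ARGS() expands
differently in each generated probe, because ftrace.h redefines the
wrapper before each re-expansion of the event class. A minimal
standalone sketch of the pattern (simplified names; this is not the
kernel's generated code):

	#include <stdio.h>

	struct task { int pid; };

	/* Generic expansion: the wrapper is an identity macro. */
	#define __perf_task(t) (t)

	static void ftrace_probe(struct task *t)
	{
		printf("ftrace probe: pid=%d\n", t->pid);
	}

	/*
	 * Perf expansion: the same wrapper is redefined so that merely
	 * evaluating the argument assigns __task as a side effect.
	 */
	#undef __perf_task
	#define __perf_task(t) (__task = (t))

	static void perf_probe(struct task *arg)
	{
		struct task *__task = NULL;
		struct task *t = __perf_task(arg);	/* __task captured up front */

		printf("perf probe: pid=%d, __task %s\n", t->pid,
		       __task ? "captured" : "NULL");
	}

	int main(void)
	{
		struct task t = { .pid = 1 };

		ftrace_probe(&t);
		perf_probe(&t);
		return 0;
	}

Because the assignment now happens while the argument list itself is
evaluated, __task is already known right after
ftrace_get_offsets_##call(), early enough for the fast-path check the
next commit adds.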
From d027e6a9c83440bf1ca9e5503539d58d8e0914f1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Tue, 6 Aug 2013 18:08:47 +0200
Subject: tracing/perf: Avoid perf_trace_buf_*() in perf_trace_##call() when
 possible

perf_trace_buf_prepare() + perf_trace_buf_submit(task => NULL) make no
sense if hlist_empty(head). Change perf_trace_##call() to check
->perf_events beforehand and do nothing if it is empty.

This removes the overhead for tasks without events associated with
them. For example, "perf record -e sched:sched_switch -p1" attaches the
counter(s) to the single task, but every task in the system will do
perf_trace_buf_prepare/submit() just to realize that it was not
attached to this event.

However, we can only do this if __task == NULL, so we also add the
__builtin_constant_p(__task) check.

With this patch "perf bench sched pipe" shows approximately 4%
improvement when "perf record -p1" runs in parallel; many thanks to
Steven for the testing.

Link: http://lkml.kernel.org/r/20130806160847.GA2746@redhat.com

Tested-by: David Ahern
Acked-by: Peter Zijlstra
Signed-off-by: Oleg Nesterov
Signed-off-by: Steven Rostedt
---
 include/trace/ftrace.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 4163d93ccf38..5c7ab17cbb02 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -667,6 +667,12 @@ perf_trace_##call(void *__data, proto)				\
 	int rctx;							\
 									\
 	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
+									\
+	head = this_cpu_ptr(event_call->perf_events);			\
+	if (__builtin_constant_p(!__task) && !__task &&			\
+				hlist_empty(head))			\
+		return;							\
+									\
 	__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
 			     sizeof(u64));				\
 	__entry_size -= sizeof(u32);					\
@@ -681,7 +687,6 @@ perf_trace_##call(void *__data, proto)				\
 									\
 	{ assign; }							\
 									\
-	head = this_cpu_ptr(event_call->perf_events);			\
 	perf_trace_buf_submit(entry, __entry_size, rctx, __addr,	\
 		__count, &__regs, head, __task);			\
 }
-- 
cgit v1.2.3
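Aside (annotation, not part of the commit): the __builtin_constant_p()
guard works because of the previous patch. When an event's TP_ARGS()
contains no __perf_task(), the local __task stays at its NULL
initializer, so with optimization the compiler can prove !__task is the
constant 1 and keeps the early return; when __perf_task() does assign
it, the expression is no longer a compile-time constant and the fast
path is compiled out. A standalone sketch under those assumptions
(gcc/clang, compile with -O2; not the kernel macro):

	#include <stdio.h>

	static int counter;	/* stand-in for per-task state */

	static int fast_path_without_task(void)
	{
		void *__task = NULL;	/* never reassigned */

		/* !__task folds to the constant 1: returns 1 at -O2. */
		return __builtin_constant_p(!__task) && !__task;
	}

	static int fast_path_with_task(void *runtime_task)
	{
		void *__task = NULL;

		__task = runtime_task;	/* what __perf_task() would do */

		/* !__task now depends on a runtime value: returns 0. */
		return __builtin_constant_p(!__task) && !__task;
	}

	int main(int argc, char **argv)
	{
		printf("no __perf_task():   fast path %d\n",
		       fast_path_without_task());
		printf("with __perf_task(): fast path %d\n",
		       fast_path_with_task(argc > 1 ? argv[1] : &counter));
		return 0;
	}

Note that __builtin_constant_p() is evaluated after optimization, so at
-O0 even the first case may report 0; in the kernel macro this is fine,
since losing the fold merely disables the shortcut, never breaks it.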