From 6a02ad66b2c44155d529f430d4fa5c6c66321077 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 3 Feb 2014 18:11:08 +0100
Subject: perf/x86: Push the duration-logging printk() to IRQ context

Calling printk() from NMI context is bad (TM), so move it to IRQ
context. This also avoids the problem where the printk() time is
measured by the generic NMI duration goo and triggers a second warning.

Signed-off-by: Peter Zijlstra
Cc: Don Zickus
Cc: Dave Hansen
Link: http://lkml.kernel.org/n/tip-75dv35xf6dhhmeb7nq6fua31@git.kernel.org
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 56003c6edfd3..2067cbb378eb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -231,11 +231,29 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 #define NR_ACCUMULATED_SAMPLES 128
 static DEFINE_PER_CPU(u64, running_sample_length);
 
-void perf_sample_event_took(u64 sample_len_ns)
+static void perf_duration_warn(struct irq_work *w)
 {
+	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
 	u64 avg_local_sample_len;
 	u64 local_samples_len;
+
+	local_samples_len = __get_cpu_var(running_sample_length);
+	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+	printk_ratelimited(KERN_WARNING
+			"perf interrupt took too long (%lld > %lld), lowering "
+			"kernel.perf_event_max_sample_rate to %d\n",
+			avg_local_sample_len, allowed_ns,
+			sysctl_perf_event_sample_rate);
+}
+
+static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
+
+void perf_sample_event_took(u64 sample_len_ns)
+{
 	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
+	u64 avg_local_sample_len;
+	u64 local_samples_len;
 
 	if (allowed_ns == 0)
 		return;
@@ -263,13 +281,9 @@ void perf_sample_event_took(u64 sample_len_ns)
 	sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
 	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 
-	printk_ratelimited(KERN_WARNING
-			"perf samples too long (%lld > %lld), lowering "
-			"kernel.perf_event_max_sample_rate to %d\n",
-			avg_local_sample_len, allowed_ns,
-			sysctl_perf_event_sample_rate);
-
 	update_perf_cpu_limits();
+
+	irq_work_queue(&perf_duration_work);
 }
 
 static atomic64_t perf_event_id;
--
cgit v1.2.3
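
For reference, the deferral pattern the patch above introduces looks like
this in isolation: bind an irq_work to a callback at build time, queue it
from the NMI path, and let the callback do the printk() later in IRQ
context. A minimal sketch against the irq_work API of this era; the
defer_warn names are illustrative, not from the patch:

	#include <linux/irq_work.h>
	#include <linux/printk.h>

	/* Runs later from the irq_work interrupt, where printk() is safe. */
	static void defer_warn(struct irq_work *w)
	{
		pr_warn("deferred warning raised from NMI context\n");
	}

	static DEFINE_IRQ_WORK(defer_warn_work, defer_warn);

	/* NMI path: only queue the work; irq_work_queue() is NMI-safe,
	 * and queueing while already pending is a no-op. */
	static void nmi_path(void)
	{
		irq_work_queue(&defer_warn_work);
	}
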
From cd578abb24aa67ce468c427d3356c08ea32cf768 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 11 Feb 2014 16:01:16 +0100
Subject: perf/x86: Warn to early_printk() in case irq_work is too slow

On Mon, Feb 10, 2014 at 08:45:16AM -0800, Dave Hansen wrote:
> The reason I coded this up was that NMIs were firing off so fast that
> nothing else was getting a chance to run. With this patch, at least the
> printk() would come out and I'd have some idea what was going on.

It will start spewing to early_printk() (which is a lot nicer to use
from NMI context too) when it fails to queue the IRQ-work because it's
already enqueued.

It does have a false positive for when two CPUs trigger the warning
concurrently, but that should be rare, and some extra clutter on the
early printk shouldn't be a problem.

Cc: hpa@zytor.com
Cc: tglx@linutronix.de
Cc: dzickus@redhat.com
Cc: Dave Hansen
Cc: mingo@kernel.org
Fixes: 6a02ad66b2c4 ("perf/x86: Push the duration-logging printk() to IRQ context")
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/20140211150116.GO27965@twins.programming.kicks-ass.net
Signed-off-by: Thomas Gleixner
---
 include/linux/irq_work.h | 2 +-
 kernel/events/core.c     | 9 +++++++--
 kernel/irq_work.c        | 6 ++++--
 3 files changed, 12 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index add13c8624b7..19ae05d4b8ec 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -32,7 +32,7 @@ void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))
 
 #define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { .func = (_f), }
 
-void irq_work_queue(struct irq_work *work);
+bool irq_work_queue(struct irq_work *work);
 void irq_work_run(void);
 void irq_work_sync(struct irq_work *work);
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2067cbb378eb..45e5543e2a1e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -243,7 +243,7 @@ static void perf_duration_warn(struct irq_work *w)
 	printk_ratelimited(KERN_WARNING
 			"perf interrupt took too long (%lld > %lld), lowering "
 			"kernel.perf_event_max_sample_rate to %d\n",
-			avg_local_sample_len, allowed_ns,
+			avg_local_sample_len, allowed_ns >> 1,
 			sysctl_perf_event_sample_rate);
 }
 
@@ -283,7 +283,12 @@ void perf_sample_event_took(u64 sample_len_ns)
 
 	update_perf_cpu_limits();
 
-	irq_work_queue(&perf_duration_work);
+	if (!irq_work_queue(&perf_duration_work)) {
+		early_printk("perf interrupt took too long (%lld > %lld), lowering "
+			     "kernel.perf_event_max_sample_rate to %d\n",
+			     avg_local_sample_len, allowed_ns >> 1,
+			     sysctl_perf_event_sample_rate);
+	}
 }
 
 static atomic64_t perf_event_id;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 55fcce6065cf..a82170e2fa78 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -61,11 +61,11 @@ void __weak arch_irq_work_raise(void)
  *
  * Can be re-enqueued while the callback is still in progress.
  */
-void irq_work_queue(struct irq_work *work)
+bool irq_work_queue(struct irq_work *work)
 {
 	/* Only queue if not already pending */
 	if (!irq_work_claim(work))
-		return;
+		return false;
 
 	/* Queue the entry and raise the IPI if needed. */
 	preempt_disable();
@@ -83,6 +83,8 @@ void irq_work_queue(struct irq_work *work)
 	}
 
 	preempt_enable();
+
+	return true;
 }
 EXPORT_SYMBOL_GPL(irq_work_queue);
 
--
cgit v1.2.3
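
With irq_work_queue() now reporting whether the work was newly queued, a
caller can detect that a previous report is still in flight and fall back
to a console that is safe from NMI context. A hedged caller-side sketch;
warn_slow_interrupt() is a hypothetical helper, not part of the patch:

	/* Defer the warning when possible; if the irq_work is still
	 * pending (its IRQ hasn't run yet), spew to early_printk(),
	 * which avoids the locks that make printk() NMI-unsafe. */
	static void warn_slow_interrupt(u64 avg_ns, u64 limit_ns)
	{
		if (!irq_work_queue(&perf_duration_work))
			early_printk("perf interrupt took too long (%lld > %lld)\n",
				     avg_ns, limit_ns);
	}
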
From 9e3170411ed171a126f4dca1672012a33efe59e5 Mon Sep 17 00:00:00 2001
From: Mark Rutland
Date: Mon, 10 Feb 2014 17:44:18 +0000
Subject: perf: Fix prototype of find_pmu_context()

For some reason find_pmu_context() is defined as returning void *
rather than a __percpu struct perf_cpu_context *. As all the requisite
types are defined in advance, there's no reason to keep it that way.

This patch modifies the prototype of find_pmu_context() to return a
__percpu struct perf_cpu_context *.

Signed-off-by: Mark Rutland
Signed-off-by: Peter Zijlstra
Reviewed-by: Dave Martin
Acked-by: Will Deacon
Link: http://lkml.kernel.org/r/1392054264-23570-2-git-send-email-mark.rutland@arm.com
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index fa990061aa6c..425159882a6f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6313,7 +6313,7 @@ static int perf_event_idx_default(struct perf_event *event)
  * Ensures all contexts with the same task_ctx_nr have the same
  * pmu_cpu_context too.
  */
-static void *find_pmu_context(int ctxn)
+static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
 {
 	struct pmu *pmu;
 
--
cgit v1.2.3

From fdded676c3ef680bf1abc415d307d7e69a6768d1 Mon Sep 17 00:00:00 2001
From: Mark Rutland
Date: Mon, 10 Feb 2014 17:44:19 +0000
Subject: perf: Remove redundant PMU assignment

Currently perf_branch_stack_sched_in iterates over the set of pmus,
checks that each pmu has a flush_branch_stack callback, then overwrites
the pmu before calling the callback. This is either redundant or
broken.

In systems with a single hw pmu, pmu == cpuctx->ctx.pmu, and thus the
assignment is redundant.

In systems with multiple hw pmus (i.e. multiple pmus with task_ctx_nr
== perf_hw_context) the pmus share the same perf_cpu_context. Thus the
assignment can cause one of the pmus to flush its branch stack
repeatedly rather than causing each of the pmus to flush their branch
stacks. Worse still, if only some pmus have the callback, the
assignment can result in a branch to NULL.

This patch removes the redundant assignment.

Signed-off-by: Mark Rutland
Acked-by: Will Deacon
Signed-off-by: Peter Zijlstra
Cc: Stephane Eranian
Link: http://lkml.kernel.org/r/1392054264-23570-3-git-send-email-mark.rutland@arm.com
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 425159882a6f..823a53d72d6a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2582,8 +2582,6 @@ static void perf_branch_stack_sched_in(struct task_struct *prev,
 		if (cpuctx->ctx.nr_branch_stack > 0
 		    && pmu->flush_branch_stack) {
 
-			pmu = cpuctx->ctx.pmu;
-
 			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 
 			perf_pmu_disable(pmu);
--
cgit v1.2.3
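
Condensed, the post-patch shape of the iteration is roughly the following
(locking and pmu disable/enable elided); this is an abridged sketch of the
fixed loop, not the verbatim function:

	list_for_each_entry_rcu(pmu, &pmus, entry) {
		struct perf_cpu_context *cpuctx =
			this_cpu_ptr(pmu->pmu_cpu_context);

		/* 'pmu' is never rebound to cpuctx->ctx.pmu, so the callback
		 * that was checked is the one that gets called, and each pmu
		 * sharing the perf_cpu_context flushes its own branch stack. */
		if (cpuctx->ctx.nr_branch_stack > 0 && pmu->flush_branch_stack)
			pmu->flush_branch_stack();
	}
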
From 4a2345937c17722bd2979f662ae909846b4a052a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 24 Feb 2014 12:43:31 +0100
Subject: perf: Optimize group_sched_in()

Use the ctx pmu instead of the event pmu.

When a group leader is a software event but the group contains
hardware events, the entire group is on the hardware PMU.

Using the hardware PMU for the transaction makes the most sense, since
that's the most expensive one to program (and software PMUs generally
don't have TXN support anyway).

Signed-off-by: Peter Zijlstra
Cc: Stephane Eranian
Link: http://lkml.kernel.org/n/tip-sctoo9t2f3nn2c9g568928q3@git.kernel.org
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 823a53d72d6a..661951ab8ae7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1733,7 +1733,7 @@ group_sched_in(struct perf_event *group_event,
 	       struct perf_event_context *ctx)
 {
 	struct perf_event *event, *partial_group = NULL;
-	struct pmu *pmu = group_event->pmu;
+	struct pmu *pmu = ctx->pmu;
 	u64 now = ctx->time;
 	bool simulate = false;
 
--
cgit v1.2.3

From cfa77bc4af2c75c0781ee76cde2dd104c6c8e2b7 Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Sun, 2 Mar 2014 16:56:38 +0100
Subject: perf: Disallow user-space callchains for function trace events

Recent issues with user space callchains processing within page fault
handler tracing showed, as Peter said, 'there's just too much fail
surface'.

Related list discussions:

  http://marc.info/?t=139302086500001&r=1&w=2
  http://marc.info/?t=139301437300003&r=1&w=2

Suggested-by: Peter Zijlstra
Signed-off-by: Jiri Olsa
Cc: Paul Mackerras
Cc: Arnaldo Carvalho de Melo
Cc: "H. Peter Anvin"
Cc: Vince Weaver
Cc: Steven Rostedt
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/1393775800-13524-2-git-send-email-jolsa@redhat.com
Signed-off-by: Ingo Molnar
---
 kernel/trace/trace_event_perf.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index e854f420e033..d5e01c3f4e69 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -31,9 +31,18 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
 	}
 
 	/* The ftrace function trace is allowed only for root. */
-	if (ftrace_event_is_function(tp_event) &&
-	    perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
-		return -EPERM;
+	if (ftrace_event_is_function(tp_event)) {
+		if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		/*
+		 * We don't allow user space callchains for function trace
+		 * events, due to issues with page faults while tracing the
+		 * page fault handler and its overall tricky nature.
+		 */
+		if (!p_event->attr.exclude_callchain_user)
+			return -EINVAL;
+	}
 
 	/* No tracing, just counting, so no obvious leak */
 	if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
--
cgit v1.2.3
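
The user-visible contract: opening a function trace event with callchain
sampling now requires excluding the user part explicitly. A hypothetical
user-space sketch (attr.config, not shown, would name a function trace
event; the field values here are placeholders):

	struct perf_event_attr attr = {
		.type        = PERF_TYPE_TRACEPOINT,
		.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN,
		/* Mandatory for function trace events after this patch;
		 * without it, perf_event_open() fails with EINVAL. */
		.exclude_callchain_user = 1,
	};
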
From 63c45f4ba533e9749da16298db53e491c25d805b Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Sun, 2 Mar 2014 16:56:39 +0100
Subject: perf: Disallow user-space stack dumps for function trace events

Recent issues with user space callchains processing within page fault
handler tracing showed, as Peter said, 'there's just too much fail
surface'. The user space stack dump is just another source of this
issue.

Related list discussions:

  http://marc.info/?t=139302086500001&r=1&w=2
  http://marc.info/?t=139301437300003&r=1&w=2

Suggested-by: Peter Zijlstra
Signed-off-by: Jiri Olsa
Cc: Paul Mackerras
Cc: Arnaldo Carvalho de Melo
Cc: Vince Weaver
Cc: Steven Rostedt
Cc: H. Peter Anvin
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/1393775800-13524-3-git-send-email-jolsa@redhat.com
Signed-off-by: Ingo Molnar
---
 kernel/trace/trace_event_perf.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index d5e01c3f4e69..c894614de14d 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -42,6 +42,13 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
 		 */
 		if (!p_event->attr.exclude_callchain_user)
 			return -EINVAL;
+
+		/*
+		 * Same reason to disable user stack dump as for user space
+		 * callchains above.
+		 */
+		if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER)
+			return -EINVAL;
 	}
 
 	/* No tracing, just counting, so no obvious leak */
--
cgit v1.2.3
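
Together with the callchain patch above, both user-space introspection
paths are now closed for function trace events. An illustrative attr that
this patch starts rejecting (hypothetical values; attr.config would again
name a function trace event):

	struct perf_event_attr attr = {
		.type                   = PERF_TYPE_TRACEPOINT,
		.sample_type            = PERF_SAMPLE_RAW | PERF_SAMPLE_STACK_USER,
		.exclude_callchain_user = 1,	/* passes the callchain check */
		.sample_stack_user      = 8192,	/* but the stack dump is refused */
	};

	/* perf_event_open() on a function trace event now fails with EINVAL
	 * for any attr that has PERF_SAMPLE_STACK_USER set. */
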