From 6a02ad66b2c44155d529f430d4fa5c6c66321077 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 3 Feb 2014 18:11:08 +0100
Subject: perf/x86: Push the duration-logging printk() to IRQ context

Calling printk() from NMI context is bad (TM), so move it to IRQ
context. This also avoids the problem where the printk() time is
measured by the generic NMI duration goo and triggers a second warning.

Signed-off-by: Peter Zijlstra
Cc: Don Zickus
Cc: Dave Hansen
Link: http://lkml.kernel.org/n/tip-75dv35xf6dhhmeb7nq6fua31@git.kernel.org
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 56003c6edfd3..2067cbb378eb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -231,11 +231,29 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 #define NR_ACCUMULATED_SAMPLES 128
 static DEFINE_PER_CPU(u64, running_sample_length);
 
-void perf_sample_event_took(u64 sample_len_ns)
+static void perf_duration_warn(struct irq_work *w)
 {
+	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
 	u64 avg_local_sample_len;
 	u64 local_samples_len;
+
+	local_samples_len = __get_cpu_var(running_sample_length);
+	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+	printk_ratelimited(KERN_WARNING
+			"perf interrupt took too long (%lld > %lld), lowering "
+			"kernel.perf_event_max_sample_rate to %d\n",
+			avg_local_sample_len, allowed_ns,
+			sysctl_perf_event_sample_rate);
+}
+
+static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
+
+void perf_sample_event_took(u64 sample_len_ns)
+{
 	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
+	u64 avg_local_sample_len;
+	u64 local_samples_len;
 
 	if (allowed_ns == 0)
 		return;
@@ -263,13 +281,9 @@ void perf_sample_event_took(u64 sample_len_ns)
 	sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
 	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 
-	printk_ratelimited(KERN_WARNING
-			"perf samples too long (%lld > %lld), lowering "
-			"kernel.perf_event_max_sample_rate to %d\n",
-			avg_local_sample_len, allowed_ns,
-			sysctl_perf_event_sample_rate);
-
 	update_perf_cpu_limits();
+
+	irq_work_queue(&perf_duration_work);
 }
 
 static atomic64_t perf_event_id;
--
cgit v1.2.3
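
For reference, the deferral pattern the patch above introduces looks like
this in isolation: bind an irq_work to a callback at build time, queue it
from the NMI path, and let the callback do the printk() later in IRQ
context. A minimal sketch against the irq_work API of this era; the
defer_warn names are illustrative, not from the patch:

	#include <linux/irq_work.h>
	#include <linux/printk.h>

	/* Runs later from the irq_work interrupt, where printk() is safe. */
	static void defer_warn(struct irq_work *w)
	{
		pr_warn("deferred warning raised from NMI context\n");
	}

	static DEFINE_IRQ_WORK(defer_warn_work, defer_warn);

	/* NMI path: only queue the work; irq_work_queue() is NMI-safe,
	 * and queueing while already pending is a no-op. */
	static void nmi_path(void)
	{
		irq_work_queue(&defer_warn_work);
	}
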
From cd578abb24aa67ce468c427d3356c08ea32cf768 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 11 Feb 2014 16:01:16 +0100
Subject: perf/x86: Warn to early_printk() in case irq_work is too slow

On Mon, Feb 10, 2014 at 08:45:16AM -0800, Dave Hansen wrote:
> The reason I coded this up was that NMIs were firing off so fast that
> nothing else was getting a chance to run. With this patch, at least the
> printk() would come out and I'd have some idea what was going on.

It will start spewing to early_printk() (which is a lot nicer to use
from NMI context too) when it fails to queue the IRQ-work because it's
already enqueued.

It does have a false positive for when two CPUs trigger the warning
concurrently, but that should be rare, and some extra clutter on the
early printk shouldn't be a problem.

Cc: hpa@zytor.com
Cc: tglx@linutronix.de
Cc: dzickus@redhat.com
Cc: Dave Hansen
Cc: mingo@kernel.org
Fixes: 6a02ad66b2c4 ("perf/x86: Push the duration-logging printk() to IRQ context")
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/20140211150116.GO27965@twins.programming.kicks-ass.net
Signed-off-by: Thomas Gleixner
---
 include/linux/irq_work.h | 2 +-
 kernel/events/core.c     | 9 +++++++--
 kernel/irq_work.c        | 6 ++++--
 3 files changed, 12 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index add13c8624b7..19ae05d4b8ec 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -32,7 +32,7 @@ void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))
 
 #define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { .func = (_f), }
 
-void irq_work_queue(struct irq_work *work);
+bool irq_work_queue(struct irq_work *work);
 void irq_work_run(void);
 void irq_work_sync(struct irq_work *work);
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2067cbb378eb..45e5543e2a1e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -243,7 +243,7 @@ static void perf_duration_warn(struct irq_work *w)
 	printk_ratelimited(KERN_WARNING
 			"perf interrupt took too long (%lld > %lld), lowering "
 			"kernel.perf_event_max_sample_rate to %d\n",
-			avg_local_sample_len, allowed_ns,
+			avg_local_sample_len, allowed_ns >> 1,
 			sysctl_perf_event_sample_rate);
 }
 
@@ -283,7 +283,12 @@ void perf_sample_event_took(u64 sample_len_ns)
 
 	update_perf_cpu_limits();
 
-	irq_work_queue(&perf_duration_work);
+	if (!irq_work_queue(&perf_duration_work)) {
+		early_printk("perf interrupt took too long (%lld > %lld), lowering "
+			     "kernel.perf_event_max_sample_rate to %d\n",
+			     avg_local_sample_len, allowed_ns >> 1,
+			     sysctl_perf_event_sample_rate);
+	}
 }
 
 static atomic64_t perf_event_id;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 55fcce6065cf..a82170e2fa78 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -61,11 +61,11 @@ void __weak arch_irq_work_raise(void)
  *
  * Can be re-enqueued while the callback is still in progress.
  */
-void irq_work_queue(struct irq_work *work)
+bool irq_work_queue(struct irq_work *work)
 {
 	/* Only queue if not already pending */
 	if (!irq_work_claim(work))
-		return;
+		return false;
 
 	/* Queue the entry and raise the IPI if needed. */
 	preempt_disable();
@@ -83,6 +83,8 @@ void irq_work_queue(struct irq_work *work)
 	}
 
 	preempt_enable();
+
+	return true;
 }
 EXPORT_SYMBOL_GPL(irq_work_queue);
 
--
cgit v1.2.3
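
With irq_work_queue() now reporting whether the work was newly queued, a
caller can detect that a previous report is still in flight and fall back
to a console that is safe from NMI context. A hedged caller-side sketch;
warn_slow_interrupt() is a hypothetical helper, not part of the patch:

	/* Defer the warning when possible; if the irq_work is still
	 * pending (its IRQ hasn't run yet), spew to early_printk(),
	 * which avoids the locks that make printk() NMI-unsafe. */
	static void warn_slow_interrupt(u64 avg_ns, u64 limit_ns)
	{
		if (!irq_work_queue(&perf_duration_work))
			early_printk("perf interrupt took too long (%lld > %lld)\n",
				     avg_ns, limit_ns);
	}
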
From 9e3170411ed171a126f4dca1672012a33efe59e5 Mon Sep 17 00:00:00 2001
From: Mark Rutland
Date: Mon, 10 Feb 2014 17:44:18 +0000
Subject: perf: Fix prototype of find_pmu_context()

For some reason find_pmu_context() is defined as returning void *
rather than a __percpu struct perf_cpu_context *. As all the requisite
types are defined in advance, there's no reason to keep it that way.

This patch modifies the prototype of find_pmu_context() to return a
__percpu struct perf_cpu_context *.

Signed-off-by: Mark Rutland
Signed-off-by: Peter Zijlstra
Reviewed-by: Dave Martin
Acked-by: Will Deacon
Link: http://lkml.kernel.org/r/1392054264-23570-2-git-send-email-mark.rutland@arm.com
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index fa990061aa6c..425159882a6f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6313,7 +6313,7 @@ static int perf_event_idx_default(struct perf_event *event)
  * Ensures all contexts with the same task_ctx_nr have the same
  * pmu_cpu_context too.
  */
-static void *find_pmu_context(int ctxn)
+static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
 {
 	struct pmu *pmu;
 
--
cgit v1.2.3

From fdded676c3ef680bf1abc415d307d7e69a6768d1 Mon Sep 17 00:00:00 2001
From: Mark Rutland
Date: Mon, 10 Feb 2014 17:44:19 +0000
Subject: perf: Remove redundant PMU assignment

Currently perf_branch_stack_sched_in iterates over the set of pmus,
checks that each pmu has a flush_branch_stack callback, then overwrites
the pmu before calling the callback. This is either redundant or
broken.

In systems with a single hw pmu, pmu == cpuctx->ctx.pmu, and thus the
assignment is redundant.

In systems with multiple hw pmus (i.e. multiple pmus with task_ctx_nr
== perf_hw_context) the pmus share the same perf_cpu_context. Thus the
assignment can cause one of the pmus to flush its branch stack
repeatedly rather than causing each of the pmus to flush their branch
stacks. Worse still, if only some pmus have the callback, the
assignment can result in a branch to NULL.

This patch removes the redundant assignment.

Signed-off-by: Mark Rutland
Acked-by: Will Deacon
Signed-off-by: Peter Zijlstra
Cc: Stephane Eranian
Link: http://lkml.kernel.org/r/1392054264-23570-3-git-send-email-mark.rutland@arm.com
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 425159882a6f..823a53d72d6a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2582,8 +2582,6 @@ static void perf_branch_stack_sched_in(struct task_struct *prev,
 		if (cpuctx->ctx.nr_branch_stack > 0
 		    && pmu->flush_branch_stack) {
 
-			pmu = cpuctx->ctx.pmu;
-
 			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 
 			perf_pmu_disable(pmu);
--
cgit v1.2.3
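
Condensed, the post-patch shape of the iteration is roughly the following
(locking and pmu disable/enable elided); this is an abridged sketch of the
fixed loop, not the verbatim function:

	list_for_each_entry_rcu(pmu, &pmus, entry) {
		struct perf_cpu_context *cpuctx =
			this_cpu_ptr(pmu->pmu_cpu_context);

		/* 'pmu' is never rebound to cpuctx->ctx.pmu, so the callback
		 * that was checked is the one that gets called, and each pmu
		 * sharing the perf_cpu_context flushes its own branch stack. */
		if (cpuctx->ctx.nr_branch_stack > 0 && pmu->flush_branch_stack)
			pmu->flush_branch_stack();
	}
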
From 4a2345937c17722bd2979f662ae909846b4a052a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 24 Feb 2014 12:43:31 +0100
Subject: perf: Optimize group_sched_in()

Use the ctx pmu instead of the event pmu.

When a group leader is a software event but the group contains
hardware events, the entire group is on the hardware PMU.

Using the hardware PMU for the transaction makes the most sense, since
that's the most expensive one to program (and software PMUs generally
don't have TXN support anyway).

Signed-off-by: Peter Zijlstra
Cc: Stephane Eranian
Link: http://lkml.kernel.org/n/tip-sctoo9t2f3nn2c9g568928q3@git.kernel.org
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 823a53d72d6a..661951ab8ae7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1733,7 +1733,7 @@ group_sched_in(struct perf_event *group_event,
 	       struct perf_event_context *ctx)
 {
 	struct perf_event *event, *partial_group = NULL;
-	struct pmu *pmu = group_event->pmu;
+	struct pmu *pmu = ctx->pmu;
 	u64 now = ctx->time;
 	bool simulate = false;
 
--
cgit v1.2.3

From cfa77bc4af2c75c0781ee76cde2dd104c6c8e2b7 Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Sun, 2 Mar 2014 16:56:38 +0100
Subject: perf: Disallow user-space callchains for function trace events

Recent issues with user space callchains processing within page fault
handler tracing showed, as Peter said, 'there's just too much fail
surface'.

Related list discussions:

  http://marc.info/?t=139302086500001&r=1&w=2
  http://marc.info/?t=139301437300003&r=1&w=2

Suggested-by: Peter Zijlstra
Signed-off-by: Jiri Olsa
Cc: Paul Mackerras
Cc: Arnaldo Carvalho de Melo
Cc: "H. Peter Anvin"
Cc: Vince Weaver
Cc: Steven Rostedt
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/1393775800-13524-2-git-send-email-jolsa@redhat.com
Signed-off-by: Ingo Molnar
---
 kernel/trace/trace_event_perf.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index e854f420e033..d5e01c3f4e69 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -31,9 +31,18 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
 	}
 
 	/* The ftrace function trace is allowed only for root. */
-	if (ftrace_event_is_function(tp_event) &&
-	    perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
-		return -EPERM;
+	if (ftrace_event_is_function(tp_event)) {
+		if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		/*
+		 * We don't allow user space callchains for function trace
+		 * events, due to issues with page faults while tracing the
+		 * page fault handler and its overall tricky nature.
+		 */
+		if (!p_event->attr.exclude_callchain_user)
+			return -EINVAL;
+	}
 
 	/* No tracing, just counting, so no obvious leak */
 	if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
--
cgit v1.2.3
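
The user-visible contract: opening a function trace event with callchain
sampling now requires excluding the user part explicitly. A hypothetical
user-space sketch (attr.config, not shown, would name a function trace
event; the field values here are placeholders):

	struct perf_event_attr attr = {
		.type        = PERF_TYPE_TRACEPOINT,
		.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN,
		/* Mandatory for function trace events after this patch;
		 * without it, perf_event_open() fails with EINVAL. */
		.exclude_callchain_user = 1,
	};
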
From 63c45f4ba533e9749da16298db53e491c25d805b Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Sun, 2 Mar 2014 16:56:39 +0100
Subject: perf: Disallow user-space stack dumps for function trace events

Recent issues with user space callchains processing within page fault
handler tracing showed, as Peter said, 'there's just too much fail
surface'. The user space stack dump is just another source of this
issue.

Related list discussions:

  http://marc.info/?t=139302086500001&r=1&w=2
  http://marc.info/?t=139301437300003&r=1&w=2

Suggested-by: Peter Zijlstra
Signed-off-by: Jiri Olsa
Cc: Paul Mackerras
Cc: Arnaldo Carvalho de Melo
Cc: Vince Weaver
Cc: Steven Rostedt
Cc: H. Peter Anvin
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/1393775800-13524-3-git-send-email-jolsa@redhat.com
Signed-off-by: Ingo Molnar
---
 kernel/trace/trace_event_perf.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index d5e01c3f4e69..c894614de14d 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -42,6 +42,13 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
 		 */
 		if (!p_event->attr.exclude_callchain_user)
 			return -EINVAL;
+
+		/*
+		 * Same reason to disable user stack dump as for user space
+		 * callchains above.
+		 */
+		if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER)
+			return -EINVAL;
 	}
 
 	/* No tracing, just counting, so no obvious leak */
--
cgit v1.2.3
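
Together with the callchain patch above, both user-space introspection
paths are now closed for function trace events. An illustrative attr that
this patch starts rejecting (hypothetical values; attr.config would again
name a function trace event):

	struct perf_event_attr attr = {
		.type                   = PERF_TYPE_TRACEPOINT,
		.sample_type            = PERF_SAMPLE_RAW | PERF_SAMPLE_STACK_USER,
		.exclude_callchain_user = 1,	/* passes the callchain check */
		.sample_stack_user      = 8192,	/* but the stack dump is refused */
	};

	/* perf_event_open() on a function trace event now fails with EINVAL
	 * for any attr that has PERF_SAMPLE_STACK_USER set. */
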