summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel/cpu
diff options
context:
space:
mode:
authorAlexander Shishkin <alexander.shishkin@linux.intel.com>2015-08-31 17:09:27 +0300
committerIngo Molnar <mingo@kernel.org>2015-09-13 11:27:21 +0200
commita09d31f45224c219a4d5c728fa40150dc9d3e3e5 (patch)
tree12f62b9431cbe4e5220f6d08ee2ab2b4f15aec07 /arch/x86/kernel/cpu
parentb20112edeadf0b8a1416de061caa4beb11539902 (diff)
downloadlinux-a09d31f45224c219a4d5c728fa40150dc9d3e3e5.tar.bz2
perf/x86/intel/ds: Work around BTS leaking kernel addresses
BTS leaks kernel addresses even in userspace-only mode due to imprecise IP sampling, so sometimes syscall entry points or page fault handler addresses end up in a userspace trace. Since this driver uses a relatively small buffer for BTS records and it has to iterate through them anyway, it can also take on the additional job of filtering out the records that contain kernel addresses when kernel space tracing is not enabled. This patch changes the bts code to skip the offending records from perf output. In order to request the exact amount of space on the ring buffer, we need to do an extra pass through the records to know how many there are of the valid ones, but considering the small size of the buffer, this extra pass adds very little overhead to the nmi handler. This way we won't end up with awkward IP samples with zero IPs in the perf stream. Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vince Weaver <vincent.weaver@maine.edu> Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1441030168-6853-2-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c40
1 files changed, 34 insertions, 6 deletions
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 84f236ab96b0..5db1c7755548 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -510,10 +510,11 @@ int intel_pmu_drain_bts_buffer(void)
u64 flags;
};
struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
- struct bts_record *at, *top;
+ struct bts_record *at, *base, *top;
struct perf_output_handle handle;
struct perf_event_header header;
struct perf_sample_data data;
+ unsigned long skip = 0;
struct pt_regs regs;
if (!event)
@@ -522,10 +523,10 @@ int intel_pmu_drain_bts_buffer(void)
if (!x86_pmu.bts_active)
return 0;
- at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
- top = (struct bts_record *)(unsigned long)ds->bts_index;
+ base = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+ top = (struct bts_record *)(unsigned long)ds->bts_index;
- if (top <= at)
+ if (top <= base)
return 0;
memset(&regs, 0, sizeof(regs));
@@ -535,16 +536,43 @@ int intel_pmu_drain_bts_buffer(void)
perf_sample_data_init(&data, 0, event->hw.last_period);
/*
+ * BTS leaks kernel addresses in branches across the cpl boundary,
+ * such as traps or system calls, so unless the user is asking for
+ * kernel tracing (and right now it's not possible), we'd need to
+ * filter them out. But first we need to count how many of those we
+ * have in the current batch. This is an extra O(n) pass, however,
+ * it's much faster than the other one especially considering that
+ * n <= 2560 (BTS_BUFFER_SIZE / BTS_RECORD_SIZE * 15/16; see the
+ * alloc_bts_buffer()).
+ */
+ for (at = base; at < top; at++) {
+ /*
+ * Note that right now *this* BTS code only works if
+ * attr::exclude_kernel is set, but let's keep this extra
+ * check here in case that changes.
+ */
+ if (event->attr.exclude_kernel &&
+ (kernel_ip(at->from) || kernel_ip(at->to)))
+ skip++;
+ }
+
+ /*
* Prepare a generic sample, i.e. fill in the invariant fields.
* We will overwrite the from and to address before we output
* the sample.
*/
perf_prepare_sample(&header, &data, event, &regs);
- if (perf_output_begin(&handle, event, header.size * (top - at)))
+ if (perf_output_begin(&handle, event, header.size *
+ (top - base - skip)))
return 1;
- for (; at < top; at++) {
+ for (at = base; at < top; at++) {
+ /* Filter out any records that contain kernel addresses. */
+ if (event->attr.exclude_kernel &&
+ (kernel_ip(at->from) || kernel_ip(at->to)))
+ continue;
+
data.ip = at->from;
data.addr = at->to;