summaryrefslogtreecommitdiffstats
path: root/kernel/events
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-10-12 14:14:35 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2020-10-12 14:14:35 -0700
commit3bff6112c80cecb76af5fe485506f96e8adb6122 (patch)
tree3a2507f01d3d2d3296742ae1dc59f4ea40d12705 /kernel/events
parentdd502a81077a5f3b3e19fa9a1accffdcab5ad5bc (diff)
parentf91072ed1b7283b13ca57fcfbece5a3b92726143 (diff)
downloadlinux-3bff6112c80cecb76af5fe485506f96e8adb6122.tar.bz2
Merge tag 'perf-core-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull performance events updates from Ingo Molnar: "x86 Intel updates: - Add Jasper Lake support - Add support for TopDown metrics on Ice Lake - Fix Ice Lake & Tiger Lake uncore support, add Snow Ridge support - Add a PCI sub driver to support uncore PMUs where the PCI resources have been claimed already - extending the range of supported systems. x86 AMD updates: - Restore 'perf stat -a' behaviour to program the uncore PMU to count all CPU threads. - Fix setting the proper count when sampling Large Increment per Cycle events / 'paired' events. - Fix IBS Fetch sampling on F17h and some other IBS fine tuning, greatly reducing the number of interrupts when large sample periods are specified. - Extends Family 17h RAPL support to also work on compatible F19h machines. Core code updates: - Fix race in perf_mmap_close() - Add PERF_EV_CAP_SIBLING, to denote that sibling events should be closed if the leader is removed. - Smaller fixes and updates" * tag 'perf-core-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (45 commits) perf/core: Fix race in the perf_mmap_close() function perf/x86: Fix n_metric for cancelled txn perf/x86: Fix n_pair for cancelled txn x86/events/amd/iommu: Fix sizeof mismatch perf/x86/intel: Check perf metrics feature for each CPU perf/x86/intel: Fix Ice Lake event constraint table perf/x86/intel/uncore: Fix the scale of the IMC free-running events perf/x86/intel/uncore: Fix for iio mapping on Skylake Server perf/x86/msr: Add Jasper Lake support perf/x86/intel: Add Jasper Lake support perf/x86/intel/uncore: Reduce the number of CBOX counters perf/x86/intel/uncore: Update Ice Lake uncore units perf/x86/intel/uncore: Split the Ice Lake and Tiger Lake MSR uncore support perf/x86/intel/uncore: Support PCIe3 unit on Snow Ridge perf/x86/intel/uncore: Generic support for the PCI sub driver perf/x86/intel/uncore: Factor out uncore_pci_pmu_unregister() perf/x86/intel/uncore: Factor out uncore_pci_pmu_register() perf/x86/intel/uncore: Factor out uncore_pci_find_dev_pmu() perf/x86/intel/uncore: Factor out uncore_pci_get_dev_die_info() perf/amd/uncore: Inform the user how many counters each uncore PMU has ...
Diffstat (limited to 'kernel/events')
-rw-r--r--kernel/events/core.c121
1 files changed, 74 insertions, 47 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e8bf92202542..da467e1dd49a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -383,7 +383,6 @@ static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
@@ -2134,8 +2133,24 @@ static inline struct list_head *get_event_list(struct perf_event *event)
return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
}
+/*
+ * Events that have PERF_EV_CAP_SIBLING require being part of a group and
+ * cannot exist on their own, schedule them out and move them into the ERROR
+ * state. Also see _perf_event_enable(), it will not be able to recover
+ * this ERROR state.
+ */
+static inline void perf_remove_sibling_event(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+ event_sched_out(event, cpuctx, ctx);
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+}
+
static void perf_group_detach(struct perf_event *event)
{
+ struct perf_event *leader = event->group_leader;
struct perf_event *sibling, *tmp;
struct perf_event_context *ctx = event->ctx;
@@ -2154,7 +2169,7 @@ static void perf_group_detach(struct perf_event *event)
/*
* If this is a sibling, remove it from its group.
*/
- if (event->group_leader != event) {
+ if (leader != event) {
list_del_init(&event->sibling_list);
event->group_leader->nr_siblings--;
goto out;
@@ -2167,6 +2182,9 @@ static void perf_group_detach(struct perf_event *event)
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
+ if (sibling->event_caps & PERF_EV_CAP_SIBLING)
+ perf_remove_sibling_event(sibling);
+
sibling->group_leader = sibling;
list_del_init(&sibling->sibling_list);
@@ -2184,10 +2202,10 @@ static void perf_group_detach(struct perf_event *event)
}
out:
- perf_event__header_size(event->group_leader);
-
- for_each_sibling_event(tmp, event->group_leader)
+ for_each_sibling_event(tmp, leader)
perf_event__header_size(tmp);
+
+ perf_event__header_size(leader);
}
static bool is_orphaned_event(struct perf_event *event)
@@ -2980,6 +2998,7 @@ static void _perf_event_enable(struct perf_event *event)
raw_spin_lock_irq(&ctx->lock);
if (event->state >= PERF_EVENT_STATE_INACTIVE ||
event->state < PERF_EVENT_STATE_ERROR) {
+out:
raw_spin_unlock_irq(&ctx->lock);
return;
}
@@ -2991,8 +3010,16 @@ static void _perf_event_enable(struct perf_event *event)
* has gone back into error state, as distinct from the task having
* been scheduled away before the cross-call arrived.
*/
- if (event->state == PERF_EVENT_STATE_ERROR)
+ if (event->state == PERF_EVENT_STATE_ERROR) {
+ /*
+ * Detached SIBLING events cannot leave ERROR state.
+ */
+ if (event->event_caps & PERF_EV_CAP_SIBLING &&
+ event->group_leader == event)
+ goto out;
+
event->state = PERF_EVENT_STATE_OFF;
+ }
raw_spin_unlock_irq(&ctx->lock);
event_function_call(event, __perf_event_enable, NULL);
@@ -3357,10 +3384,12 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
struct perf_event_context *parent, *next_parent;
struct perf_cpu_context *cpuctx;
int do_switch = 1;
+ struct pmu *pmu;
if (likely(!ctx))
return;
+ pmu = ctx->pmu;
cpuctx = __get_cpu_context(ctx);
if (!cpuctx->task_ctx)
return;
@@ -3390,11 +3419,15 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_lock(&ctx->lock);
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
- struct pmu *pmu = ctx->pmu;
WRITE_ONCE(ctx->task, next);
WRITE_ONCE(next_ctx->task, task);
+ perf_pmu_disable(pmu);
+
+ if (cpuctx->sched_cb_usage && pmu->sched_task)
+ pmu->sched_task(ctx, false);
+
/*
* PMU specific parts of task perf context can require
* additional synchronization. As an example of such
@@ -3406,6 +3439,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
else
swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+ perf_pmu_enable(pmu);
+
/*
* RCU_INIT_POINTER here is safe because we've not
* modified the ctx and the above modification of
@@ -3428,21 +3463,22 @@ unlock:
if (do_switch) {
raw_spin_lock(&ctx->lock);
+ perf_pmu_disable(pmu);
+
+ if (cpuctx->sched_cb_usage && pmu->sched_task)
+ pmu->sched_task(ctx, false);
task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
+
+ perf_pmu_enable(pmu);
raw_spin_unlock(&ctx->lock);
}
}
-static DEFINE_PER_CPU(struct list_head, sched_cb_list);
-
void perf_sched_cb_dec(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- this_cpu_dec(perf_sched_cb_usages);
-
- if (!--cpuctx->sched_cb_usage)
- list_del(&cpuctx->sched_cb_entry);
+ --cpuctx->sched_cb_usage;
}
@@ -3450,10 +3486,7 @@ void perf_sched_cb_inc(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- if (!cpuctx->sched_cb_usage++)
- list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
-
- this_cpu_inc(perf_sched_cb_usages);
+ cpuctx->sched_cb_usage++;
}
/*
@@ -3464,30 +3497,22 @@ void perf_sched_cb_inc(struct pmu *pmu)
* PEBS requires this to provide PID/TID information. This requires we flush
* all queued PEBS records before we context switch to a new task.
*/
-static void perf_pmu_sched_task(struct task_struct *prev,
- struct task_struct *next,
- bool sched_in)
+static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
{
- struct perf_cpu_context *cpuctx;
struct pmu *pmu;
- if (prev == next)
- return;
+ pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
- list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
- pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
-
- if (WARN_ON_ONCE(!pmu->sched_task))
- continue;
+ if (WARN_ON_ONCE(!pmu->sched_task))
+ return;
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(pmu);
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_pmu_disable(pmu);
- pmu->sched_task(cpuctx->task_ctx, sched_in);
+ pmu->sched_task(cpuctx->task_ctx, sched_in);
- perf_pmu_enable(pmu);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
- }
+ perf_pmu_enable(pmu);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
static void perf_event_switch(struct task_struct *task,
@@ -3512,9 +3537,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
{
int ctxn;
- if (__this_cpu_read(perf_sched_cb_usages))
- perf_pmu_sched_task(task, next, false);
-
if (atomic_read(&nr_switch_events))
perf_event_switch(task, next, false);
@@ -3746,10 +3768,14 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
struct task_struct *task)
{
struct perf_cpu_context *cpuctx;
+ struct pmu *pmu = ctx->pmu;
cpuctx = __get_cpu_context(ctx);
- if (cpuctx->task_ctx == ctx)
+ if (cpuctx->task_ctx == ctx) {
+ if (cpuctx->sched_cb_usage)
+ __perf_pmu_sched_task(cpuctx, true);
return;
+ }
perf_ctx_lock(cpuctx, ctx);
/*
@@ -3759,7 +3785,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
if (!ctx->nr_events)
goto unlock;
- perf_pmu_disable(ctx->pmu);
+ perf_pmu_disable(pmu);
/*
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
@@ -3771,7 +3797,11 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, ctx, task);
- perf_pmu_enable(ctx->pmu);
+
+ if (cpuctx->sched_cb_usage && pmu->sched_task)
+ pmu->sched_task(cpuctx->task_ctx, true);
+
+ perf_pmu_enable(pmu);
unlock:
perf_ctx_unlock(cpuctx, ctx);
@@ -3814,9 +3844,6 @@ void __perf_event_task_sched_in(struct task_struct *prev,
if (atomic_read(&nr_switch_events))
perf_event_switch(task, prev, true);
-
- if (__this_cpu_read(perf_sched_cb_usages))
- perf_pmu_sched_task(prev, task, true);
}
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -5869,11 +5896,11 @@ static void perf_pmu_output_stop(struct perf_event *event);
static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
-
struct perf_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
+ bool detach_rest = false;
if (event->pmu->event_unmapped)
event->pmu->event_unmapped(event, vma->vm_mm);
@@ -5904,7 +5931,8 @@ static void perf_mmap_close(struct vm_area_struct *vma)
mutex_unlock(&event->mmap_mutex);
}
- atomic_dec(&rb->mmap_count);
+ if (atomic_dec_and_test(&rb->mmap_count))
+ detach_rest = true;
if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
goto out_put;
@@ -5913,7 +5941,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
mutex_unlock(&event->mmap_mutex);
/* If there's still other mmap()s of this buffer, we're done. */
- if (atomic_read(&rb->mmap_count))
+ if (!detach_rest)
goto out_put;
/*
@@ -12829,7 +12857,6 @@ static void __init perf_event_init_all_cpus(void)
#ifdef CONFIG_CGROUP_PERF
INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
#endif
- INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
}
}