summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- Documentation/ABI/testing/sysfs-devices-mapping | 30
-rw-r--r-- arch/arm64/kernel/perf_event.c | 18
-rw-r--r-- arch/powerpc/perf/core-book3s.c | 8
-rw-r--r-- arch/s390/kernel/perf_pai_crypto.c | 2
-rw-r--r-- arch/s390/kernel/perf_pai_ext.c | 2
-rw-r--r-- arch/x86/events/amd/brs.c | 2
-rw-r--r-- arch/x86/events/amd/ibs.c | 4
-rw-r--r-- arch/x86/events/amd/lbr.c | 6
-rw-r--r-- arch/x86/events/core.c | 48
-rw-r--r-- arch/x86/events/intel/core.c | 23
-rw-r--r-- arch/x86/events/intel/ds.c | 4
-rw-r--r-- arch/x86/events/intel/lbr.c | 30
-rw-r--r-- arch/x86/events/intel/uncore.h | 24
-rw-r--r-- arch/x86/events/intel/uncore_snb.c | 3
-rw-r--r-- arch/x86/events/intel/uncore_snbep.c | 495
-rw-r--r-- arch/x86/events/perf_event.h | 31
-rw-r--r-- drivers/perf/arm_pmu.c | 16
-rw-r--r-- include/linux/perf/arm_pmu.h | 2
-rw-r--r-- include/linux/perf_event.h | 125
-rw-r--r-- include/linux/sched.h | 2
-rw-r--r-- kernel/events/core.c | 2100
21 files changed, 1755 insertions, 1220 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-mapping b/Documentation/ABI/testing/sysfs-devices-mapping
index 8d202bac9394..2eee1446ad4c 100644
--- a/Documentation/ABI/testing/sysfs-devices-mapping
+++ b/Documentation/ABI/testing/sysfs-devices-mapping
@@ -1,6 +1,6 @@
What: /sys/devices/uncore_iio_x/dieX
Date: February 2020
-Contact: Roman Sudarikov <roman.sudarikov@linux.intel.com>
+Contact: Alexander Antonov <alexander.antonov@linux.intel.com>
Description:
Each IIO stack (PCIe root port) has its own IIO PMON block, so
each dieX file (where X is die number) holds "Segment:Root Bus"
@@ -32,3 +32,31 @@ Description:
IIO PMU 0 on die 1 belongs to PCI RP on bus 0x40, domain 0x0000
IIO PMU 0 on die 2 belongs to PCI RP on bus 0x80, domain 0x0000
IIO PMU 0 on die 3 belongs to PCI RP on bus 0xc0, domain 0x0000
+
+What: /sys/devices/uncore_upi_x/dieX
+Date: March 2022
+Contact: Alexander Antonov <alexander.antonov@linux.intel.com>
+Description:
+ Each /sys/devices/uncore_upi_X/dieY file holds a "upi_Z,die_W"
+ value, which means that UPI link number X on die Y is connected
+ to UPI link Z on die W, and that this link between sockets can be
+ monitored by the UPI PMON block.
+ For example, a 4-die Sapphire Rapids platform has the following
+ UPI 0 topology::
+
+ # tail /sys/devices/uncore_upi_0/die*
+ ==> /sys/devices/uncore_upi_0/die0 <==
+ upi_1,die_1
+ ==> /sys/devices/uncore_upi_0/die1 <==
+ upi_0,die_3
+ ==> /sys/devices/uncore_upi_0/die2 <==
+ upi_1,die_3
+ ==> /sys/devices/uncore_upi_0/die3 <==
+ upi_0,die_1
+
+ Which means::
+
+ UPI link 0 on die 0 is connected to UPI link 1 on die 1
+ UPI link 0 on die 1 is connected to UPI link 0 on die 3
+ UPI link 0 on die 2 is connected to UPI link 1 on die 3
+ UPI link 0 on die 3 is connected to UPI link 0 on die 1
\ No newline at end of file
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index a15b3c1d15d9..a5193f2146a6 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -806,10 +806,14 @@ static void armv8pmu_disable_event(struct perf_event *event)
static void armv8pmu_start(struct arm_pmu *cpu_pmu)
{
- struct perf_event_context *task_ctx =
- this_cpu_ptr(cpu_pmu->pmu.pmu_cpu_context)->task_ctx;
+ struct perf_event_context *ctx;
+ int nr_user = 0;
- if (sysctl_perf_user_access && task_ctx && task_ctx->nr_user)
+ ctx = perf_cpu_task_ctx();
+ if (ctx)
+ nr_user = ctx->nr_user;
+
+ if (sysctl_perf_user_access && nr_user)
armv8pmu_enable_user_access(cpu_pmu);
else
armv8pmu_disable_user_access();
@@ -1019,10 +1023,10 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event,
return 0;
}
-static int armv8pmu_filter_match(struct perf_event *event)
+static bool armv8pmu_filter(struct pmu *pmu, int cpu)
{
- unsigned long evtype = event->hw.config_base & ARMV8_PMU_EVTYPE_EVENT;
- return evtype != ARMV8_PMUV3_PERFCTR_CHAIN;
+ struct arm_pmu *armpmu = to_arm_pmu(pmu);
+ return !cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus);
}
static void armv8pmu_reset(void *info)
@@ -1254,7 +1258,7 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
cpu_pmu->stop = armv8pmu_stop;
cpu_pmu->reset = armv8pmu_reset;
cpu_pmu->set_event_filter = armv8pmu_set_event_filter;
- cpu_pmu->filter_match = armv8pmu_filter_match;
+ cpu_pmu->filter = armv8pmu_filter;
cpu_pmu->pmu.event_idx = armv8pmu_user_event_idx;
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 942aa830e110..bf318dd9b709 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -132,7 +132,7 @@ static unsigned long ebb_switch_in(bool ebb, struct cpu_hw_events *cpuhw)
static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
static inline void power_pmu_bhrb_disable(struct perf_event *event) {}
-static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) {}
+static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) {}
static inline void power_pmu_bhrb_read(struct perf_event *event, struct cpu_hw_events *cpuhw) {}
static void pmao_restore_workaround(bool ebb) { }
#endif /* CONFIG_PPC32 */
@@ -424,7 +424,7 @@ static void power_pmu_bhrb_enable(struct perf_event *event)
cpuhw->bhrb_context = event->ctx;
}
cpuhw->bhrb_users++;
- perf_sched_cb_inc(event->ctx->pmu);
+ perf_sched_cb_inc(event->pmu);
}
static void power_pmu_bhrb_disable(struct perf_event *event)
@@ -436,7 +436,7 @@ static void power_pmu_bhrb_disable(struct perf_event *event)
WARN_ON_ONCE(!cpuhw->bhrb_users);
cpuhw->bhrb_users--;
- perf_sched_cb_dec(event->ctx->pmu);
+ perf_sched_cb_dec(event->pmu);
if (!cpuhw->disabled && !cpuhw->bhrb_users) {
/* BHRB cannot be turned off when other
@@ -451,7 +451,7 @@ static void power_pmu_bhrb_disable(struct perf_event *event)
/* Called from ctxsw to prevent one process's branch entries to
* mingle with the other process's entries during context switch.
*/
-static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
+static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
if (!ppmu->bhrb_nr)
return;
diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c
index 529a2fee4ea5..985e243a2ed8 100644
--- a/arch/s390/kernel/perf_pai_crypto.c
+++ b/arch/s390/kernel/perf_pai_crypto.c
@@ -377,7 +377,7 @@ static int paicrypt_push_sample(void)
/* Called on schedule-in and schedule-out. No access to event structure,
* but for sampling only event CRYPTO_ALL is allowed.
*/
-static void paicrypt_sched_task(struct perf_event_context *ctx, bool sched_in)
+static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
/* We started with a clean page on event installation. So read out
* results on schedule_out and if page was dirty, clear values.
diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c
index a46cd7406b20..1138f57baae3 100644
--- a/arch/s390/kernel/perf_pai_ext.c
+++ b/arch/s390/kernel/perf_pai_ext.c
@@ -466,7 +466,7 @@ static int paiext_push_sample(void)
/* Called on schedule-in and schedule-out. No access to event structure,
* but for sampling only event NNPA_ALL is allowed.
*/
-static void paiext_sched_task(struct perf_event_context *ctx, bool sched_in)
+static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
/* We started with a clean page on event installation. So read out
* results on schedule_out and if page was dirty, clear values.
diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c
index f1bff153d945..58461fa18b6f 100644
--- a/arch/x86/events/amd/brs.c
+++ b/arch/x86/events/amd/brs.c
@@ -384,7 +384,7 @@ static void amd_brs_poison_buffer(void)
* On ctxswin, sched_in = true, called after the PMU has started
* On ctxswout, sched_in = false, called before the PMU is stopped
*/
-void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in)
+void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 4cb710efbdd9..da3f5ebac4e1 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -631,7 +631,7 @@ static const struct attribute_group *op_attr_update[] = {
static struct perf_ibs perf_ibs_fetch = {
.pmu = {
- .task_ctx_nr = perf_invalid_context,
+ .task_ctx_nr = perf_hw_context,
.event_init = perf_ibs_init,
.add = perf_ibs_add,
@@ -655,7 +655,7 @@ static struct perf_ibs perf_ibs_fetch = {
static struct perf_ibs perf_ibs_op = {
.pmu = {
- .task_ctx_nr = perf_invalid_context,
+ .task_ctx_nr = perf_hw_context,
.event_init = perf_ibs_init,
.add = perf_ibs_add,
diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c
index 38a75216c12c..eb31f850841a 100644
--- a/arch/x86/events/amd/lbr.c
+++ b/arch/x86/events/amd/lbr.c
@@ -352,7 +352,7 @@ void amd_pmu_lbr_add(struct perf_event *event)
cpuc->br_sel = reg->reg;
}
- perf_sched_cb_inc(event->ctx->pmu);
+ perf_sched_cb_inc(event->pmu);
if (!cpuc->lbr_users++ && !event->total_time_running)
amd_pmu_lbr_reset();
@@ -370,10 +370,10 @@ void amd_pmu_lbr_del(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
- perf_sched_cb_dec(event->ctx->pmu);
+ perf_sched_cb_dec(event->pmu);
}
-void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
+void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index b30b8bbcd1e2..85a63a41c471 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -90,6 +90,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx);
DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
+DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter);
+
/*
* This one is magic, it will get called even when PMU init fails (because
* there is no PMU), in which case it should simply return NULL.
@@ -2031,6 +2033,7 @@ static void x86_pmu_static_call_update(void)
static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);
static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs);
+ static_call_update(x86_pmu_filter, x86_pmu.filter);
}
static void _x86_pmu_read(struct perf_event *event)
@@ -2052,23 +2055,6 @@ void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
pr_info("... event mask: %016Lx\n", intel_ctrl);
}
-/*
- * The generic code is not hybrid friendly. The hybrid_pmu->pmu
- * of the first registered PMU is unconditionally assigned to
- * each possible cpuctx->ctx.pmu.
- * Update the correct hybrid PMU to the cpuctx->ctx.pmu.
- */
-void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu)
-{
- struct perf_cpu_context *cpuctx;
-
- if (!pmu->pmu_cpu_context)
- return;
-
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- cpuctx->ctx.pmu = pmu;
-}
-
static int __init init_hw_perf_events(void)
{
struct x86_pmu_quirk *quirk;
@@ -2175,13 +2161,9 @@ static int __init init_hw_perf_events(void)
if (err)
goto out2;
} else {
- u8 cpu_type = get_this_hybrid_cpu_type();
struct x86_hybrid_pmu *hybrid_pmu;
int i, j;
- if (!cpu_type && x86_pmu.get_hybrid_cpu_type)
- cpu_type = x86_pmu.get_hybrid_cpu_type();
-
for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
hybrid_pmu = &x86_pmu.hybrid_pmu[i];
@@ -2195,9 +2177,6 @@ static int __init init_hw_perf_events(void)
(hybrid_pmu->cpu_type == hybrid_big) ? PERF_TYPE_RAW : -1);
if (err)
break;
-
- if (cpu_type == hybrid_pmu->cpu_type)
- x86_pmu_update_cpu_context(&hybrid_pmu->pmu, raw_smp_processor_id());
}
if (i < x86_pmu.num_hybrid_pmus) {
@@ -2646,15 +2625,15 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
NULL,
};
-static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
+static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
- static_call_cond(x86_pmu_sched_task)(ctx, sched_in);
+ static_call_cond(x86_pmu_sched_task)(pmu_ctx, sched_in);
}
-static void x86_pmu_swap_task_ctx(struct perf_event_context *prev,
- struct perf_event_context *next)
+static void x86_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
+ struct perf_event_pmu_context *next_epc)
{
- static_call_cond(x86_pmu_swap_task_ctx)(prev, next);
+ static_call_cond(x86_pmu_swap_task_ctx)(prev_epc, next_epc);
}
void perf_check_microcode(void)
@@ -2689,12 +2668,13 @@ static int x86_pmu_aux_output_match(struct perf_event *event)
return 0;
}
-static int x86_pmu_filter_match(struct perf_event *event)
+static bool x86_pmu_filter(struct pmu *pmu, int cpu)
{
- if (x86_pmu.filter_match)
- return x86_pmu.filter_match(event);
+ bool ret = false;
- return 1;
+ static_call_cond(x86_pmu_filter)(pmu, cpu, &ret);
+
+ return ret;
}
static struct pmu pmu = {
@@ -2725,7 +2705,7 @@ static struct pmu pmu = {
.aux_output_match = x86_pmu_aux_output_match,
- .filter_match = x86_pmu_filter_match,
+ .filter = x86_pmu_filter,
};
void arch_perf_update_userpage(struct perf_event *event,
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 1b92bf05fd65..dfd2c124cdf8 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4536,8 +4536,6 @@ end:
cpumask_set_cpu(cpu, &pmu->supported_cpus);
cpuc->pmu = &pmu->pmu;
- x86_pmu_update_cpu_context(&pmu->pmu, cpu);
-
return true;
}
@@ -4671,17 +4669,17 @@ static void intel_pmu_cpu_dead(int cpu)
cpumask_clear_cpu(cpu, &hybrid_pmu(cpuc->pmu)->supported_cpus);
}
-static void intel_pmu_sched_task(struct perf_event_context *ctx,
+static void intel_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
bool sched_in)
{
- intel_pmu_pebs_sched_task(ctx, sched_in);
- intel_pmu_lbr_sched_task(ctx, sched_in);
+ intel_pmu_pebs_sched_task(pmu_ctx, sched_in);
+ intel_pmu_lbr_sched_task(pmu_ctx, sched_in);
}
-static void intel_pmu_swap_task_ctx(struct perf_event_context *prev,
- struct perf_event_context *next)
+static void intel_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
+ struct perf_event_pmu_context *next_epc)
{
- intel_pmu_lbr_swap_task_ctx(prev, next);
+ intel_pmu_lbr_swap_task_ctx(prev_epc, next_epc);
}
static int intel_pmu_check_period(struct perf_event *event, u64 value)
@@ -4705,12 +4703,11 @@ static int intel_pmu_aux_output_match(struct perf_event *event)
return is_intel_pt_event(event);
}
-static int intel_pmu_filter_match(struct perf_event *event)
+static void intel_pmu_filter(struct pmu *pmu, int cpu, bool *ret)
{
- struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
- unsigned int cpu = smp_processor_id();
+ struct x86_hybrid_pmu *hpmu = hybrid_pmu(pmu);
- return cpumask_test_cpu(cpu, &pmu->supported_cpus);
+ *ret = !cpumask_test_cpu(cpu, &hpmu->supported_cpus);
}
PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
@@ -6413,7 +6410,7 @@ __init int intel_pmu_init(void)
static_call_update(intel_pmu_set_topdown_event_period,
&adl_set_topdown_event_period);
- x86_pmu.filter_match = intel_pmu_filter_match;
+ x86_pmu.filter = intel_pmu_filter;
x86_pmu.get_event_constraints = adl_get_event_constraints;
x86_pmu.hw_config = adl_hw_config;
x86_pmu.limit_period = spr_limit_period;
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 446d2833efa7..88e58b6ee73c 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1069,7 +1069,7 @@ static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc)
return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs);
}
-void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
+void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1177,7 +1177,7 @@ static void
pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
struct perf_event *event, bool add)
{
- struct pmu *pmu = event->ctx->pmu;
+ struct pmu *pmu = event->pmu;
/*
* Make sure we get updated with the first PEBS
* event. It will trigger also during removal, but
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 8259d725054d..017baba56b01 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -515,21 +515,21 @@ static void __intel_pmu_lbr_save(void *ctx)
cpuc->last_log_id = ++task_context_opt(ctx)->log_id;
}
-void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
- struct perf_event_context *next)
+void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
+ struct perf_event_pmu_context *next_epc)
{
void *prev_ctx_data, *next_ctx_data;
- swap(prev->task_ctx_data, next->task_ctx_data);
+ swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
/*
- * Architecture specific synchronization makes sense in
- * case both prev->task_ctx_data and next->task_ctx_data
+ * Architecture specific synchronization makes sense in case
+ * both prev_epc->task_ctx_data and next_epc->task_ctx_data
* pointers are allocated.
*/
- prev_ctx_data = next->task_ctx_data;
- next_ctx_data = prev->task_ctx_data;
+ prev_ctx_data = next_epc->task_ctx_data;
+ next_ctx_data = prev_epc->task_ctx_data;
if (!prev_ctx_data || !next_ctx_data)
return;
@@ -538,7 +538,7 @@ void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
task_context_opt(next_ctx_data)->lbr_callstack_users);
}
-void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
+void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
void *task_ctx;
@@ -551,7 +551,7 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
* the task was scheduled out, restore the stack. Otherwise flush
* the LBR stack.
*/
- task_ctx = ctx ? ctx->task_ctx_data : NULL;
+ task_ctx = pmu_ctx ? pmu_ctx->task_ctx_data : NULL;
if (task_ctx) {
if (sched_in)
__intel_pmu_lbr_restore(task_ctx);
@@ -587,8 +587,8 @@ void intel_pmu_lbr_add(struct perf_event *event)
cpuc->br_sel = event->hw.branch_reg.reg;
- if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data)
- task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users++;
+ if (branch_user_callstack(cpuc->br_sel) && event->pmu_ctx->task_ctx_data)
+ task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users++;
/*
* Request pmu::sched_task() callback, which will fire inside the
@@ -611,7 +611,7 @@ void intel_pmu_lbr_add(struct perf_event *event)
*/
if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
cpuc->lbr_pebs_users++;
- perf_sched_cb_inc(event->ctx->pmu);
+ perf_sched_cb_inc(event->pmu);
if (!cpuc->lbr_users++ && !event->total_time_running)
intel_pmu_lbr_reset();
}
@@ -664,8 +664,8 @@ void intel_pmu_lbr_del(struct perf_event *event)
return;
if (branch_user_callstack(cpuc->br_sel) &&
- event->ctx->task_ctx_data)
- task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users--;
+ event->pmu_ctx->task_ctx_data)
+ task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users--;
if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT)
cpuc->lbr_select = 0;
@@ -675,7 +675,7 @@ void intel_pmu_lbr_del(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
WARN_ON_ONCE(cpuc->lbr_pebs_users < 0);
- perf_sched_cb_dec(event->ctx->pmu);
+ perf_sched_cb_dec(event->pmu);
}
static inline bool vlbr_exclude_host(void)
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 2adeaf4de4df..e278e2e7c051 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -2,6 +2,7 @@
#include <linux/slab.h>
#include <linux/pci.h>
#include <asm/apicdef.h>
+#include <asm/intel-family.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/perf_event.h>
@@ -88,12 +89,12 @@ struct intel_uncore_type {
* to identify which platform component each PMON block of that type is
* supposed to monitor.
*/
- struct intel_uncore_topology *topology;
+ struct intel_uncore_topology **topology;
/*
* Optional callbacks for managing mapping of Uncore units to PMONs
*/
int (*get_topology)(struct intel_uncore_type *type);
- int (*set_mapping)(struct intel_uncore_type *type);
+ void (*set_mapping)(struct intel_uncore_type *type);
void (*cleanup_mapping)(struct intel_uncore_type *type);
};
@@ -178,11 +179,26 @@ struct freerunning_counters {
unsigned *box_offsets;
};
-struct intel_uncore_topology {
- u64 configuration;
+struct uncore_iio_topology {
+ int pci_bus_no;
int segment;
};
+struct uncore_upi_topology {
+ int die_to;
+ int pmu_idx_to;
+ int enabled;
+};
+
+struct intel_uncore_topology {
+ int pmu_idx;
+ union {
+ void *untyped;
+ struct uncore_iio_topology *iio;
+ struct uncore_upi_topology *upi;
+ };
+};
+
struct pci2phy_map {
struct list_head list;
int segment;
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 1ef4f7861e2e..1f4869227efb 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -1338,6 +1338,7 @@ static void __uncore_imc_init_box(struct intel_uncore_box *box,
/* MCHBAR is disabled */
if (!(mch_bar & BIT(0))) {
pr_warn("perf uncore: MCHBAR is disabled. Failed to map IMC free-running counters.\n");
+ pci_dev_put(pdev);
return;
}
mch_bar &= ~BIT(0);
@@ -1352,6 +1353,8 @@ static void __uncore_imc_init_box(struct intel_uncore_box *box,
box->io_addr = ioremap(addr, type->mmio_map_size);
if (!box->io_addr)
pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name);
+
+ pci_dev_put(pdev);
}
static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index ed869443efb2..44c2f879f708 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -445,6 +445,7 @@
#define ICX_UPI_PCI_PMON_CTR0 0x320
#define ICX_UPI_PCI_PMON_BOX_CTL 0x318
#define ICX_UPI_CTL_UMASK_EXT 0xffffff
+#define ICX_UBOX_DID 0x3450
/* ICX M3UPI*/
#define ICX_M3UPI_PCI_PMON_CTL0 0xd8
@@ -457,6 +458,7 @@
/* SPR */
#define SPR_RAW_EVENT_MASK_EXT 0xffffff
+#define SPR_UBOX_DID 0x3250
/* SPR CHA */
#define SPR_CHA_PMON_CTL_TID_EN (1 << 16)
@@ -1372,6 +1374,28 @@ static struct pci_driver snbep_uncore_pci_driver = {
#define NODE_ID_MASK 0x7
+/* Each group of three bits, from bit 0 to bit 23 of the GIDNIDMAP register, corresponds to a Node ID. */
+#define GIDNIDMAP(config, id) (((config) >> (3 * (id))) & 0x7)
+
+static int upi_nodeid_groupid(struct pci_dev *ubox_dev, int nodeid_loc, int idmap_loc,
+ int *nodeid, int *groupid)
+{
+ int ret;
+
+ /* get the Node ID of the local register */
+ ret = pci_read_config_dword(ubox_dev, nodeid_loc, nodeid);
+ if (ret)
+ goto err;
+
+ *nodeid = *nodeid & NODE_ID_MASK;
+ /* get the Node ID mapping */
+ ret = pci_read_config_dword(ubox_dev, idmap_loc, groupid);
+ if (ret)
+ goto err;
+err:
+ return ret;
+}
+
/*
* build pci bus to socket mapping
*/
@@ -1397,13 +1421,8 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
* the topology.
*/
if (nr_node_ids <= 8) {
- /* get the Node ID of the local register */
- err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
- if (err)
- break;
- nodeid = config & NODE_ID_MASK;
- /* get the Node ID mapping */
- err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
+ err = upi_nodeid_groupid(ubox_dev, nodeid_loc, idmap_loc,
+ &nodeid, &config);
if (err)
break;
@@ -1421,7 +1440,7 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
* to a particular node.
*/
for (i = 0; i < 8; i++) {
- if (nodeid == ((config >> (3 * i)) & 0x7)) {
+ if (nodeid == GIDNIDMAP(config, i)) {
if (topology_max_die_per_package() > 1)
die_id = i;
else
@@ -2891,6 +2910,7 @@ static bool hswep_has_limit_sbox(unsigned int device)
return false;
pci_read_config_dword(dev, HSWEP_PCU_CAPID4_OFFET, &capid4);
+ pci_dev_put(dev);
if (!hswep_get_chop(capid4))
return true;
@@ -3699,10 +3719,16 @@ static struct intel_uncore_ops skx_uncore_iio_ops = {
.read_counter = uncore_msr_read_counter,
};
-static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die)
+static struct intel_uncore_topology *pmu_topology(struct intel_uncore_pmu *pmu, int die)
{
- return pmu->type->topology[die].configuration >>
- (pmu->pmu_idx * BUS_NUM_STRIDE);
+ int idx;
+
+ for (idx = 0; idx < pmu->type->num_boxes; idx++) {
+ if (pmu->type->topology[die][idx].pmu_idx == pmu->pmu_idx)
+ return &pmu->type->topology[die][idx];
+ }
+
+ return NULL;
}
static umode_t
@@ -3710,8 +3736,9 @@ pmu_iio_mapping_visible(struct kobject *kobj, struct attribute *attr,
int die, int zero_bus_pmu)
{
struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj));
+ struct intel_uncore_topology *pmut = pmu_topology(pmu, die);
- return (!skx_iio_stack(pmu, die) && pmu->pmu_idx != zero_bus_pmu) ? 0 : attr->mode;
+ return (pmut && !pmut->iio->pci_bus_no && pmu->pmu_idx != zero_bus_pmu) ? 0 : attr->mode;
}
static umode_t
@@ -3727,9 +3754,10 @@ static ssize_t skx_iio_mapping_show(struct device *dev,
struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev);
struct dev_ext_attribute *ea = to_dev_ext_attribute(attr);
long die = (long)ea->var;
+ struct intel_uncore_topology *pmut = pmu_topology(pmu, die);
- return sprintf(buf, "%04x:%02x\n", pmu->type->topology[die].segment,
- skx_iio_stack(pmu, die));
+ return sprintf(buf, "%04x:%02x\n", pmut ? pmut->iio->segment : 0,
+ pmut ? pmut->iio->pci_bus_no : 0);
}
static int skx_msr_cpu_bus_read(int cpu, u64 *topology)
@@ -3764,18 +3792,79 @@ static int die_to_cpu(int die)
return res;
}
-static int skx_iio_get_topology(struct intel_uncore_type *type)
+enum {
+ IIO_TOPOLOGY_TYPE,
+ UPI_TOPOLOGY_TYPE,
+ TOPOLOGY_MAX
+};
+
+static const size_t topology_size[TOPOLOGY_MAX] = {
+ sizeof(*((struct intel_uncore_topology *)NULL)->iio),
+ sizeof(*((struct intel_uncore_topology *)NULL)->upi)
+};
+
+static int pmu_alloc_topology(struct intel_uncore_type *type, int topology_type)
{
- int die, ret = -EPERM;
+ int die, idx;
+ struct intel_uncore_topology **topology;
+
+ if (!type->num_boxes)
+ return -EPERM;
- type->topology = kcalloc(uncore_max_dies(), sizeof(*type->topology),
- GFP_KERNEL);
- if (!type->topology)
- return -ENOMEM;
+ topology = kcalloc(uncore_max_dies(), sizeof(*topology), GFP_KERNEL);
+ if (!topology)
+ goto err;
for (die = 0; die < uncore_max_dies(); die++) {
- ret = skx_msr_cpu_bus_read(die_to_cpu(die),
- &type->topology[die].configuration);
+ topology[die] = kcalloc(type->num_boxes, sizeof(**topology), GFP_KERNEL);
+ if (!topology[die])
+ goto clear;
+ for (idx = 0; idx < type->num_boxes; idx++) {
+ topology[die][idx].untyped = kcalloc(type->num_boxes,
+ topology_size[topology_type],
+ GFP_KERNEL);
+ if (!topology[die][idx].untyped)
+ goto clear;
+ }
+ }
+
+ type->topology = topology;
+
+ return 0;
+clear:
+ for (; die >= 0; die--) {
+ for (idx = 0; idx < type->num_boxes; idx++)
+ kfree(topology[die][idx].untyped);
+ kfree(topology[die]);
+ }
+ kfree(topology);
+err:
+ return -ENOMEM;
+}
+
+static void pmu_free_topology(struct intel_uncore_type *type)
+{
+ int die, idx;
+
+ if (type->topology) {
+ for (die = 0; die < uncore_max_dies(); die++) {
+ for (idx = 0; idx < type->num_boxes; idx++)
+ kfree(type->topology[die][idx].untyped);
+ kfree(type->topology[die]);
+ }
+ kfree(type->topology);
+ type->topology = NULL;
+ }
+}
+
+static int skx_pmu_get_topology(struct intel_uncore_type *type,
+ int (*topology_cb)(struct intel_uncore_type*, int, int, u64))
+{
+ int die, ret = -EPERM;
+ u64 cpu_bus_msr;
+
+ for (die = 0; die < uncore_max_dies(); die++) {
+ ret = skx_msr_cpu_bus_read(die_to_cpu(die), &cpu_bus_msr);
if (ret)
break;
@@ -3783,15 +3872,33 @@ static int skx_iio_get_topology(struct intel_uncore_type *type)
if (ret < 0)
break;
- type->topology[die].segment = ret;
+ ret = topology_cb(type, ret, die, cpu_bus_msr);
+ if (ret)
+ break;
}
- if (ret < 0) {
- kfree(type->topology);
- type->topology = NULL;
+ return ret;
+}
+
+static int skx_iio_topology_cb(struct intel_uncore_type *type, int segment,
+ int die, u64 cpu_bus_msr)
+{
+ int idx;
+ struct intel_uncore_topology *t;
+
+ for (idx = 0; idx < type->num_boxes; idx++) {
+ t = &type->topology[die][idx];
+ t->pmu_idx = idx;
+ t->iio->segment = segment;
+ t->iio->pci_bus_no = (cpu_bus_msr >> (idx * BUS_NUM_STRIDE)) & 0xff;
}
- return ret;
+ return 0;
+}
+
+static int skx_iio_get_topology(struct intel_uncore_type *type)
+{
+ return skx_pmu_get_topology(type, skx_iio_topology_cb);
}
static struct attribute_group skx_iio_mapping_group = {
@@ -3803,8 +3910,25 @@ static const struct attribute_group *skx_iio_attr_update[] = {
NULL,
};
-static int
-pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
+static void pmu_clear_mapping_attr(const struct attribute_group **groups,
+ struct attribute_group *ag)
+{
+ int i;
+
+ for (i = 0; groups[i]; i++) {
+ if (groups[i] == ag) {
+ for (i++; groups[i]; i++)
+ groups[i - 1] = groups[i];
+ groups[i - 1] = NULL;
+ break;
+ }
+ }
+}
+
+static void
+pmu_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag,
+ ssize_t (*show)(struct device*, struct device_attribute*, char*),
+ int topology_type)
{
char buf[64];
int ret;
@@ -3812,11 +3936,13 @@ pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
struct attribute **attrs = NULL;
struct dev_ext_attribute *eas = NULL;
- ret = type->get_topology(type);
+ ret = pmu_alloc_topology(type, topology_type);
if (ret < 0)
goto clear_attr_update;
- ret = -ENOMEM;
+ ret = type->get_topology(type);
+ if (ret < 0)
+ goto clear_topology;
/* One more for NULL. */
attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL);
@@ -3828,20 +3954,20 @@ pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
goto clear_attrs;
for (die = 0; die < uncore_max_dies(); die++) {
- sprintf(buf, "die%ld", die);
+ snprintf(buf, sizeof(buf), "die%ld", die);
sysfs_attr_init(&eas[die].attr.attr);
eas[die].attr.attr.name = kstrdup(buf, GFP_KERNEL);
if (!eas[die].attr.attr.name)
goto err;
eas[die].attr.attr.mode = 0444;
- eas[die].attr.show = skx_iio_mapping_show;
+ eas[die].attr.show = show;
eas[die].attr.store = NULL;
eas[die].var = (void *)die;
attrs[die] = &eas[die].attr.attr;
}
ag->attrs = attrs;
- return 0;
+ return;
err:
for (; die >= 0; die--)
kfree(eas[die].attr.attr.name);
@@ -3849,14 +3975,13 @@ err:
clear_attrs:
kfree(attrs);
clear_topology:
- kfree(type->topology);
+ pmu_free_topology(type);
clear_attr_update:
- type->attr_update = NULL;
- return ret;
+ pmu_clear_mapping_attr(type->attr_update, ag);
}
static void
-pmu_iio_cleanup_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
+pmu_cleanup_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
{
struct attribute **attr = ag->attrs;
@@ -3868,17 +3993,23 @@ pmu_iio_cleanup_mapping(struct intel_uncore_type *type, struct attribute_group *
kfree(attr_to_ext_attr(*ag->attrs));
kfree(ag->attrs);
ag->attrs = NULL;
- kfree(type->topology);
+ pmu_free_topology(type);
+}
+
+static void
+pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
+{
+ pmu_set_mapping(type, ag, skx_iio_mapping_show, IIO_TOPOLOGY_TYPE);
}
-static int skx_iio_set_mapping(struct intel_uncore_type *type)
+static void skx_iio_set_mapping(struct intel_uncore_type *type)
{
- return pmu_iio_set_mapping(type, &skx_iio_mapping_group);
+ pmu_iio_set_mapping(type, &skx_iio_mapping_group);
}
static void skx_iio_cleanup_mapping(struct intel_uncore_type *type)
{
- pmu_iio_cleanup_mapping(type, &skx_iio_mapping_group);
+ pmu_cleanup_mapping(type, &skx_iio_mapping_group);
}
static struct intel_uncore_type skx_uncore_iio = {
@@ -4139,6 +4270,132 @@ static struct intel_uncore_ops skx_upi_uncore_pci_ops = {
.read_counter = snbep_uncore_pci_read_counter,
};
+static umode_t
+skx_upi_mapping_visible(struct kobject *kobj, struct attribute *attr, int die)
+{
+ struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj));
+
+ return pmu->type->topology[die][pmu->pmu_idx].upi->enabled ? attr->mode : 0;
+}
+
+static ssize_t skx_upi_mapping_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev);
+ struct dev_ext_attribute *ea = to_dev_ext_attribute(attr);
+ long die = (long)ea->var;
+ struct uncore_upi_topology *upi = pmu->type->topology[die][pmu->pmu_idx].upi;
+
+ return sysfs_emit(buf, "upi_%d,die_%d\n", upi->pmu_idx_to, upi->die_to);
+}
+
+#define SKX_UPI_REG_DID 0x2058
+#define SKX_UPI_REGS_ADDR_DEVICE_LINK0 0x0e
+#define SKX_UPI_REGS_ADDR_FUNCTION 0x00
+
+/*
+ * UPI Link Parameter 0
+ * | Bit | Default | Description
+ * | 19:16 | 0h | base_nodeid - The NodeID of the sending socket.
+ * | 12:8 | 00h | sending_port - The processor die port number of the sending port.
+ */
+#define SKX_KTILP0_OFFSET 0x94
+
+/*
+ * UPI Pcode Status. This register is used by PCode to store the link training status.
+ * | Bit | Default | Description
+ * | 4 | 0h | ll_status_valid — Bit indicates the valid training status
+ * logged from PCode to the BIOS.
+ */
+#define SKX_KTIPCSTS_OFFSET 0x120
+
+static int upi_fill_topology(struct pci_dev *dev, struct intel_uncore_topology *tp,
+ int pmu_idx)
+{
+ int ret;
+ u32 upi_conf;
+ struct uncore_upi_topology *upi = tp->upi;
+
+ tp->pmu_idx = pmu_idx;
+ ret = pci_read_config_dword(dev, SKX_KTIPCSTS_OFFSET, &upi_conf);
+ if (ret) {
+ ret = pcibios_err_to_errno(ret);
+ goto err;
+ }
+ upi->enabled = (upi_conf >> 4) & 1;
+ if (upi->enabled) {
+ ret = pci_read_config_dword(dev, SKX_KTILP0_OFFSET,
+ &upi_conf);
+ if (ret) {
+ ret = pcibios_err_to_errno(ret);
+ goto err;
+ }
+ upi->die_to = (upi_conf >> 16) & 0xf;
+ upi->pmu_idx_to = (upi_conf >> 8) & 0x1f;
+ }
+err:
+ return ret;
+}
+
+static int skx_upi_topology_cb(struct intel_uncore_type *type, int segment,
+ int die, u64 cpu_bus_msr)
+{
+ int idx, ret;
+ struct intel_uncore_topology *upi;
+ unsigned int devfn;
+ struct pci_dev *dev = NULL;
+ u8 bus = cpu_bus_msr >> (3 * BUS_NUM_STRIDE);
+
+ for (idx = 0; idx < type->num_boxes; idx++) {
+ upi = &type->topology[die][idx];
+ devfn = PCI_DEVFN(SKX_UPI_REGS_ADDR_DEVICE_LINK0 + idx,
+ SKX_UPI_REGS_ADDR_FUNCTION);
+ dev = pci_get_domain_bus_and_slot(segment, bus, devfn);
+ if (dev) {
+ ret = upi_fill_topology(dev, upi, idx);
+ if (ret)
+ break;
+ }
+ }
+
+ pci_dev_put(dev);
+ return ret;
+}
+
+static int skx_upi_get_topology(struct intel_uncore_type *type)
+{
+ /* CPX case is not supported */
+ if (boot_cpu_data.x86_stepping == 11)
+ return -EPERM;
+
+ return skx_pmu_get_topology(type, skx_upi_topology_cb);
+}
+
+static struct attribute_group skx_upi_mapping_group = {
+ .is_visible = skx_upi_mapping_visible,
+};
+
+static const struct attribute_group *skx_upi_attr_update[] = {
+ &skx_upi_mapping_group,
+ NULL
+};
+
+static void
+pmu_upi_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
+{
+ pmu_set_mapping(type, ag, skx_upi_mapping_show, UPI_TOPOLOGY_TYPE);
+}
+
+static void skx_upi_set_mapping(struct intel_uncore_type *type)
+{
+ pmu_upi_set_mapping(type, &skx_upi_mapping_group);
+}
+
+static void skx_upi_cleanup_mapping(struct intel_uncore_type *type)
+{
+ pmu_cleanup_mapping(type, &skx_upi_mapping_group);
+}
+
static struct intel_uncore_type skx_uncore_upi = {
.name = "upi",
.num_counters = 4,
@@ -4151,6 +4408,10 @@ static struct intel_uncore_type skx_uncore_upi = {
.box_ctl = SKX_UPI_PCI_PMON_BOX_CTL,
.ops = &skx_upi_uncore_pci_ops,
.format_group = &skx_upi_uncore_format_group,
+ .attr_update = skx_upi_attr_update,
+ .get_topology = skx_upi_get_topology,
+ .set_mapping = skx_upi_set_mapping,
+ .cleanup_mapping = skx_upi_cleanup_mapping,
};
static void skx_m2m_uncore_pci_init_box(struct intel_uncore_box *box)
@@ -4461,11 +4722,6 @@ static int sad_cfg_iio_topology(struct intel_uncore_type *type, u8 *sad_pmon_map
int die, stack_id, ret = -EPERM;
struct pci_dev *dev = NULL;
- type->topology = kcalloc(uncore_max_dies(), sizeof(*type->topology),
- GFP_KERNEL);
- if (!type->topology)
- return -ENOMEM;
-
while ((dev = pci_get_device(PCI_VENDOR_ID_INTEL, SNR_ICX_MESH2IIO_MMAP_DID, dev))) {
ret = pci_read_config_dword(dev, SNR_ICX_SAD_CONTROL_CFG, &sad_cfg);
if (ret) {
@@ -4483,14 +4739,12 @@ static int sad_cfg_iio_topology(struct intel_uncore_type *type, u8 *sad_pmon_map
/* Convert stack id from SAD_CONTROL to PMON notation. */
stack_id = sad_pmon_mapping[stack_id];
- ((u8 *)&(type->topology[die].configuration))[stack_id] = dev->bus->number;
- type->topology[die].segment = pci_domain_nr(dev->bus);
+ type->topology[die][stack_id].iio->segment = pci_domain_nr(dev->bus);
+ type->topology[die][stack_id].pmu_idx = stack_id;
+ type->topology[die][stack_id].iio->pci_bus_no = dev->bus->number;
}
- if (ret) {
- kfree(type->topology);
- type->topology = NULL;
- }
+ pci_dev_put(dev);
return ret;
}
@@ -4519,14 +4773,14 @@ static int snr_iio_get_topology(struct intel_uncore_type *type)
return sad_cfg_iio_topology(type, snr_sad_pmon_mapping);
}
-static int snr_iio_set_mapping(struct intel_uncore_type *type)
+static void snr_iio_set_mapping(struct intel_uncore_type *type)
{
- return pmu_iio_set_mapping(type, &snr_iio_mapping_group);
+ pmu_iio_set_mapping(type, &snr_iio_mapping_group);
}
static void snr_iio_cleanup_mapping(struct intel_uncore_type *type)
{
- pmu_iio_cleanup_mapping(type, &snr_iio_mapping_group);
+ pmu_cleanup_mapping(type, &snr_iio_mapping_group);
}
static struct event_constraint snr_uncore_iio_constraints[] = {
@@ -4857,6 +5111,8 @@ static int snr_uncore_mmio_map(struct intel_uncore_box *box,
addr += box_ctl;
+ pci_dev_put(pdev);
+
box->io_addr = ioremap(addr, type->mmio_map_size);
if (!box->io_addr) {
pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name);
@@ -5137,14 +5393,19 @@ static int icx_iio_get_topology(struct intel_uncore_type *type)
return sad_cfg_iio_topology(type, icx_sad_pmon_mapping);
}
-static int icx_iio_set_mapping(struct intel_uncore_type *type)
+static void icx_iio_set_mapping(struct intel_uncore_type *type)
{
- return pmu_iio_set_mapping(type, &icx_iio_mapping_group);
+ /* Detect ICX-D system. This case is not supported */
+ if (boot_cpu_data.x86_model == INTEL_FAM6_ICELAKE_D) {
+ pmu_clear_mapping_attr(type->attr_update, &icx_iio_mapping_group);
+ return;
+ }
+ pmu_iio_set_mapping(type, &icx_iio_mapping_group);
}
static void icx_iio_cleanup_mapping(struct intel_uncore_type *type)
{
- pmu_iio_cleanup_mapping(type, &icx_iio_mapping_group);
+ pmu_cleanup_mapping(type, &icx_iio_mapping_group);
}
static struct intel_uncore_type icx_uncore_iio = {
@@ -5337,6 +5598,76 @@ static const struct attribute_group icx_upi_uncore_format_group = {
.attrs = icx_upi_uncore_formats_attr,
};
+#define ICX_UPI_REGS_ADDR_DEVICE_LINK0 0x02
+#define ICX_UPI_REGS_ADDR_FUNCTION 0x01
+
+static int discover_upi_topology(struct intel_uncore_type *type, int ubox_did, int dev_link0)
+{
+ struct pci_dev *ubox = NULL;
+ struct pci_dev *dev = NULL;
+ u32 nid, gid;
+ int i, idx, ret = -EPERM;
+ struct intel_uncore_topology *upi;
+ unsigned int devfn;
+
+ /* GIDNIDMAP method supports machines which have no more than 8 sockets. */
+ if (uncore_max_dies() > 8)
+ goto err;
+
+ while ((ubox = pci_get_device(PCI_VENDOR_ID_INTEL, ubox_did, ubox))) {
+ ret = upi_nodeid_groupid(ubox, SKX_CPUNODEID, SKX_GIDNIDMAP, &nid, &gid);
+ if (ret) {
+ ret = pcibios_err_to_errno(ret);
+ break;
+ }
+
+ for (i = 0; i < 8; i++) {
+ if (nid != GIDNIDMAP(gid, i))
+ continue;
+ for (idx = 0; idx < type->num_boxes; idx++) {
+ upi = &type->topology[nid][idx];
+ devfn = PCI_DEVFN(dev_link0 + idx, ICX_UPI_REGS_ADDR_FUNCTION);
+ dev = pci_get_domain_bus_and_slot(pci_domain_nr(ubox->bus),
+ ubox->bus->number,
+ devfn);
+ if (dev) {
+ ret = upi_fill_topology(dev, upi, idx);
+ if (ret)
+ goto err;
+ }
+ }
+ }
+ }
+err:
+ pci_dev_put(ubox);
+ pci_dev_put(dev);
+ return ret;
+}
+
+static int icx_upi_get_topology(struct intel_uncore_type *type)
+{
+ return discover_upi_topology(type, ICX_UBOX_DID, ICX_UPI_REGS_ADDR_DEVICE_LINK0);
+}
+
+static struct attribute_group icx_upi_mapping_group = {
+ .is_visible = skx_upi_mapping_visible,
+};
+
+static const struct attribute_group *icx_upi_attr_update[] = {
+ &icx_upi_mapping_group,
+ NULL
+};
+
+static void icx_upi_set_mapping(struct intel_uncore_type *type)
+{
+ pmu_upi_set_mapping(type, &icx_upi_mapping_group);
+}
+
+static void icx_upi_cleanup_mapping(struct intel_uncore_type *type)
+{
+ pmu_cleanup_mapping(type, &icx_upi_mapping_group);
+}
+
static struct intel_uncore_type icx_uncore_upi = {
.name = "upi",
.num_counters = 4,
@@ -5349,6 +5680,10 @@ static struct intel_uncore_type icx_uncore_upi = {
.box_ctl = ICX_UPI_PCI_PMON_BOX_CTL,
.ops = &skx_upi_uncore_pci_ops,
.format_group = &icx_upi_uncore_format_group,
+ .attr_update = icx_upi_attr_update,
+ .get_topology = icx_upi_get_topology,
+ .set_mapping = icx_upi_set_mapping,
+ .cleanup_mapping = icx_upi_cleanup_mapping,
};
static struct event_constraint icx_uncore_m3upi_constraints[] = {
@@ -5780,9 +6115,43 @@ static struct intel_uncore_type spr_uncore_m2m = {
.name = "m2m",
};
+static struct attribute_group spr_upi_mapping_group = {
+ .is_visible = skx_upi_mapping_visible,
+};
+
+static const struct attribute_group *spr_upi_attr_update[] = {
+ &uncore_alias_group,
+ &spr_upi_mapping_group,
+ NULL
+};
+
+#define SPR_UPI_REGS_ADDR_DEVICE_LINK0 0x01
+
+static void spr_upi_set_mapping(struct intel_uncore_type *type)
+{
+ pmu_upi_set_mapping(type, &spr_upi_mapping_group);
+}
+
+static void spr_upi_cleanup_mapping(struct intel_uncore_type *type)
+{
+ pmu_cleanup_mapping(type, &spr_upi_mapping_group);
+}
+
+static int spr_upi_get_topology(struct intel_uncore_type *type)
+{
+ return discover_upi_topology(type, SPR_UBOX_DID, SPR_UPI_REGS_ADDR_DEVICE_LINK0);
+}
+
static struct intel_uncore_type spr_uncore_upi = {
- SPR_UNCORE_PCI_COMMON_FORMAT(),
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SPR_RAW_EVENT_MASK_EXT,
+ .format_group = &spr_uncore_raw_format_group,
+ .ops = &spr_uncore_pci_ops,
.name = "upi",
+ .attr_update = spr_upi_attr_update,
+ .get_topology = spr_upi_get_topology,
+ .set_mapping = spr_upi_set_mapping,
+ .cleanup_mapping = spr_upi_cleanup_mapping,
};
static struct intel_uncore_type spr_uncore_m3upi = {
@@ -5986,6 +6355,12 @@ static void uncore_type_customized_copy(struct intel_uncore_type *to_type,
to_type->format_group = from_type->format_group;
if (from_type->attr_update)
to_type->attr_update = from_type->attr_update;
+ if (from_type->set_mapping)
+ to_type->set_mapping = from_type->set_mapping;
+ if (from_type->get_topology)
+ to_type->get_topology = from_type->get_topology;
+ if (from_type->cleanup_mapping)
+ to_type->cleanup_mapping = from_type->cleanup_mapping;
}
static struct intel_uncore_type **
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 332d2e6d8ae4..0e849f28a5c1 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -811,7 +811,7 @@ struct x86_pmu {
void (*cpu_dead)(int cpu);
void (*check_microcode)(void);
- void (*sched_task)(struct perf_event_context *ctx,
+ void (*sched_task)(struct perf_event_pmu_context *pmu_ctx,
bool sched_in);
/*
@@ -894,12 +894,12 @@ struct x86_pmu {
int num_topdown_events;
/*
- * perf task context (i.e. struct perf_event_context::task_ctx_data)
+ * perf task context (i.e. struct perf_event_pmu_context::task_ctx_data)
* switch helper to bridge calls from perf/core to perf/x86.
* See struct pmu::swap_task_ctx() usage for examples;
*/
- void (*swap_task_ctx)(struct perf_event_context *prev,
- struct perf_event_context *next);
+ void (*swap_task_ctx)(struct perf_event_pmu_context *prev_epc,
+ struct perf_event_pmu_context *next_epc);
/*
* AMD bits
@@ -925,7 +925,7 @@ struct x86_pmu {
int (*aux_output_match) (struct perf_event *event);
- int (*filter_match)(struct perf_event *event);
+ void (*filter)(struct pmu *pmu, int cpu, bool *ret);
/*
* Hybrid support
*
@@ -1180,8 +1180,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs);
void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
u64 intel_ctrl);
-void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu);
-
extern struct event_constraint emptyconstraint;
extern struct event_constraint unconstrained;
@@ -1306,7 +1304,7 @@ void amd_pmu_lbr_reset(void);
void amd_pmu_lbr_read(void);
void amd_pmu_lbr_add(struct perf_event *event);
void amd_pmu_lbr_del(struct perf_event *event);
-void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
+void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
void amd_pmu_lbr_enable_all(void);
void amd_pmu_lbr_disable_all(void);
int amd_pmu_lbr_hw_config(struct perf_event *event);
@@ -1322,7 +1320,6 @@ void amd_brs_enable_all(void);
void amd_brs_disable_all(void);
void amd_brs_drain(void);
void amd_brs_lopwr_init(void);
-void amd_brs_disable_all(void);
int amd_brs_hw_config(struct perf_event *event);
void amd_brs_reset(void);
@@ -1330,7 +1327,7 @@ static inline void amd_pmu_brs_add(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- perf_sched_cb_inc(event->ctx->pmu);
+ perf_sched_cb_inc(event->pmu);
cpuc->lbr_users++;
/*
* No need to reset BRS because it is reset
@@ -1345,10 +1342,10 @@ static inline void amd_pmu_brs_del(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
- perf_sched_cb_dec(event->ctx->pmu);
+ perf_sched_cb_dec(event->pmu);
}
-void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in);
+void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
#else
static inline int amd_brs_init(void)
{
@@ -1373,7 +1370,7 @@ static inline void amd_pmu_brs_del(struct perf_event *event)
{
}
-static inline void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in)
+static inline void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
}
@@ -1533,7 +1530,7 @@ void intel_pmu_pebs_enable_all(void);
void intel_pmu_pebs_disable_all(void);
-void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
+void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
void intel_pmu_auto_reload_read(struct perf_event *event);
@@ -1541,10 +1538,10 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr);
void intel_ds_init(void);
-void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
- struct perf_event_context *next);
+void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
+ struct perf_event_pmu_context *next_epc);
-void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
+void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
u64 lbr_from_signext_quirk_wr(u64 val);
diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index bb56676f50ef..9b593f985805 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -547,15 +547,14 @@ static void armpmu_disable(struct pmu *pmu)
* microarchitecture, and aren't suitable for another. Thus, only match CPUs of
* the same microarchitecture.
*/
-static int armpmu_filter_match(struct perf_event *event)
+static bool armpmu_filter(struct pmu *pmu, int cpu)
{
- struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
- unsigned int cpu = smp_processor_id();
- int ret;
+ struct arm_pmu *armpmu = to_arm_pmu(pmu);
+ bool ret;
ret = cpumask_test_cpu(cpu, &armpmu->supported_cpus);
- if (ret && armpmu->filter_match)
- return armpmu->filter_match(event);
+ if (ret && armpmu->filter)
+ return armpmu->filter(pmu, cpu);
return ret;
}
@@ -882,14 +881,13 @@ struct arm_pmu *armpmu_alloc(void)
.start = armpmu_start,
.stop = armpmu_stop,
.read = armpmu_read,
- .filter_match = armpmu_filter_match,
+ .filter = armpmu_filter,
.attr_groups = pmu->attr_groups,
/*
* This is a CPU PMU potentially in a heterogeneous
* configuration (e.g. big.LITTLE). This is not an uncore PMU,
* and we have taken ctx sharing into account (e.g. with our
- * pmu::filter_match callback and pmu::event_init group
- * validation).
+ * pmu::filter callback and pmu::event_init group validation).
*/
.capabilities = PERF_PMU_CAP_HETEROGENEOUS_CPUS | PERF_PMU_CAP_EXTENDED_REGS,
};
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index 0c15c5b7f801..ef914a600087 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -100,7 +100,7 @@ struct arm_pmu {
void (*stop)(struct arm_pmu *);
void (*reset)(void *);
int (*map_event)(struct perf_event *event);
- int (*filter_match)(struct perf_event *event);
+ bool (*filter)(struct pmu *pmu, int cpu);
int num_events;
bool secure_access; /* 32-bit ARM only */
#define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0031f7b4d9ab..c6a3bac76966 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -266,6 +266,7 @@ struct hw_perf_event {
};
struct perf_event;
+struct perf_event_pmu_context;
/*
* Common implementation detail of pmu::{start,commit,cancel}_txn
@@ -308,7 +309,7 @@ struct pmu {
int capabilities;
int __percpu *pmu_disable_count;
- struct perf_cpu_context __percpu *pmu_cpu_context;
+ struct perf_cpu_pmu_context __percpu *cpu_pmu_context;
atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */
int task_ctx_nr;
int hrtimer_interval_ms;
@@ -443,7 +444,7 @@ struct pmu {
/*
* context-switches callback
*/
- void (*sched_task) (struct perf_event_context *ctx,
+ void (*sched_task) (struct perf_event_pmu_context *pmu_ctx,
bool sched_in);
/*
@@ -457,8 +458,8 @@ struct pmu {
* implementation and Perf core context switch handling callbacks for usage
* examples.
*/
- void (*swap_task_ctx) (struct perf_event_context *prev,
- struct perf_event_context *next);
+ void (*swap_task_ctx) (struct perf_event_pmu_context *prev_epc,
+ struct perf_event_pmu_context *next_epc);
/* optional */
/*
@@ -522,9 +523,10 @@ struct pmu {
/* optional */
/*
- * Filter events for PMU-specific reasons.
+ * Skip programming this PMU on the given CPU. Typically needed for
+ * big.LITTLE things.
*/
- int (*filter_match) (struct perf_event *event); /* optional */
+ bool (*filter) (struct pmu *pmu, int cpu); /* optional */
/*
* Check period value for PERF_EVENT_IOC_PERIOD ioctl.
@@ -695,6 +697,11 @@ struct perf_event {
int group_caps;
struct perf_event *group_leader;
+ /*
+ * event->pmu always points to the pmu to which this event belongs,
+ * whereas event->pmu_ctx->pmu may point to another pmu when a group of
+ * events from different pmus is created.
+ */
struct pmu *pmu;
void *pmu_private;
@@ -720,6 +727,12 @@ struct perf_event {
struct hw_perf_event hw;
struct perf_event_context *ctx;
+ /*
+ * event->pmu_ctx points to the perf_event_pmu_context to which the event
+ * was added. This pmu_ctx can belong to another pmu for a sw event when
+ * that sw event is part of a group which also contains non-sw events.
+ */
+ struct perf_event_pmu_context *pmu_ctx;
atomic_long_t refcount;
/*
@@ -812,19 +825,69 @@ struct perf_event {
#endif /* CONFIG_PERF_EVENTS */
};
+/*
+ * ,-----------------------[1:n]----------------------.
+ * V V
+ * perf_event_context <-[1:n]-> perf_event_pmu_context <--- perf_event
+ * ^ ^ | |
+ * `--------[1:n]---------' `-[n:1]-> pmu <-[1:n]-'
+ *
+ *
+ * struct perf_event_pmu_context lifetime is refcount based and RCU freed
+ * (similar to perf_event_context). Locking is as if it were a member of
+ * perf_event_context; specifically:
+ *
+ * modification, both: ctx->mutex && ctx->lock
+ * reading, either: ctx->mutex || ctx->lock
+ *
+ * There is one exception to this; namely put_pmu_ctx() isn't always called
+ * with ctx->mutex held; this means that as long as we can guarantee the epc
+ * has events the above rules hold.
+ *
+ * Specifically, sys_perf_event_open()'s group_leader case depends on
+ * ctx->mutex pinning the configuration. Since we hold a reference on
+ * group_leader (through the filedesc) it can't go away, therefore its
+ * associated pmu_ctx must exist and cannot change due to ctx->mutex.
+ */
+struct perf_event_pmu_context {
+ struct pmu *pmu;
+ struct perf_event_context *ctx;
+
+ struct list_head pmu_ctx_entry;
+
+ struct list_head pinned_active;
+ struct list_head flexible_active;
+
+ /* Used to avoid freeing per-cpu perf_event_pmu_context */
+ unsigned int embedded : 1;
+
+ unsigned int nr_events;
+
+ atomic_t refcount; /* event <-> epc */
+ struct rcu_head rcu_head;
+
+ void *task_ctx_data; /* pmu specific data */
+ /*
+ * Set when one or more (plausibly active) event can't be scheduled
+ * due to pmu overcommit or pmu constraints, except tolerant to
+ * events not necessary to be active due to scheduling constraints,
+ * such as cgroups.
+ */
+ int rotate_necessary;
+};
struct perf_event_groups {
struct rb_root tree;
u64 index;
};
+
/**
* struct perf_event_context - event context structure
*
* Used as a container for task events and CPU events as well:
*/
struct perf_event_context {
- struct pmu *pmu;
/*
* Protect the states of the events in the list,
* nr_active, and the list:
@@ -837,27 +900,21 @@ struct perf_event_context {
*/
struct mutex mutex;
- struct list_head active_ctx_list;
+ struct list_head pmu_ctx_list;
struct perf_event_groups pinned_groups;
struct perf_event_groups flexible_groups;
struct list_head event_list;
- struct list_head pinned_active;
- struct list_head flexible_active;
-
int nr_events;
- int nr_active;
int nr_user;
int is_active;
+
+ int nr_task_data;
int nr_stat;
int nr_freq;
int rotate_disable;
- /*
- * Set when nr_events != nr_active, except tolerant to events not
- * necessary to be active due to scheduling constraints, such as cgroups.
- */
- int rotate_necessary;
- refcount_t refcount;
+
+ refcount_t refcount; /* event <-> ctx */
struct task_struct *task;
/*
@@ -878,7 +935,6 @@ struct perf_event_context {
#ifdef CONFIG_CGROUP_PERF
int nr_cgroups; /* cgroup evts */
#endif
- void *task_ctx_data; /* pmu specific data */
struct rcu_head rcu_head;
/*
@@ -896,12 +952,13 @@ struct perf_event_context {
*/
#define PERF_NR_CONTEXTS 4
-/**
- * struct perf_cpu_context - per cpu event context structure
- */
-struct perf_cpu_context {
- struct perf_event_context ctx;
- struct perf_event_context *task_ctx;
+struct perf_cpu_pmu_context {
+ struct perf_event_pmu_context epc;
+ struct perf_event_pmu_context *task_epc;
+
+ struct list_head sched_cb_entry;
+ int sched_cb_usage;
+
int active_oncpu;
int exclusive;
@@ -909,16 +966,20 @@ struct perf_cpu_context {
struct hrtimer hrtimer;
ktime_t hrtimer_interval;
unsigned int hrtimer_active;
+};
+
+/**
+ * struct perf_cpu_context - per cpu event context structure
+ */
+struct perf_cpu_context {
+ struct perf_event_context ctx;
+ struct perf_event_context *task_ctx;
+ int online;
#ifdef CONFIG_CGROUP_PERF
struct perf_cgroup *cgrp;
- struct list_head cgrp_cpuctx_entry;
#endif
- struct list_head sched_cb_entry;
- int sched_cb_usage;
-
- int online;
/*
* Per-CPU storage for iterators used in visit_groups_merge. The default
* storage is of size 2 to hold the CPU and any CPU event iterators.
@@ -982,6 +1043,8 @@ perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx)
#ifdef CONFIG_PERF_EVENTS
+extern struct perf_event_context *perf_cpu_task_ctx(void);
+
extern void *perf_aux_output_begin(struct perf_output_handle *handle,
struct perf_event *event);
extern void perf_aux_output_end(struct perf_output_handle *handle,
@@ -1187,7 +1250,7 @@ static inline int is_software_event(struct perf_event *event)
*/
static inline int in_software_context(struct perf_event *event)
{
- return event->ctx->pmu->task_ctx_nr == perf_sw_context;
+ return event->pmu_ctx->pmu->task_ctx_nr == perf_sw_context;
}
static inline int is_exclusive_pmu(struct pmu *pmu)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ffb6eb55cd13..4e03f1dcbe52 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1243,7 +1243,7 @@ struct task_struct {
unsigned int futex_state;
#endif
#ifdef CONFIG_PERF_EVENTS
- struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
+ struct perf_event_context *perf_event_ctxp;
struct mutex perf_event_mutex;
struct list_head perf_event_list;
#endif
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7f04f995c975..e47914ac8732 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -155,12 +155,6 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
return data.ret;
}
-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
-{
- return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
-}
-
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
@@ -184,6 +178,14 @@ static bool is_kernel_event(struct perf_event *event)
return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}
+static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+
+struct perf_event_context *perf_cpu_task_ctx(void)
+{
+ lockdep_assert_irqs_disabled();
+ return this_cpu_ptr(&perf_cpu_context)->task_ctx;
+}
+
/*
* On task ctx scheduling...
*
@@ -217,7 +219,7 @@ static int event_function(void *info)
struct event_function_struct *efs = info;
struct perf_event *event = efs->event;
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
int ret = 0;
@@ -314,7 +316,7 @@ again:
static void event_function_local(struct perf_event *event, event_f func, void *data)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct task_struct *task = READ_ONCE(ctx->task);
struct perf_event_context *task_ctx = NULL;
@@ -388,7 +390,6 @@ static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
@@ -448,7 +449,7 @@ static void update_perf_cpu_limits(void)
WRITE_ONCE(perf_sample_allowed_ns, tmp);
}
-static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
+static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
int perf_proc_update_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
@@ -571,12 +572,6 @@ void perf_sample_event_took(u64 sample_len_ns)
static atomic64_t perf_event_id;
-static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-
-static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-
static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
@@ -691,13 +686,31 @@ do { \
___p; \
})
+static void perf_ctx_disable(struct perf_event_context *ctx)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ perf_pmu_disable(pmu_ctx->pmu);
+}
+
+static void perf_ctx_enable(struct perf_event_context *ctx)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ perf_pmu_enable(pmu_ctx->pmu);
+}
+
+static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
+static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
+
#ifdef CONFIG_CGROUP_PERF
static inline bool
perf_cgroup_match(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
/* @event doesn't care about cgroup */
if (!event->cgrp)
@@ -823,54 +836,39 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
}
}
-static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
-
/*
* reschedule events based on the cgroup constraint of task.
*/
static void perf_cgroup_switch(struct task_struct *task)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_cgroup *cgrp;
- struct perf_cpu_context *cpuctx, *tmp;
- struct list_head *list;
- unsigned long flags;
-
- /*
- * Disable interrupts and preemption to avoid this CPU's
- * cgrp_cpuctx_entry to change under us.
- */
- local_irq_save(flags);
cgrp = perf_cgroup_from_task(task, NULL);
- list = this_cpu_ptr(&cgrp_cpuctx_list);
- list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
- WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
- if (READ_ONCE(cpuctx->cgrp) == cgrp)
- continue;
-
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
+ WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
+ if (READ_ONCE(cpuctx->cgrp) == cgrp)
+ return;
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
- /*
- * must not be done before ctxswout due
- * to update_cgrp_time_from_cpuctx() in
- * ctx_sched_out()
- */
- cpuctx->cgrp = cgrp;
- /*
- * set cgrp before ctxsw in to allow
- * perf_cgroup_set_timestamp() in ctx_sched_in()
- * to not have to pass task around
- */
- cpu_ctx_sched_in(cpuctx, EVENT_ALL);
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_ctx_disable(&cpuctx->ctx);
- perf_pmu_enable(cpuctx->ctx.pmu);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
- }
+ ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
+ /*
+ * must not be done before ctxswout due
+ * to update_cgrp_time_from_cpuctx() in
+ * ctx_sched_out()
+ */
+ cpuctx->cgrp = cgrp;
+ /*
+ * set cgrp before ctxsw in to allow
+ * perf_cgroup_set_timestamp() in ctx_sched_in()
+ * to not have to pass task around
+ */
+ ctx_sched_in(&cpuctx->ctx, EVENT_ALL);
- local_irq_restore(flags);
+ perf_ctx_enable(&cpuctx->ctx);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
static int perf_cgroup_ensure_storage(struct perf_event *event,
@@ -888,7 +886,7 @@ static int perf_cgroup_ensure_storage(struct perf_event *event,
heap_size++;
for_each_possible_cpu(cpu) {
- cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
if (heap_size <= cpuctx->heap_size)
continue;
@@ -972,8 +970,6 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct
return;
cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
- list_add(&cpuctx->cgrp_cpuctx_entry,
- per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
}
static inline void
@@ -994,7 +990,6 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
return;
cpuctx->cgrp = NULL;
- list_del(&cpuctx->cgrp_cpuctx_entry);
}
#else /* !CONFIG_CGROUP_PERF */
@@ -1069,34 +1064,30 @@ static void perf_cgroup_switch(struct task_struct *task)
*/
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
- struct perf_cpu_context *cpuctx;
+ struct perf_cpu_pmu_context *cpc;
bool rotations;
lockdep_assert_irqs_disabled();
- cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
- rotations = perf_rotate_context(cpuctx);
+ cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer);
+ rotations = perf_rotate_context(cpc);
- raw_spin_lock(&cpuctx->hrtimer_lock);
+ raw_spin_lock(&cpc->hrtimer_lock);
if (rotations)
- hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+ hrtimer_forward_now(hr, cpc->hrtimer_interval);
else
- cpuctx->hrtimer_active = 0;
- raw_spin_unlock(&cpuctx->hrtimer_lock);
+ cpc->hrtimer_active = 0;
+ raw_spin_unlock(&cpc->hrtimer_lock);
return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
}
-static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu)
{
- struct hrtimer *timer = &cpuctx->hrtimer;
- struct pmu *pmu = cpuctx->ctx.pmu;
+ struct hrtimer *timer = &cpc->hrtimer;
+ struct pmu *pmu = cpc->epc.pmu;
u64 interval;
- /* no multiplexing needed for SW PMU */
- if (pmu->task_ctx_nr == perf_sw_context)
- return;
-
/*
* check default is sane, if not set then force to
* default interval (1/tick)
@@ -1105,34 +1096,34 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
if (interval < 1)
interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
- cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
+ cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
- raw_spin_lock_init(&cpuctx->hrtimer_lock);
+ raw_spin_lock_init(&cpc->hrtimer_lock);
hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
timer->function = perf_mux_hrtimer_handler;
}
-static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
+static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc)
{
- struct hrtimer *timer = &cpuctx->hrtimer;
- struct pmu *pmu = cpuctx->ctx.pmu;
+ struct hrtimer *timer = &cpc->hrtimer;
unsigned long flags;
- /* not for SW PMU */
- if (pmu->task_ctx_nr == perf_sw_context)
- return 0;
-
- raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
- if (!cpuctx->hrtimer_active) {
- cpuctx->hrtimer_active = 1;
- hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
+ raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags);
+ if (!cpc->hrtimer_active) {
+ cpc->hrtimer_active = 1;
+ hrtimer_forward_now(timer, cpc->hrtimer_interval);
hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
}
- raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
+ raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags);
return 0;
}
+static int perf_mux_hrtimer_restart_ipi(void *arg)
+{
+ return perf_mux_hrtimer_restart(arg);
+}
+
void perf_pmu_disable(struct pmu *pmu)
{
int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -1147,32 +1138,9 @@ void perf_pmu_enable(struct pmu *pmu)
pmu->pmu_enable(pmu);
}
-static DEFINE_PER_CPU(struct list_head, active_ctx_list);
-
-/*
- * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
- * perf_event_task_tick() are fully serialized because they're strictly cpu
- * affine and perf_event_ctx{activate,deactivate} are called with IRQs
- * disabled, while perf_event_task_tick is called from IRQ context.
- */
-static void perf_event_ctx_activate(struct perf_event_context *ctx)
+static void perf_assert_pmu_disabled(struct pmu *pmu)
{
- struct list_head *head = this_cpu_ptr(&active_ctx_list);
-
- lockdep_assert_irqs_disabled();
-
- WARN_ON(!list_empty(&ctx->active_ctx_list));
-
- list_add(&ctx->active_ctx_list, head);
-}
-
-static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
-{
- lockdep_assert_irqs_disabled();
-
- WARN_ON(list_empty(&ctx->active_ctx_list));
-
- list_del_init(&ctx->active_ctx_list);
+ WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0);
}
static void get_ctx(struct perf_event_context *ctx)
@@ -1199,7 +1167,6 @@ static void free_ctx(struct rcu_head *head)
struct perf_event_context *ctx;
ctx = container_of(head, struct perf_event_context, rcu_head);
- free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
kfree(ctx);
}
@@ -1384,7 +1351,7 @@ static u64 primary_event_id(struct perf_event *event)
* the context could get moved to another task.
*/
static struct perf_event_context *
-perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
+perf_lock_task_context(struct task_struct *task, unsigned long *flags)
{
struct perf_event_context *ctx;
@@ -1400,7 +1367,7 @@ retry:
*/
local_irq_save(*flags);
rcu_read_lock();
- ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
+ ctx = rcu_dereference(task->perf_event_ctxp);
if (ctx) {
/*
* If this context is a clone of another, it might
@@ -1413,7 +1380,7 @@ retry:
* can't get swapped on us any more.
*/
raw_spin_lock(&ctx->lock);
- if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
+ if (ctx != rcu_dereference(task->perf_event_ctxp)) {
raw_spin_unlock(&ctx->lock);
rcu_read_unlock();
local_irq_restore(*flags);
@@ -1440,12 +1407,12 @@ retry:
* reference count so that the context can't get freed.
*/
static struct perf_event_context *
-perf_pin_task_context(struct task_struct *task, int ctxn)
+perf_pin_task_context(struct task_struct *task)
{
struct perf_event_context *ctx;
unsigned long flags;
- ctx = perf_lock_task_context(task, ctxn, &flags);
+ ctx = perf_lock_task_context(task, &flags);
if (ctx) {
++ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -1593,14 +1560,22 @@ static inline struct cgroup *event_cgroup(const struct perf_event *event)
* which provides ordering when rotating groups for the same CPU.
*/
static __always_inline int
-perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
- const u64 left_group_index, const struct perf_event *right)
+perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu,
+ const struct cgroup *left_cgroup, const u64 left_group_index,
+ const struct perf_event *right)
{
if (left_cpu < right->cpu)
return -1;
if (left_cpu > right->cpu)
return 1;
+ if (left_pmu) {
+ if (left_pmu < right->pmu_ctx->pmu)
+ return -1;
+ if (left_pmu > right->pmu_ctx->pmu)
+ return 1;
+ }
+
#ifdef CONFIG_CGROUP_PERF
{
const struct cgroup *right_cgroup = event_cgroup(right);
@@ -1643,12 +1618,13 @@ perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
{
struct perf_event *e = __node_2_pe(a);
- return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
- __node_2_pe(b)) < 0;
+ return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e),
+ e->group_index, __node_2_pe(b)) < 0;
}
struct __group_key {
int cpu;
+ struct pmu *pmu;
struct cgroup *cgroup;
};
@@ -1657,14 +1633,25 @@ static inline int __group_cmp(const void *key, const struct rb_node *node)
const struct __group_key *a = key;
const struct perf_event *b = __node_2_pe(node);
- /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */
- return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
+ /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */
+ return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b);
+}
+
+static inline int
+__group_cmp_ignore_cgroup(const void *key, const struct rb_node *node)
+{
+ const struct __group_key *a = key;
+ const struct perf_event *b = __node_2_pe(node);
+
+ /* partial/subtree match: @cpu, @pmu; ignore: @cgroup, @group_index */
+ return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b),
+ b->group_index, b);
}
/*
- * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
- * key (see perf_event_groups_less). This places it last inside the CPU
- * subtree.
+ * Insert @event into @groups' tree; using
+ * {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index}
+ * as key. This places it last inside the {cpu,pmu,cgroup} subtree.
*/
static void
perf_event_groups_insert(struct perf_event_groups *groups,
@@ -1714,14 +1701,15 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
}
/*
- * Get the leftmost event in the cpu/cgroup subtree.
+ * Get the leftmost event in the {cpu,pmu,cgroup} subtree.
*/
static struct perf_event *
perf_event_groups_first(struct perf_event_groups *groups, int cpu,
- struct cgroup *cgrp)
+ struct pmu *pmu, struct cgroup *cgrp)
{
struct __group_key key = {
.cpu = cpu,
+ .pmu = pmu,
.cgroup = cgrp,
};
struct rb_node *node;
@@ -1733,14 +1721,12 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu,
return NULL;
}
-/*
- * Like rb_entry_next_safe() for the @cpu subtree.
- */
static struct perf_event *
-perf_event_groups_next(struct perf_event *event)
+perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
{
struct __group_key key = {
.cpu = event->cpu,
+ .pmu = pmu,
.cgroup = event_cgroup(event),
};
struct rb_node *next;
@@ -1752,6 +1738,10 @@ perf_event_groups_next(struct perf_event *event)
return NULL;
}
+#define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \
+ for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \
+ event; event = perf_event_groups_next(event, pmu))
+
/*
* Iterate through the whole groups tree.
*/
@@ -1796,6 +1786,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
perf_cgroup_event_enable(event, ctx);
ctx->generation++;
+ event->pmu_ctx->nr_events++;
}
/*
@@ -1941,7 +1932,8 @@ static void perf_group_attach(struct perf_event *event)
lockdep_assert_held(&event->ctx->lock);
/*
- * We can have double attach due to group movement in perf_event_open.
+ * We can have double attach due to group movement (move_group) in
+ * perf_event_open().
*/
if (event->attach_state & PERF_ATTACH_GROUP)
return;
@@ -2006,6 +1998,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
}
ctx->generation++;
+ event->pmu_ctx->nr_events--;
}
static int
@@ -2022,13 +2015,11 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
static void put_event(struct perf_event *event);
static void event_sched_out(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx);
static void perf_put_aux_event(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event *iter;
/*
@@ -2057,7 +2048,7 @@ static void perf_put_aux_event(struct perf_event *event)
* state so that we don't try to schedule it again. Note
* that perf_event_enable() will clear the ERROR status.
*/
- event_sched_out(iter, cpuctx, ctx);
+ event_sched_out(iter, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}
}
@@ -2108,8 +2099,8 @@ static int perf_get_aux_event(struct perf_event *event,
static inline struct list_head *get_event_list(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
- return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
+ return event->attr.pinned ? &event->pmu_ctx->pinned_active :
+ &event->pmu_ctx->flexible_active;
}
/*
@@ -2120,10 +2111,7 @@ static inline struct list_head *get_event_list(struct perf_event *event)
*/
static inline void perf_remove_sibling_event(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
-
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, event->ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}
@@ -2212,53 +2200,22 @@ static bool is_orphaned_event(struct perf_event *event)
return event->state == PERF_EVENT_STATE_DEAD;
}
-static inline int __pmu_filter_match(struct perf_event *event)
-{
- struct pmu *pmu = event->pmu;
- return pmu->filter_match ? pmu->filter_match(event) : 1;
-}
-
-/*
- * Check whether we should attempt to schedule an event group based on
- * PMU-specific filtering. An event group can consist of HW and SW events,
- * potentially with a SW leader, so we must check all the filters, to
- * determine whether a group is schedulable:
- */
-static inline int pmu_filter_match(struct perf_event *event)
-{
- struct perf_event *sibling;
- unsigned long flags;
- int ret = 1;
-
- if (!__pmu_filter_match(event))
- return 0;
-
- local_irq_save(flags);
- for_each_sibling_event(sibling, event) {
- if (!__pmu_filter_match(sibling)) {
- ret = 0;
- break;
- }
- }
- local_irq_restore(flags);
-
- return ret;
-}
-
static inline int
event_filter_match(struct perf_event *event)
{
return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
- perf_cgroup_match(event) && pmu_filter_match(event);
+ perf_cgroup_match(event);
}
static void
-event_sched_out(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
{
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
+ // XXX cpc serialization, probably per-cpu IRQ disabled
+
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
@@ -2301,38 +2258,32 @@ event_sched_out(struct perf_event *event,
perf_event_set_state(event, state);
if (!is_software_event(event))
- cpuctx->active_oncpu--;
- if (!--ctx->nr_active)
- perf_event_ctx_deactivate(ctx);
+ cpc->active_oncpu--;
if (event->attr.freq && event->attr.sample_freq)
ctx->nr_freq--;
- if (event->attr.exclusive || !cpuctx->active_oncpu)
- cpuctx->exclusive = 0;
+ if (event->attr.exclusive || !cpc->active_oncpu)
+ cpc->exclusive = 0;
perf_pmu_enable(event->pmu);
}
static void
-group_sched_out(struct perf_event *group_event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
{
struct perf_event *event;
if (group_event->state != PERF_EVENT_STATE_ACTIVE)
return;
- perf_pmu_disable(ctx->pmu);
+ perf_assert_pmu_disabled(group_event->pmu_ctx->pmu);
- event_sched_out(group_event, cpuctx, ctx);
+ event_sched_out(group_event, ctx);
/*
* Schedule out siblings (if any):
*/
for_each_sibling_event(event, group_event)
- event_sched_out(event, cpuctx, ctx);
-
- perf_pmu_enable(ctx->pmu);
+ event_sched_out(event, ctx);
}
#define DETACH_GROUP 0x01UL
@@ -2351,6 +2302,7 @@ __perf_remove_from_context(struct perf_event *event,
struct perf_event_context *ctx,
void *info)
{
+ struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
unsigned long flags = (unsigned long)info;
if (ctx->is_active & EVENT_TIME) {
@@ -2364,7 +2316,7 @@ __perf_remove_from_context(struct perf_event *event,
*/
if (flags & DETACH_DEAD)
event->pending_disable = 1;
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, ctx);
if (flags & DETACH_GROUP)
perf_group_detach(event);
if (flags & DETACH_CHILD)
@@ -2373,12 +2325,23 @@ __perf_remove_from_context(struct perf_event *event,
if (flags & DETACH_DEAD)
event->state = PERF_EVENT_STATE_DEAD;
+ if (!pmu_ctx->nr_events) {
+ pmu_ctx->rotate_necessary = 0;
+
+ if (ctx->task && ctx->is_active) {
+ struct perf_cpu_pmu_context *cpc;
+
+ cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = NULL;
+ }
+ }
+
if (!ctx->nr_events && ctx->is_active) {
if (ctx == &cpuctx->ctx)
update_cgrp_time_from_cpuctx(cpuctx, true);
ctx->is_active = 0;
- ctx->rotate_necessary = 0;
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
cpuctx->task_ctx = NULL;
@@ -2408,12 +2371,8 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla
* event_function_call() user.
*/
raw_spin_lock_irq(&ctx->lock);
- /*
- * Cgroup events are per-cpu events, and must IPI because of
- * cgrp_cpuctx_list.
- */
- if (!ctx->is_active && !is_cgroup_event(event)) {
- __perf_remove_from_context(event, __get_cpu_context(ctx),
+ if (!ctx->is_active) {
+ __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context),
ctx, (void *)flags);
raw_spin_unlock_irq(&ctx->lock);
return;
@@ -2439,13 +2398,17 @@ static void __perf_event_disable(struct perf_event *event,
update_cgrp_time_from_event(event);
}
+ perf_pmu_disable(event->pmu_ctx->pmu);
+
if (event == event->group_leader)
- group_sched_out(event, cpuctx, ctx);
+ group_sched_out(event, ctx);
else
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_OFF);
perf_cgroup_event_disable(event, ctx);
+
+ perf_pmu_enable(event->pmu_ctx->pmu);
}
/*
@@ -2507,10 +2470,10 @@ static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);
static int
-event_sched_in(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
{
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
int ret = 0;
WARN_ON_ONCE(event->ctx != ctx);
@@ -2551,14 +2514,12 @@ event_sched_in(struct perf_event *event,
}
if (!is_software_event(event))
- cpuctx->active_oncpu++;
- if (!ctx->nr_active++)
- perf_event_ctx_activate(ctx);
+ cpc->active_oncpu++;
if (event->attr.freq && event->attr.sample_freq)
ctx->nr_freq++;
if (event->attr.exclusive)
- cpuctx->exclusive = 1;
+ cpc->exclusive = 1;
out:
perf_pmu_enable(event->pmu);
@@ -2567,26 +2528,24 @@ out:
}
static int
-group_sched_in(struct perf_event *group_event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx)
{
struct perf_event *event, *partial_group = NULL;
- struct pmu *pmu = ctx->pmu;
+ struct pmu *pmu = group_event->pmu_ctx->pmu;
if (group_event->state == PERF_EVENT_STATE_OFF)
return 0;
pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
- if (event_sched_in(group_event, cpuctx, ctx))
+ if (event_sched_in(group_event, ctx))
goto error;
/*
* Schedule in siblings as one group (if any):
*/
for_each_sibling_event(event, group_event) {
- if (event_sched_in(event, cpuctx, ctx)) {
+ if (event_sched_in(event, ctx)) {
partial_group = event;
goto group_error;
}
@@ -2605,9 +2564,9 @@ group_error:
if (event == partial_group)
break;
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, ctx);
}
- event_sched_out(group_event, cpuctx, ctx);
+ event_sched_out(group_event, ctx);
error:
pmu->cancel_txn(pmu);
@@ -2617,10 +2576,11 @@ error:
/*
* Work out whether we can put this event group on the CPU now.
*/
-static int group_can_go_on(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- int can_add_hw)
+static int group_can_go_on(struct perf_event *event, int can_add_hw)
{
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
+
/*
* Groups consisting entirely of software events can always go on.
*/
@@ -2630,7 +2590,7 @@ static int group_can_go_on(struct perf_event *event,
* If an exclusive group is already on, no other hardware
* events can go on.
*/
- if (cpuctx->exclusive)
+ if (cpc->exclusive)
return 0;
/*
* If this group is exclusive and there are already
@@ -2652,36 +2612,29 @@ static void add_event_to_ctx(struct perf_event *event,
perf_group_attach(event);
}
-static void ctx_sched_out(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-static void
-ctx_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-
-static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx,
- enum event_type_t event_type)
+static void task_ctx_sched_out(struct perf_event_context *ctx,
+ enum event_type_t event_type)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
if (!cpuctx->task_ctx)
return;
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;
- ctx_sched_out(ctx, cpuctx, event_type);
+ ctx_sched_out(ctx, event_type);
}
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- cpu_ctx_sched_in(cpuctx, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, EVENT_FLEXIBLE);
}
/*
@@ -2699,11 +2652,15 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
* event_type is a bit mask of the types of events involved. For CPU events,
* event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
*/
+/*
+ * XXX: ctx_resched() reschedules the entire perf_event_context when adding a
+ * new event to the context or enabling an existing event in it. We can
+ * probably optimize this by rescheduling only the affected pmu_ctx.
+ */
static void ctx_resched(struct perf_cpu_context *cpuctx,
struct perf_event_context *task_ctx,
enum event_type_t event_type)
{
- enum event_type_t ctx_event_type;
bool cpu_event = !!(event_type & EVENT_CPU);
/*
@@ -2713,11 +2670,13 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
if (event_type & EVENT_PINNED)
event_type |= EVENT_FLEXIBLE;
- ctx_event_type = event_type & EVENT_ALL;
+ event_type &= EVENT_ALL;
- perf_pmu_disable(cpuctx->ctx.pmu);
- if (task_ctx)
- task_ctx_sched_out(cpuctx, task_ctx, event_type);
+ perf_ctx_disable(&cpuctx->ctx);
+ if (task_ctx) {
+ perf_ctx_disable(task_ctx);
+ task_ctx_sched_out(task_ctx, event_type);
+ }
/*
* Decide which cpu ctx groups to schedule out based on the types
@@ -2727,17 +2686,20 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
* - otherwise, do nothing more.
*/
if (cpu_event)
- cpu_ctx_sched_out(cpuctx, ctx_event_type);
- else if (ctx_event_type & EVENT_PINNED)
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_out(&cpuctx->ctx, event_type);
+ else if (event_type & EVENT_PINNED)
+ ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, task_ctx);
- perf_pmu_enable(cpuctx->ctx.pmu);
+
+ perf_ctx_enable(&cpuctx->ctx);
+ if (task_ctx)
+ perf_ctx_enable(task_ctx);
}
void perf_pmu_resched(struct pmu *pmu)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
perf_ctx_lock(cpuctx, task_ctx);
@@ -2755,7 +2717,7 @@ static int __perf_install_in_context(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
bool reprogram = true;
int ret = 0;
@@ -2797,7 +2759,7 @@ static int __perf_install_in_context(void *info)
#endif
if (reprogram) {
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
add_event_to_ctx(event, ctx);
ctx_resched(cpuctx, task_ctx, get_event_type(event));
} else {
@@ -2830,7 +2792,7 @@ perf_install_in_context(struct perf_event_context *ctx,
WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
if (event->cpu != -1)
- event->cpu = cpu;
+ WARN_ON_ONCE(event->cpu != cpu);
/*
* Ensures that if we can observe event->ctx, both the event and ctx
@@ -2842,8 +2804,6 @@ perf_install_in_context(struct perf_event_context *ctx,
* perf_event_attr::disabled events will not run and can be initialized
* without IPI. Except when this is the first event for the context, in
* that case we need the magic of the IPI to set ctx->is_active.
- * Similarly, cgroup events for the context also needs the IPI to
- * manipulate the cgrp_cpuctx_list.
*
* The IOC_ENABLE that is sure to follow the creation of a disabled
* event will issue the IPI and reprogram the hardware.
@@ -2945,7 +2905,7 @@ static void __perf_event_enable(struct perf_event *event,
return;
if (ctx->is_active)
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
perf_cgroup_event_enable(event, ctx);
@@ -2954,7 +2914,7 @@ static void __perf_event_enable(struct perf_event *event,
return;
if (!event_filter_match(event)) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_in(ctx, EVENT_TIME);
return;
}
@@ -2963,7 +2923,7 @@ static void __perf_event_enable(struct perf_event *event,
* then don't put it on unless the group is on.
*/
if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_in(ctx, EVENT_TIME);
return;
}
@@ -3232,11 +3192,52 @@ out:
return err;
}
-static void ctx_sched_out(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type)
{
+ struct perf_event_context *ctx = pmu_ctx->ctx;
struct perf_event *event, *tmp;
+ struct pmu *pmu = pmu_ctx->pmu;
+
+ if (ctx->task && !ctx->is_active) {
+ struct perf_cpu_pmu_context *cpc;
+
+ cpc = this_cpu_ptr(pmu->cpu_pmu_context);
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = NULL;
+ }
+
+ if (!event_type)
+ return;
+
+ perf_pmu_disable(pmu);
+ if (event_type & EVENT_PINNED) {
+ list_for_each_entry_safe(event, tmp,
+ &pmu_ctx->pinned_active,
+ active_list)
+ group_sched_out(event, ctx);
+ }
+
+ if (event_type & EVENT_FLEXIBLE) {
+ list_for_each_entry_safe(event, tmp,
+ &pmu_ctx->flexible_active,
+ active_list)
+ group_sched_out(event, ctx);
+ /*
+ * Since we cleared EVENT_FLEXIBLE, also clear
+ * rotate_necessary, it will be reset by
+ * ctx_flexible_sched_in() when needed.
+ */
+ pmu_ctx->rotate_necessary = 0;
+ }
+ perf_pmu_enable(pmu);
+}
+
+static void
+ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
lockdep_assert_held(&ctx->lock);
@@ -3284,27 +3285,8 @@ static void ctx_sched_out(struct perf_event_context *ctx,
is_active ^= ctx->is_active; /* changed bits */
- if (!ctx->nr_active || !(is_active & EVENT_ALL))
- return;
-
- perf_pmu_disable(ctx->pmu);
- if (is_active & EVENT_PINNED) {
- list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
- group_sched_out(event, cpuctx, ctx);
- }
-
- if (is_active & EVENT_FLEXIBLE) {
- list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
- group_sched_out(event, cpuctx, ctx);
-
- /*
- * Since we cleared EVENT_FLEXIBLE, also clear
- * rotate_necessary, is will be reset by
- * ctx_flexible_sched_in() when needed.
- */
- ctx->rotate_necessary = 0;
- }
- perf_pmu_enable(ctx->pmu);
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ __pmu_ctx_sched_out(pmu_ctx, is_active);
}
/*
@@ -3409,26 +3391,68 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
}
}
-static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
- struct task_struct *next)
+#define double_list_for_each_entry(pos1, pos2, head1, head2, member) \
+ for (pos1 = list_first_entry(head1, typeof(*pos1), member), \
+ pos2 = list_first_entry(head2, typeof(*pos2), member); \
+ !list_entry_is_head(pos1, head1, member) && \
+ !list_entry_is_head(pos2, head2, member); \
+ pos1 = list_next_entry(pos1, member), \
+ pos2 = list_next_entry(pos2, member))
+
+static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx,
+ struct perf_event_context *next_ctx)
{
- struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
+ struct perf_event_pmu_context *prev_epc, *next_epc;
+
+ if (!prev_ctx->nr_task_data)
+ return;
+
+ double_list_for_each_entry(prev_epc, next_epc,
+ &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list,
+ pmu_ctx_entry) {
+
+ if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu))
+ continue;
+
+ /*
+ * PMU specific parts of task perf context can require
+ * additional synchronization. As an example of such
+ * synchronization see implementation details of Intel
+ * LBR call stack data profiling;
+ */
+ if (prev_epc->pmu->swap_task_ctx)
+ prev_epc->pmu->swap_task_ctx(prev_epc, next_epc);
+ else
+ swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
+ }
+}
+
+static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+ struct perf_cpu_pmu_context *cpc;
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+
+ if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task)
+ pmu_ctx->pmu->sched_task(pmu_ctx, sched_in);
+ }
+}
+
+static void
+perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
+{
+ struct perf_event_context *ctx = task->perf_event_ctxp;
struct perf_event_context *next_ctx;
struct perf_event_context *parent, *next_parent;
- struct perf_cpu_context *cpuctx;
int do_switch = 1;
- struct pmu *pmu;
if (likely(!ctx))
return;
- pmu = ctx->pmu;
- cpuctx = __get_cpu_context(ctx);
- if (!cpuctx->task_ctx)
- return;
-
rcu_read_lock();
- next_ctx = next->perf_event_ctxp[ctxn];
+ next_ctx = rcu_dereference(next->perf_event_ctxp);
if (!next_ctx)
goto unlock;
@@ -3453,7 +3477,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
- perf_pmu_disable(pmu);
+ perf_ctx_disable(ctx);
/* PMIs are disabled; ctx->nr_pending is stable. */
if (local_read(&ctx->nr_pending) ||
@@ -3470,21 +3494,10 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
WRITE_ONCE(ctx->task, next);
WRITE_ONCE(next_ctx->task, task);
- if (cpuctx->sched_cb_usage && pmu->sched_task)
- pmu->sched_task(ctx, false);
-
- /*
- * PMU specific parts of task perf context can require
- * additional synchronization. As an example of such
- * synchronization see implementation details of Intel
- * LBR call stack data profiling;
- */
- if (pmu->swap_task_ctx)
- pmu->swap_task_ctx(ctx, next_ctx);
- else
- swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+ perf_ctx_sched_task_cb(ctx, false);
+ perf_event_swap_task_ctx_data(ctx, next_ctx);
- perf_pmu_enable(pmu);
+ perf_ctx_enable(ctx);
/*
* RCU_INIT_POINTER here is safe because we've not
@@ -3493,8 +3506,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
* since those values are always verified under
* ctx->lock which we're now holding.
*/
- RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
- RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
+ RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
+ RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
do_switch = 0;
@@ -3508,38 +3521,40 @@ unlock:
if (do_switch) {
raw_spin_lock(&ctx->lock);
- perf_pmu_disable(pmu);
+ perf_ctx_disable(ctx);
inside_switch:
- if (cpuctx->sched_cb_usage && pmu->sched_task)
- pmu->sched_task(ctx, false);
- task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
+ perf_ctx_sched_task_cb(ctx, false);
+ task_ctx_sched_out(ctx, EVENT_ALL);
- perf_pmu_enable(pmu);
+ perf_ctx_enable(ctx);
raw_spin_unlock(&ctx->lock);
}
}
static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
void perf_sched_cb_dec(struct pmu *pmu)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
this_cpu_dec(perf_sched_cb_usages);
+ barrier();
- if (!--cpuctx->sched_cb_usage)
- list_del(&cpuctx->sched_cb_entry);
+ if (!--cpc->sched_cb_usage)
+ list_del(&cpc->sched_cb_entry);
}
void perf_sched_cb_inc(struct pmu *pmu)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
- if (!cpuctx->sched_cb_usage++)
- list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+ if (!cpc->sched_cb_usage++)
+ list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+ barrier();
this_cpu_inc(perf_sched_cb_usages);
}
@@ -3551,19 +3566,21 @@ void perf_sched_cb_inc(struct pmu *pmu)
* PEBS requires this to provide PID/TID information. This requires we flush
* all queued PEBS records before we context switch to a new task.
*/
-static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
+static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct pmu *pmu;
- pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
+ pmu = cpc->epc.pmu;
+ /* software PMUs will not have sched_task */
if (WARN_ON_ONCE(!pmu->sched_task))
return;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(pmu);
- pmu->sched_task(cpuctx->task_ctx, sched_in);
+ pmu->sched_task(cpc->task_epc, sched_in);
perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -3573,26 +3590,20 @@ static void perf_pmu_sched_task(struct task_struct *prev,
struct task_struct *next,
bool sched_in)
{
- struct perf_cpu_context *cpuctx;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_cpu_pmu_context *cpc;
- if (prev == next)
+ /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */
+ if (prev == next || cpuctx->task_ctx)
return;
- list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
- /* will be handled in perf_event_context_sched_in/out */
- if (cpuctx->task_ctx)
- continue;
-
- __perf_pmu_sched_task(cpuctx, sched_in);
- }
+ list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
+ __perf_pmu_sched_task(cpc, sched_in);
}
static void perf_event_switch(struct task_struct *task,
struct task_struct *next_prev, bool sched_in);
-#define for_each_task_context_nr(ctxn) \
- for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
-
/*
* Called from scheduler to remove the events of the current task,
* with interrupts disabled.
@@ -3607,16 +3618,13 @@ static void perf_event_switch(struct task_struct *task,
void __perf_event_task_sched_out(struct task_struct *task,
struct task_struct *next)
{
- int ctxn;
-
if (__this_cpu_read(perf_sched_cb_usages))
perf_pmu_sched_task(task, next, false);
if (atomic_read(&nr_switch_events))
perf_event_switch(task, next, false);
- for_each_task_context_nr(ctxn)
- perf_event_context_sched_out(task, ctxn, next);
+ perf_event_context_sched_out(task, next);
/*
* if cgroup events exist on this CPU, then we need
@@ -3627,15 +3635,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
perf_cgroup_switch(next);
}
-/*
- * Called with IRQs disabled
- */
-static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
-{
- ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
-}
-
static bool perf_less_group_idx(const void *l, const void *r)
{
const struct perf_event *le = *(const struct perf_event **)l;
@@ -3667,21 +3666,39 @@ static void __heap_add(struct min_heap *heap, struct perf_event *event)
}
}
-static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
+static void __link_epc(struct perf_event_pmu_context *pmu_ctx)
+{ /*
+ * Record @pmu_ctx as the task-side PMU context (task_epc) in this CPU's
+ * perf_cpu_pmu_context for the same PMU. Does nothing when the owning
+ * perf_event_context has no task.
+ */
+ struct perf_cpu_pmu_context *cpc;
+
+ if (!pmu_ctx->ctx->task)
+ return;
+
+ cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); /* a different task_epc is already linked */
+ cpc->task_epc = pmu_ctx;
+}
+
+static noinline int visit_groups_merge(struct perf_event_context *ctx,
struct perf_event_groups *groups, int cpu,
+ struct pmu *pmu,
int (*func)(struct perf_event *, void *),
void *data)
{
#ifdef CONFIG_CGROUP_PERF
struct cgroup_subsys_state *css = NULL;
#endif
+ struct perf_cpu_context *cpuctx = NULL;
/* Space for per CPU and/or any CPU event iterators. */
struct perf_event *itrs[2];
struct min_heap event_heap;
struct perf_event **evt;
int ret;
- if (cpuctx) {
+ if (pmu->filter && pmu->filter(pmu, cpu))
+ return 0;
+
+ if (!ctx->task) {
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
event_heap = (struct min_heap){
.data = cpuctx->heap,
.nr = 0,
@@ -3701,17 +3718,22 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
.size = ARRAY_SIZE(itrs),
};
/* Events not within a CPU context may be on any CPU. */
- __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
+ __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL));
}
evt = event_heap.data;
- __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL));
#ifdef CONFIG_CGROUP_PERF
for (; css; css = css->parent)
- __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup));
#endif
+ if (event_heap.nr) {
+ __link_epc((*evt)->pmu_ctx);
+ perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);
+ }
+
min_heapify_all(&event_heap, &perf_min_heap);
while (event_heap.nr) {
@@ -3719,7 +3741,7 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
if (ret)
return ret;
- *evt = perf_event_groups_next(*evt);
+ *evt = perf_event_groups_next(*evt, pmu);
if (*evt)
min_heapify(&event_heap, 0, &perf_min_heap);
else
@@ -3761,7 +3783,6 @@ static inline void group_update_userpage(struct perf_event *group_event)
static int merge_sched_in(struct perf_event *event, void *data)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
int *can_add_hw = data;
if (event->state <= PERF_EVENT_STATE_OFF)
@@ -3770,8 +3791,8 @@ static int merge_sched_in(struct perf_event *event, void *data)
if (!event_filter_match(event))
return 0;
- if (group_can_go_on(event, cpuctx, *can_add_hw)) {
- if (!group_sched_in(event, cpuctx, ctx))
+ if (group_can_go_on(event, *can_add_hw)) {
+ if (!group_sched_in(event, ctx))
list_add_tail(&event->active_list, get_event_list(event));
}
@@ -3781,8 +3802,11 @@ static int merge_sched_in(struct perf_event *event, void *data)
perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
} else {
- ctx->rotate_necessary = 1;
- perf_mux_hrtimer_restart(cpuctx);
+ struct perf_cpu_pmu_context *cpc;
+
+ event->pmu_ctx->rotate_necessary = 1;
+ cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context);
+ perf_mux_hrtimer_restart(cpc);
group_update_userpage(event);
}
}
@@ -3790,39 +3814,53 @@ static int merge_sched_in(struct perf_event *event, void *data)
return 0;
}
-static void
-ctx_pinned_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
{
+ struct perf_event_pmu_context *pmu_ctx;
int can_add_hw = 1;
- if (ctx != &cpuctx->ctx)
- cpuctx = NULL;
-
- visit_groups_merge(cpuctx, &ctx->pinned_groups,
- smp_processor_id(),
- merge_sched_in, &can_add_hw);
+ if (pmu) {
+ visit_groups_merge(ctx, &ctx->pinned_groups,
+ smp_processor_id(), pmu,
+ merge_sched_in, &can_add_hw);
+ } else {
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ can_add_hw = 1;
+ visit_groups_merge(ctx, &ctx->pinned_groups,
+ smp_processor_id(), pmu_ctx->pmu,
+ merge_sched_in, &can_add_hw);
+ }
+ }
}
-static void
-ctx_flexible_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
{
+ struct perf_event_pmu_context *pmu_ctx;
int can_add_hw = 1;
- if (ctx != &cpuctx->ctx)
- cpuctx = NULL;
+ if (pmu) {
+ visit_groups_merge(ctx, &ctx->flexible_groups,
+ smp_processor_id(), pmu,
+ merge_sched_in, &can_add_hw);
+ } else {
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ can_add_hw = 1;
+ visit_groups_merge(ctx, &ctx->flexible_groups,
+ smp_processor_id(), pmu_ctx->pmu,
+ merge_sched_in, &can_add_hw);
+ }
+ }
+}
- visit_groups_merge(cpuctx, &ctx->flexible_groups,
- smp_processor_id(),
- merge_sched_in, &can_add_hw);
+static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+{
+ ctx_flexible_sched_in(ctx, pmu); /* only the flexible groups are scheduled in by this helper */
}
static void
-ctx_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
int is_active = ctx->is_active;
lockdep_assert_held(&ctx->lock);
@@ -3856,39 +3894,32 @@ ctx_sched_in(struct perf_event_context *ctx,
* in order to give them the best chance of going on.
*/
if (is_active & EVENT_PINNED)
- ctx_pinned_sched_in(ctx, cpuctx);
+ ctx_pinned_sched_in(ctx, NULL);
/* Then walk through the lower prio flexible groups */
if (is_active & EVENT_FLEXIBLE)
- ctx_flexible_sched_in(ctx, cpuctx);
+ ctx_flexible_sched_in(ctx, NULL);
}
-static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+static void perf_event_context_sched_in(struct task_struct *task)
{
- struct perf_event_context *ctx = &cpuctx->ctx;
-
- ctx_sched_in(ctx, cpuctx, event_type);
-}
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_context *ctx;
-static void perf_event_context_sched_in(struct perf_event_context *ctx,
- struct task_struct *task)
-{
- struct perf_cpu_context *cpuctx;
- struct pmu *pmu;
+ rcu_read_lock();
+ ctx = rcu_dereference(task->perf_event_ctxp);
+ if (!ctx)
+ goto rcu_unlock;
- cpuctx = __get_cpu_context(ctx);
+ if (cpuctx->task_ctx == ctx) {
+ perf_ctx_lock(cpuctx, ctx);
+ perf_ctx_disable(ctx);
- /*
- * HACK: for HETEROGENEOUS the task context might have switched to a
- * different PMU, force (re)set the context,
- */
- pmu = ctx->pmu = cpuctx->ctx.pmu;
+ perf_ctx_sched_task_cb(ctx, true);
- if (cpuctx->task_ctx == ctx) {
- if (cpuctx->sched_cb_usage)
- __perf_pmu_sched_task(cpuctx, true);
- return;
+ perf_ctx_enable(ctx);
+ perf_ctx_unlock(cpuctx, ctx);
+ goto rcu_unlock;
}
perf_ctx_lock(cpuctx, ctx);
@@ -3899,7 +3930,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
if (!ctx->nr_events)
goto unlock;
- perf_pmu_disable(pmu);
+ perf_ctx_disable(ctx);
/*
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
@@ -3908,17 +3939,24 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
* However, if task's ctx is not carrying any pinned
* events, no need to flip the cpuctx's events around.
*/
- if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
+ perf_ctx_disable(&cpuctx->ctx);
+ ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+ }
+
perf_event_sched_in(cpuctx, ctx);
- if (cpuctx->sched_cb_usage && pmu->sched_task)
- pmu->sched_task(cpuctx->task_ctx, true);
+ perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
- perf_pmu_enable(pmu);
+ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
+ perf_ctx_enable(&cpuctx->ctx);
+
+ perf_ctx_enable(ctx);
unlock:
perf_ctx_unlock(cpuctx, ctx);
+rcu_unlock:
+ rcu_read_unlock();
}
/*
@@ -3935,16 +3973,7 @@ unlock:
void __perf_event_task_sched_in(struct task_struct *prev,
struct task_struct *task)
{
- struct perf_event_context *ctx;
- int ctxn;
-
- for_each_task_context_nr(ctxn) {
- ctx = task->perf_event_ctxp[ctxn];
- if (likely(!ctx))
- continue;
-
- perf_event_context_sched_in(ctx, task);
- }
+ perf_event_context_sched_in(task);
if (atomic_read(&nr_switch_events))
perf_event_switch(task, prev, true);
@@ -4063,8 +4092,8 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
* events. At the same time, make sure, having freq events does not change
* the rate of unthrottling as that would introduce bias.
*/
-static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
- int needs_unthr)
+static void
+perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
{
struct perf_event *event;
struct hw_perf_event *hwc;
@@ -4076,16 +4105,16 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
* - context have events in frequency mode (needs freq adjust)
* - there are events to unthrottle on this cpu
*/
- if (!(ctx->nr_freq || needs_unthr))
+ if (!(ctx->nr_freq || unthrottle))
return;
raw_spin_lock(&ctx->lock);
- perf_pmu_disable(ctx->pmu);
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
+ // XXX use visit thingy to avoid the -1,cpu match
if (!event_filter_match(event))
continue;
@@ -4126,7 +4155,6 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
perf_pmu_enable(event->pmu);
}
- perf_pmu_enable(ctx->pmu);
raw_spin_unlock(&ctx->lock);
}
@@ -4148,72 +4176,109 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
/* pick an event from the flexible_groups to rotate */
static inline struct perf_event *
-ctx_event_to_rotate(struct perf_event_context *ctx)
+ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx)
{
struct perf_event *event;
+ struct rb_node *node;
+ struct rb_root *tree;
+ struct __group_key key = {
+ .pmu = pmu_ctx->pmu,
+ };
/* pick the first active flexible event */
- event = list_first_entry_or_null(&ctx->flexible_active,
+ event = list_first_entry_or_null(&pmu_ctx->flexible_active,
struct perf_event, active_list);
+ if (event)
+ goto out;
/* if no active flexible event, pick the first event */
- if (!event) {
- event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
- typeof(*event), group_node);
+ tree = &pmu_ctx->ctx->flexible_groups.tree;
+
+ if (!pmu_ctx->ctx->task) {
+ key.cpu = smp_processor_id();
+
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node)
+ event = __node_2_pe(node);
+ goto out;
}
+ key.cpu = -1;
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node) {
+ event = __node_2_pe(node);
+ goto out;
+ }
+
+ key.cpu = smp_processor_id();
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node)
+ event = __node_2_pe(node);
+
+out:
/*
* Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
* finds there are unschedulable events, it will set it again.
*/
- ctx->rotate_necessary = 0;
+ pmu_ctx->rotate_necessary = 0;
return event;
}
-static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
+static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *cpu_epc, *task_epc = NULL;
struct perf_event *cpu_event = NULL, *task_event = NULL;
- struct perf_event_context *task_ctx = NULL;
int cpu_rotate, task_rotate;
+ struct pmu *pmu;
/*
* Since we run this from IRQ context, nobody can install new
* events, thus the event count values are stable.
*/
- cpu_rotate = cpuctx->ctx.rotate_necessary;
- task_ctx = cpuctx->task_ctx;
- task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
+ cpu_epc = &cpc->epc;
+ pmu = cpu_epc->pmu;
+ task_epc = cpc->task_epc;
+
+ cpu_rotate = cpu_epc->rotate_necessary;
+ task_rotate = task_epc ? task_epc->rotate_necessary : 0;
if (!(cpu_rotate || task_rotate))
return false;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
+ perf_pmu_disable(pmu);
if (task_rotate)
- task_event = ctx_event_to_rotate(task_ctx);
+ task_event = ctx_event_to_rotate(task_epc);
if (cpu_rotate)
- cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
+ cpu_event = ctx_event_to_rotate(cpu_epc);
/*
* As per the order given at ctx_resched() first 'pop' task flexible
* and then, if needed CPU flexible.
*/
- if (task_event || (task_ctx && cpu_event))
- ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
- if (cpu_event)
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ if (task_event || (task_epc && cpu_event)) {
+ update_context_time(task_epc->ctx);
+ __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE);
+ }
- if (task_event)
- rotate_ctx(task_ctx, task_event);
- if (cpu_event)
+ if (cpu_event) {
+ update_context_time(&cpuctx->ctx);
+ __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
rotate_ctx(&cpuctx->ctx, cpu_event);
+ __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
+ }
- perf_event_sched_in(cpuctx, task_ctx);
+ if (task_event)
+ rotate_ctx(task_epc->ctx, task_event);
+
+ if (task_event || (task_epc && cpu_event))
+ __pmu_ctx_sched_in(task_epc->ctx, pmu);
- perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
return true;
@@ -4221,8 +4286,8 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
void perf_event_task_tick(void)
{
- struct list_head *head = this_cpu_ptr(&active_ctx_list);
- struct perf_event_context *ctx, *tmp;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_context *ctx;
int throttled;
lockdep_assert_irqs_disabled();
@@ -4231,8 +4296,13 @@ void perf_event_task_tick(void)
throttled = __this_cpu_xchg(perf_throttled_count, 0);
tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
- list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
- perf_adjust_freq_unthr_context(ctx, throttled);
+ perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled);
+
+ rcu_read_lock();
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
+ perf_adjust_freq_unthr_context(ctx, !!throttled);
+ rcu_read_unlock();
}
static int event_enable_on_exec(struct perf_event *event,
@@ -4254,9 +4324,9 @@ static int event_enable_on_exec(struct perf_event *event,
* Enable all of a task's events that have been marked enable-on-exec.
* This expects task == current.
*/
-static void perf_event_enable_on_exec(int ctxn)
+static void perf_event_enable_on_exec(struct perf_event_context *ctx)
{
- struct perf_event_context *ctx, *clone_ctx = NULL;
+ struct perf_event_context *clone_ctx = NULL;
enum event_type_t event_type = 0;
struct perf_cpu_context *cpuctx;
struct perf_event *event;
@@ -4264,13 +4334,16 @@ static void perf_event_enable_on_exec(int ctxn)
int enabled = 0;
local_irq_save(flags);
- ctx = current->perf_event_ctxp[ctxn];
- if (!ctx || !ctx->nr_events)
+ if (WARN_ON_ONCE(current->perf_event_ctxp != ctx))
+ goto out;
+
+ if (!ctx->nr_events)
goto out;
- cpuctx = __get_cpu_context(ctx);
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
perf_ctx_lock(cpuctx, ctx);
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
+
list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
event_type |= get_event_type(event);
@@ -4283,7 +4356,7 @@ static void perf_event_enable_on_exec(int ctxn)
clone_ctx = unclone_ctx(ctx);
ctx_resched(cpuctx, ctx, event_type);
} else {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_in(ctx, EVENT_TIME);
}
perf_ctx_unlock(cpuctx, ctx);
@@ -4302,17 +4375,13 @@ static void perf_event_exit_event(struct perf_event *event,
* Removes all events from the current task that have been marked
* remove-on-exec, and feeds their values back to parent events.
*/
-static void perf_event_remove_on_exec(int ctxn)
+static void perf_event_remove_on_exec(struct perf_event_context *ctx)
{
- struct perf_event_context *ctx, *clone_ctx = NULL;
+ struct perf_event_context *clone_ctx = NULL;
struct perf_event *event, *next;
unsigned long flags;
bool modified = false;
- ctx = perf_pin_task_context(current, ctxn);
- if (!ctx)
- return;
-
mutex_lock(&ctx->mutex);
if (WARN_ON_ONCE(ctx->task != current))
@@ -4333,13 +4402,11 @@ static void perf_event_remove_on_exec(int ctxn)
raw_spin_lock_irqsave(&ctx->lock, flags);
if (modified)
clone_ctx = unclone_ctx(ctx);
- --ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
unlock:
mutex_unlock(&ctx->mutex);
- put_ctx(ctx);
if (clone_ctx)
put_ctx(clone_ctx);
}
@@ -4375,7 +4442,7 @@ static void __perf_event_read(void *info)
struct perf_read_data *data = info;
struct perf_event *sub, *event = data->event;
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct pmu *pmu = event->pmu;
/*
@@ -4601,17 +4668,25 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
{
raw_spin_lock_init(&ctx->lock);
mutex_init(&ctx->mutex);
- INIT_LIST_HEAD(&ctx->active_ctx_list);
+ INIT_LIST_HEAD(&ctx->pmu_ctx_list);
perf_event_groups_init(&ctx->pinned_groups);
perf_event_groups_init(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
- INIT_LIST_HEAD(&ctx->pinned_active);
- INIT_LIST_HEAD(&ctx->flexible_active);
refcount_set(&ctx->refcount, 1);
}
+static void
+__perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu)
+{ /*
+ * Set up a fresh perf_event_pmu_context for @pmu: empty list heads and
+ * a reference count of one. The epc is not attached to any
+ * perf_event_context here (epc->ctx is left untouched).
+ */
+ epc->pmu = pmu;
+ INIT_LIST_HEAD(&epc->pmu_ctx_entry);
+ INIT_LIST_HEAD(&epc->pinned_active);
+ INIT_LIST_HEAD(&epc->flexible_active);
+ atomic_set(&epc->refcount, 1);
+}
+
static struct perf_event_context *
-alloc_perf_context(struct pmu *pmu, struct task_struct *task)
+alloc_perf_context(struct task_struct *task)
{
struct perf_event_context *ctx;
@@ -4622,7 +4697,6 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task)
__perf_event_init_context(ctx);
if (task)
ctx->task = get_task_struct(task);
- ctx->pmu = pmu;
return ctx;
}
@@ -4651,15 +4725,12 @@ find_lively_task_by_vpid(pid_t vpid)
* Returns a matching context with refcount and pincount.
*/
static struct perf_event_context *
-find_get_context(struct pmu *pmu, struct task_struct *task,
- struct perf_event *event)
+find_get_context(struct task_struct *task, struct perf_event *event)
{
struct perf_event_context *ctx, *clone_ctx = NULL;
struct perf_cpu_context *cpuctx;
- void *task_ctx_data = NULL;
unsigned long flags;
- int ctxn, err;
- int cpu = event->cpu;
+ int err;
if (!task) {
/* Must be root to operate on a CPU event: */
@@ -4667,7 +4738,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
if (err)
return ERR_PTR(err);
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
ctx = &cpuctx->ctx;
get_ctx(ctx);
raw_spin_lock_irqsave(&ctx->lock, flags);
@@ -4678,43 +4749,22 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
}
err = -EINVAL;
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto errout;
-
- if (event->attach_state & PERF_ATTACH_TASK_DATA) {
- task_ctx_data = alloc_task_ctx_data(pmu);
- if (!task_ctx_data) {
- err = -ENOMEM;
- goto errout;
- }
- }
-
retry:
- ctx = perf_lock_task_context(task, ctxn, &flags);
+ ctx = perf_lock_task_context(task, &flags);
if (ctx) {
clone_ctx = unclone_ctx(ctx);
++ctx->pin_count;
- if (task_ctx_data && !ctx->task_ctx_data) {
- ctx->task_ctx_data = task_ctx_data;
- task_ctx_data = NULL;
- }
raw_spin_unlock_irqrestore(&ctx->lock, flags);
if (clone_ctx)
put_ctx(clone_ctx);
} else {
- ctx = alloc_perf_context(pmu, task);
+ ctx = alloc_perf_context(task);
err = -ENOMEM;
if (!ctx)
goto errout;
- if (task_ctx_data) {
- ctx->task_ctx_data = task_ctx_data;
- task_ctx_data = NULL;
- }
-
err = 0;
mutex_lock(&task->perf_event_mutex);
/*
@@ -4723,12 +4773,12 @@ retry:
*/
if (task->flags & PF_EXITING)
err = -ESRCH;
- else if (task->perf_event_ctxp[ctxn])
+ else if (task->perf_event_ctxp)
err = -EAGAIN;
else {
get_ctx(ctx);
++ctx->pin_count;
- rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+ rcu_assign_pointer(task->perf_event_ctxp, ctx);
}
mutex_unlock(&task->perf_event_mutex);
@@ -4741,21 +4791,146 @@ retry:
}
}
- free_task_ctx_data(pmu, task_ctx_data);
return ctx;
errout:
- free_task_ctx_data(pmu, task_ctx_data);
return ERR_PTR(err);
}
+static struct perf_event_pmu_context *
+find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
+ struct perf_event *event)
+{ /*
+ * Look up or create the perf_event_pmu_context (epc) for @pmu within
+ * @ctx and return it with a reference held for the caller.
+ *
+ * CPU context (@ctx->task is NULL): the epc embedded in @pmu's per-CPU
+ * perf_cpu_pmu_context for @event->cpu is used; nothing is allocated.
+ * Task context: an existing epc for @pmu on @ctx->pmu_ctx_list is
+ * reused, otherwise a newly allocated one is linked in.
+ *
+ * Returns the epc, or ERR_PTR(-ENOMEM) if an allocation fails.
+ */
+ struct perf_event_pmu_context *new = NULL, *epc;
+ void *task_ctx_data = NULL;
+
+ if (!ctx->task) {
+ struct perf_cpu_pmu_context *cpc;
+
+ cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
+ epc = &cpc->epc;
+
+ if (!epc->ctx) { /*
+ * First user: mark the epc embedded and attach it to @ctx.
+ * NOTE(review): this test runs before ctx->lock is taken;
+ * confirm that callers serialize concurrent first use.
+ */
+ atomic_set(&epc->refcount, 1);
+ epc->embedded = 1;
+ raw_spin_lock_irq(&ctx->lock);
+ list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+ epc->ctx = ctx;
+ raw_spin_unlock_irq(&ctx->lock);
+ } else {
+ WARN_ON_ONCE(epc->ctx != ctx);
+ atomic_inc(&epc->refcount);
+ }
+
+ return epc;
+ }
+
+ new = kzalloc(sizeof(*epc), GFP_KERNEL); /* allocated up front, before ctx->lock is taken */
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+
+ if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+ task_ctx_data = alloc_task_ctx_data(pmu);
+ if (!task_ctx_data) {
+ kfree(new);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+
+ __perf_init_event_pmu_context(new, pmu);
+
+ /*
+ * XXX
+ *
+ * lockdep_assert_held(&ctx->mutex);
+ *
+ * can't because perf_event_init_task() doesn't actually hold the
+ * child_ctx->mutex.
+ */
+
+ raw_spin_lock_irq(&ctx->lock);
+ list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (epc->pmu == pmu) { /* reuse the epc already present for this PMU */
+ WARN_ON_ONCE(epc->ctx != ctx);
+ atomic_inc(&epc->refcount);
+ goto found_epc;
+ }
+ }
+
+ epc = new; /* no match: install the new epc; clearing 'new' keeps it from being freed below */
+ new = NULL;
+
+ list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+ epc->ctx = ctx;
+
+found_epc:
+ if (task_ctx_data && !epc->task_ctx_data) { /* hand the data over only if the epc has none yet */
+ epc->task_ctx_data = task_ctx_data;
+ task_ctx_data = NULL;
+ ctx->nr_task_data++;
+ }
+ raw_spin_unlock_irq(&ctx->lock);
+
+ free_task_ctx_data(pmu, task_ctx_data); /* both pointers are NULL here if they were installed above */
+ kfree(new);
+
+ return epc;
+}
+
+static void get_pmu_ctx(struct perf_event_pmu_context *epc)
+{ /* Take one more reference on @epc; warns once if the count had already dropped to zero. */
+ WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
+}
+
+static void free_epc_rcu(struct rcu_head *head)
+{ /* RCU callback (queued via call_rcu() in put_pmu_ctx()): frees the epc and its task_ctx_data. */
+ struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head);
+
+ kfree(epc->task_ctx_data);
+ kfree(epc);
+}
+
+static void put_pmu_ctx(struct perf_event_pmu_context *epc)
+{ /*
+ * Drop one reference on @epc. On the final put the epc is unlinked
+ * from its perf_event_context (if any) under ctx->lock and, unless it
+ * is an embedded one, freed via call_rcu().
+ *
+ * NOTE(review): the count reaches zero before ctx->lock is taken, so
+ * the epc is briefly still on ctx->pmu_ctx_list with refcount 0 --
+ * verify that a concurrent lookup cannot pick it up in that window.
+ */
+ unsigned long flags;
+
+ if (!atomic_dec_and_test(&epc->refcount)) /* not the last reference */
+ return;
+
+ if (epc->ctx) {
+ struct perf_event_context *ctx = epc->ctx;
+
+ /*
+ * XXX
+ *
+ * lockdep_assert_held(&ctx->mutex);
+ *
+ * can't because of the call-site in _free_event()/put_event()
+ * which isn't always called under ctx->mutex.
+ */
+
+ WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
+ raw_spin_lock_irqsave(&ctx->lock, flags);
+ list_del_init(&epc->pmu_ctx_entry);
+ epc->ctx = NULL;
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+ }
+
+ WARN_ON_ONCE(!list_empty(&epc->pinned_active));
+ WARN_ON_ONCE(!list_empty(&epc->flexible_active));
+
+ if (epc->embedded) /* embedded epcs are never passed to call_rcu() for freeing */
+ return;
+
+ call_rcu(&epc->rcu_head, free_epc_rcu);
+}
+
static void perf_event_free_filter(struct perf_event *event);
static void free_event_rcu(struct rcu_head *head)
{
- struct perf_event *event;
+ struct perf_event *event = container_of(head, typeof(*event), rcu_head);
- event = container_of(head, struct perf_event, rcu_head);
if (event->ns)
put_pid_ns(event->ns);
perf_event_free_filter(event);
@@ -4893,7 +5068,7 @@ static void perf_sched_delayed(struct work_struct *work)
*
* 1) cpu-wide events in the presence of per-task events,
* 2) per-task events in the presence of cpu-wide events,
- * 3) two matching events on the same context.
+ * 3) two matching events on the same perf_event_context.
*
* The former two cases are handled in the allocation path (perf_event_alloc(),
* _free_event()), the latter -- before the first perf_install_in_context().
@@ -5017,6 +5192,9 @@ static void _free_event(struct perf_event *event)
if (event->hw.target)
put_task_struct(event->hw.target);
+ if (event->pmu_ctx)
+ put_pmu_ctx(event->pmu_ctx);
+
/*
* perf_event_free_task() relies on put_ctx() being 'last', in particular
* all task references must be cleaned up.
@@ -5117,8 +5295,8 @@ int perf_event_release_kernel(struct perf_event *event)
LIST_HEAD(free_list);
/*
- * If we got here through err_file: fput(event_file); we will not have
- * attached to a context yet.
+ * If we got here through err_alloc: free_event(event); we will not
+ * have attached to a context yet.
*/
if (!ctx) {
WARN_ON_ONCE(event->attach_state &
@@ -5550,7 +5728,7 @@ static void __perf_event_period(struct perf_event *event,
active = (event->state == PERF_EVENT_STATE_ACTIVE);
if (active) {
- perf_pmu_disable(ctx->pmu);
+ perf_pmu_disable(event->pmu);
/*
* We could be throttled; unthrottle now to avoid the tick
* trying to unthrottle while we already re-started the event.
@@ -5566,7 +5744,7 @@ static void __perf_event_period(struct perf_event *event,
if (active) {
event->pmu->start(event, PERF_EF_RELOAD);
- perf_pmu_enable(ctx->pmu);
+ perf_pmu_enable(event->pmu);
}
}
@@ -7729,7 +7907,6 @@ perf_iterate_sb(perf_iterate_f output, void *data,
struct perf_event_context *task_ctx)
{
struct perf_event_context *ctx;
- int ctxn;
rcu_read_lock();
preempt_disable();
@@ -7746,11 +7923,9 @@ perf_iterate_sb(perf_iterate_f output, void *data,
perf_iterate_sb_cpu(output, data);
- for_each_task_context_nr(ctxn) {
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx)
- perf_iterate_ctx(ctx, output, data, false);
- }
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
+ perf_iterate_ctx(ctx, output, data, false);
done:
preempt_enable();
rcu_read_unlock();
@@ -7792,20 +7967,17 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
void perf_event_exec(void)
{
struct perf_event_context *ctx;
- int ctxn;
- for_each_task_context_nr(ctxn) {
- perf_event_enable_on_exec(ctxn);
- perf_event_remove_on_exec(ctxn);
+ ctx = perf_pin_task_context(current);
+ if (!ctx)
+ return;
- rcu_read_lock();
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx) {
- perf_iterate_ctx(ctx, perf_event_addr_filters_exec,
- NULL, true);
- }
- rcu_read_unlock();
- }
+ perf_event_enable_on_exec(ctx);
+ perf_event_remove_on_exec(ctx);
+ perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true);
+
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
}
struct remote_output {
@@ -7845,8 +8017,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data)
static int __perf_pmu_output_stop(void *info)
{
struct perf_event *event = info;
- struct pmu *pmu = event->ctx->pmu;
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct remote_output ro = {
.rb = event->rb,
};
@@ -8635,7 +8806,6 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
static void perf_addr_filters_adjust(struct vm_area_struct *vma)
{
struct perf_event_context *ctx;
- int ctxn;
/*
* Data tracing isn't supported yet and as such there is no need
@@ -8645,13 +8815,9 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
return;
rcu_read_lock();
- for_each_task_context_nr(ctxn) {
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (!ctx)
- continue;
-
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
- }
rcu_read_unlock();
}
@@ -9826,6 +9992,44 @@ static struct pmu perf_swevent = {
#ifdef CONFIG_EVENT_TRACING
+static void tp_perf_event_destroy(struct perf_event *event)
+{ /* event->destroy hook installed by perf_tp_event_init(); forwards to perf_trace_destroy(). */
+ perf_trace_destroy(event);
+}
+
+static int perf_tp_event_init(struct perf_event *event)
+{ /*
+ * event_init callback of the tracepoint PMU.
+ *
+ * Returns -ENOENT when the event is not PERF_TYPE_TRACEPOINT,
+ * -EOPNOTSUPP when branch-stack sampling is requested, the error from
+ * perf_trace_init() if that fails, and 0 on success (with the destroy
+ * hook installed).
+ */
+ int err;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for tracepoint events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ err = perf_trace_init(event);
+ if (err)
+ return err;
+
+ event->destroy = tp_perf_event_destroy;
+
+ return 0;
+}
+
+static struct pmu perf_tracepoint = { /* PMU descriptor for tracepoint events; placed in the software context class */
+ .task_ctx_nr = perf_sw_context,
+
+ .event_init = perf_tp_event_init,
+ .add = perf_trace_add,
+ .del = perf_trace_del,
+ .start = perf_swevent_start, /* start/stop/read reuse the software-event callbacks */
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+};
+
static int perf_tp_filter_match(struct perf_event *event,
struct perf_sample_data *data)
{
@@ -9875,6 +10079,44 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
+static void __perf_tp_event_target_task(u64 count, void *record,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ struct perf_event *event)
+{ /*
+ * Deliver one tracepoint hit to @event, provided its attr.config
+ * equals the type of the trace entry in @record, it does not use
+ * sigtrap, and perf_tp_event_match() accepts it.
+ */
+ struct trace_entry *entry = record;
+
+ if (event->attr.config != entry->type)
+ return;
+ /* Cannot deliver synchronous signal to other task. */
+ if (event->attr.sigtrap)
+ return;
+ if (perf_tp_event_match(event, data, regs))
+ perf_swevent_event(event, count, data, regs);
+}
+
+static void perf_tp_event_target_task(u64 count, void *record,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ struct perf_event_context *ctx)
+{ /*
+ * Offer a tracepoint hit to the tracepoint-PMU events of @ctx, walking
+ * first the pinned and then the flexible groups, and within each group
+ * the visited event followed by its siblings. The call site in
+ * perf_tp_event() holds ctx->lock around this call.
+ * NOTE(review): perf_event_groups_for_cpu_pmu() presumably restricts
+ * the walk to groups keyed by (this CPU, perf_tracepoint) -- confirm
+ * against the macro definition.
+ */
+ unsigned int cpu = smp_processor_id();
+ struct pmu *pmu = &perf_tracepoint;
+ struct perf_event *event, *sibling;
+
+ perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) {
+ __perf_tp_event_target_task(count, record, regs, data, event);
+ for_each_sibling_event(sibling, event)
+ __perf_tp_event_target_task(count, record, regs, data, sibling);
+ }
+
+ perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) {
+ __perf_tp_event_target_task(count, record, regs, data, event);
+ for_each_sibling_event(sibling, event)
+ __perf_tp_event_target_task(count, record, regs, data, sibling);
+ }
+}
+
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
struct task_struct *task)
@@ -9906,26 +10148,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
*/
if (task && task != current) {
struct perf_event_context *ctx;
- struct trace_entry *entry = record;
rcu_read_lock();
- ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
+ ctx = rcu_dereference(task->perf_event_ctxp);
if (!ctx)
goto unlock;
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (event->cpu != smp_processor_id())
- continue;
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- continue;
- if (event->attr.config != entry->type)
- continue;
- /* Cannot deliver synchronous signal to other task. */
- if (event->attr.sigtrap)
- continue;
- if (perf_tp_event_match(event, &data, regs))
- perf_swevent_event(event, count, &data, regs);
- }
+ raw_spin_lock(&ctx->lock);
+ perf_tp_event_target_task(count, record, regs, &data, ctx);
+ raw_spin_unlock(&ctx->lock);
unlock:
rcu_read_unlock();
}
@@ -9934,44 +10165,6 @@ unlock:
}
EXPORT_SYMBOL_GPL(perf_tp_event);
-static void tp_perf_event_destroy(struct perf_event *event)
-{
- perf_trace_destroy(event);
-}
-
-static int perf_tp_event_init(struct perf_event *event)
-{
- int err;
-
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -ENOENT;
-
- /*
- * no branch sampling for tracepoint events
- */
- if (has_branch_stack(event))
- return -EOPNOTSUPP;
-
- err = perf_trace_init(event);
- if (err)
- return err;
-
- event->destroy = tp_perf_event_destroy;
-
- return 0;
-}
-
-static struct pmu perf_tracepoint = {
- .task_ctx_nr = perf_sw_context,
-
- .event_init = perf_tp_event_init,
- .add = perf_trace_add,
- .del = perf_trace_del,
- .start = perf_swevent_start,
- .stop = perf_swevent_stop,
- .read = perf_swevent_read,
-};
-
#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
/*
* Flags in config, used by dynamic PMU kprobe and uprobe
@@ -11058,36 +11251,9 @@ static int perf_event_idx_default(struct perf_event *event)
return 0;
}
-/*
- * Ensures all contexts with the same task_ctx_nr have the same
- * pmu_cpu_context too.
- */
-static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
-{
- struct pmu *pmu;
-
- if (ctxn < 0)
- return NULL;
-
- list_for_each_entry(pmu, &pmus, entry) {
- if (pmu->task_ctx_nr == ctxn)
- return pmu->pmu_cpu_context;
- }
-
- return NULL;
-}
-
static void free_pmu_context(struct pmu *pmu)
{
- /*
- * Static contexts such as perf_sw_context have a global lifetime
- * and may be shared between different PMUs. Avoid freeing them
- * when a single PMU is going away.
- */
- if (pmu->task_ctx_nr > perf_invalid_context)
- return;
-
- free_percpu(pmu->pmu_cpu_context);
+ free_percpu(pmu->cpu_pmu_context);
}
/*
@@ -11151,12 +11317,11 @@ perf_event_mux_interval_ms_store(struct device *dev,
/* update all cpuctx for this PMU */
cpus_read_lock();
for_each_online_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+ struct perf_cpu_pmu_context *cpc;
+ cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
- cpu_function_call(cpu,
- (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
+ cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc);
}
cpus_read_unlock();
mutex_unlock(&mux_interval_mutex);
@@ -11193,13 +11358,15 @@ static int pmu_dev_alloc(struct pmu *pmu)
pmu->dev->groups = pmu->attr_groups;
device_initialize(pmu->dev);
- ret = dev_set_name(pmu->dev, "%s", pmu->name);
- if (ret)
- goto free_dev;
dev_set_drvdata(pmu->dev, pmu);
pmu->dev->bus = &pmu_bus;
pmu->dev->release = pmu_dev_release;
+
+ ret = dev_set_name(pmu->dev, "%s", pmu->name);
+ if (ret)
+ goto free_dev;
+
ret = device_add(pmu->dev);
if (ret)
goto free_dev;
@@ -11267,47 +11434,19 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
}
skip_type:
- if (pmu->task_ctx_nr == perf_hw_context) {
- static int hw_context_taken = 0;
-
- /*
- * Other than systems with heterogeneous CPUs, it never makes
- * sense for two PMUs to share perf_hw_context. PMUs which are
- * uncore must use perf_invalid_context.
- */
- if (WARN_ON_ONCE(hw_context_taken &&
- !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
- pmu->task_ctx_nr = perf_invalid_context;
-
- hw_context_taken = 1;
- }
-
- pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
- if (pmu->pmu_cpu_context)
- goto got_cpu_context;
-
ret = -ENOMEM;
- pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
- if (!pmu->pmu_cpu_context)
+ pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context);
+ if (!pmu->cpu_pmu_context)
goto free_dev;
for_each_possible_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
+ struct perf_cpu_pmu_context *cpc;
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- __perf_event_init_context(&cpuctx->ctx);
- lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
- lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
- cpuctx->ctx.pmu = pmu;
- cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
-
- __perf_mux_hrtimer_init(cpuctx, cpu);
-
- cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
- cpuctx->heap = cpuctx->heap_default;
+ cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ __perf_init_event_pmu_context(&cpc->epc, pmu);
+ __perf_mux_hrtimer_init(cpc, cpu);
}
-got_cpu_context:
if (!pmu->start_txn) {
if (pmu->pmu_enable) {
/*
@@ -11786,10 +11925,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
/*
- * Disallow uncore-cgroup events, they don't make sense as the cgroup will
- * be different on other CPUs in the uncore mask.
+ * Disallow uncore-task events. Similarly, disallow uncore-cgroup
+ * events (they don't make sense as the cgroup will be different
+ * on other CPUs in the uncore mask).
*/
- if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
+ if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) {
err = -EINVAL;
goto err_pmu;
}
@@ -12136,37 +12276,6 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
return 0;
}
-/*
- * Variation on perf_event_ctx_lock_nested(), except we take two context
- * mutexes.
- */
-static struct perf_event_context *
-__perf_event_ctx_lock_double(struct perf_event *group_leader,
- struct perf_event_context *ctx)
-{
- struct perf_event_context *gctx;
-
-again:
- rcu_read_lock();
- gctx = READ_ONCE(group_leader->ctx);
- if (!refcount_inc_not_zero(&gctx->refcount)) {
- rcu_read_unlock();
- goto again;
- }
- rcu_read_unlock();
-
- mutex_lock_double(&gctx->mutex, &ctx->mutex);
-
- if (group_leader->ctx != gctx) {
- mutex_unlock(&ctx->mutex);
- mutex_unlock(&gctx->mutex);
- put_ctx(gctx);
- goto again;
- }
-
- return gctx;
-}
-
static bool
perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
{
@@ -12212,9 +12321,10 @@ SYSCALL_DEFINE5(perf_event_open,
pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
struct perf_event *group_leader = NULL, *output_event = NULL;
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event *event, *sibling;
struct perf_event_attr attr;
- struct perf_event_context *ctx, *gctx;
+ struct perf_event_context *ctx;
struct file *event_file = NULL;
struct fd group = {NULL, 0};
struct task_struct *task = NULL;
@@ -12344,42 +12454,53 @@ SYSCALL_DEFINE5(perf_event_open,
if (pmu->task_ctx_nr == perf_sw_context)
event->event_caps |= PERF_EV_CAP_SOFTWARE;
- if (group_leader) {
- if (is_software_event(event) &&
- !in_software_context(group_leader)) {
- /*
- * If the event is a sw event, but the group_leader
- * is on hw context.
- *
- * Allow the addition of software events to hw
- * groups, this is safe because software events
- * never fail to schedule.
- */
- pmu = group_leader->ctx->pmu;
- } else if (!is_software_event(event) &&
- is_software_event(group_leader) &&
- (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
- /*
- * In case the group is a pure software group, and we
- * try to add a hardware event, move the whole group to
- * the hardware context.
- */
- move_group = 1;
- }
+ if (task) {
+ err = down_read_interruptible(&task->signal->exec_update_lock);
+ if (err)
+ goto err_alloc;
+
+ /*
+ * We must hold exec_update_lock across this and any potential
+ * perf_install_in_context() call for this new event to
+ * serialize against exec() altering our credentials (and the
+ * perf_event_exit_task() that could imply).
+ */
+ err = -EACCES;
+ if (!perf_check_permission(&attr, task))
+ goto err_cred;
}
/*
* Get the target context (task or percpu):
*/
- ctx = find_get_context(pmu, task, event);
+ ctx = find_get_context(task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
- goto err_alloc;
+ goto err_cred;
+ }
+
+ mutex_lock(&ctx->mutex);
+
+ if (ctx->task == TASK_TOMBSTONE) {
+ err = -ESRCH;
+ goto err_locked;
+ }
+
+ if (!task) {
+ /*
+ * Check if the @cpu we're creating an event for is online.
+ *
+ * We use the perf_cpu_context::ctx::mutex to serialize against
+ * the hotplug notifiers. See perf_event_{init,exit}_cpu().
+ */
+ struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
+
+ if (!cpuctx->online) {
+ err = -ENODEV;
+ goto err_locked;
+ }
}
- /*
- * Look up the group leader (we will attach this event to it):
- */
if (group_leader) {
err = -EINVAL;
@@ -12388,11 +12509,11 @@ SYSCALL_DEFINE5(perf_event_open,
* becoming part of another group-sibling):
*/
if (group_leader->group_leader != group_leader)
- goto err_context;
+ goto err_locked;
/* All events in a group should have the same clock */
if (group_leader->clock != event->clock)
- goto err_context;
+ goto err_locked;
/*
* Make sure we're both events for the same CPU;
@@ -12400,145 +12521,76 @@ SYSCALL_DEFINE5(perf_event_open,
* you can never concurrently schedule them anyhow.
*/
if (group_leader->cpu != event->cpu)
- goto err_context;
-
- /*
- * Make sure we're both on the same task, or both
- * per-CPU events.
- */
- if (group_leader->ctx->task != ctx->task)
- goto err_context;
+ goto err_locked;
/*
- * Do not allow to attach to a group in a different task
- * or CPU context. If we're moving SW events, we'll fix
- * this up later, so allow that.
- *
- * Racy, not holding group_leader->ctx->mutex, see comment with
- * perf_event_ctx_lock().
+ * Make sure we're both on the same context; either task or cpu.
*/
- if (!move_group && group_leader->ctx != ctx)
- goto err_context;
+ if (group_leader->ctx != ctx)
+ goto err_locked;
/*
* Only a group leader can be exclusive or pinned
*/
if (attr.exclusive || attr.pinned)
- goto err_context;
- }
-
- if (output_event) {
- err = perf_event_set_output(event, output_event);
- if (err)
- goto err_context;
- }
-
- event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
- f_flags);
- if (IS_ERR(event_file)) {
- err = PTR_ERR(event_file);
- event_file = NULL;
- goto err_context;
- }
-
- if (task) {
- err = down_read_interruptible(&task->signal->exec_update_lock);
- if (err)
- goto err_file;
-
- /*
- * We must hold exec_update_lock across this and any potential
- * perf_install_in_context() call for this new event to
- * serialize against exec() altering our credentials (and the
- * perf_event_exit_task() that could imply).
- */
- err = -EACCES;
- if (!perf_check_permission(&attr, task))
- goto err_cred;
- }
-
- if (move_group) {
- gctx = __perf_event_ctx_lock_double(group_leader, ctx);
-
- if (gctx->task == TASK_TOMBSTONE) {
- err = -ESRCH;
goto err_locked;
- }
- /*
- * Check if we raced against another sys_perf_event_open() call
- * moving the software group underneath us.
- */
- if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+ if (is_software_event(event) &&
+ !in_software_context(group_leader)) {
/*
- * If someone moved the group out from under us, check
- * if this new event wound up on the same ctx, if so
- * its the regular !move_group case, otherwise fail.
+ * If the event is a sw event, but the group_leader
+ * is on hw context.
+ *
+ * Allow the addition of software events to hw
+ * groups, this is safe because software events
+ * never fail to schedule.
+ *
+ * Note the comment that goes with struct
+ * perf_event_pmu_context.
*/
- if (gctx != ctx) {
- err = -EINVAL;
- goto err_locked;
- } else {
- perf_event_ctx_unlock(group_leader, gctx);
- move_group = 0;
- goto not_move_group;
+ pmu = group_leader->pmu_ctx->pmu;
+ } else if (!is_software_event(event)) {
+ if (is_software_event(group_leader) &&
+ (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+ /*
+ * In case the group is a pure software group, and we
+ * try to add a hardware event, move the whole group to
+ * the hardware context.
+ */
+ move_group = 1;
}
- }
-
- /*
- * Failure to create exclusive events returns -EBUSY.
- */
- err = -EBUSY;
- if (!exclusive_event_installable(group_leader, ctx))
- goto err_locked;
- for_each_sibling_event(sibling, group_leader) {
- if (!exclusive_event_installable(sibling, ctx))
+ /* Don't allow a group with hw events from different PMUs */
+ if (!in_software_context(group_leader) &&
+ group_leader->pmu_ctx->pmu != pmu)
goto err_locked;
}
- } else {
- mutex_lock(&ctx->mutex);
-
- /*
- * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx,
- * see the group_leader && !move_group test earlier.
- */
- if (group_leader && group_leader->ctx != ctx) {
- err = -EINVAL;
- goto err_locked;
- }
}
-not_move_group:
- if (ctx->task == TASK_TOMBSTONE) {
- err = -ESRCH;
+ /*
+ * Now that we're certain of the pmu, find the pmu_ctx.
+ */
+ pmu_ctx = find_get_pmu_context(pmu, ctx, event);
+ if (IS_ERR(pmu_ctx)) {
+ err = PTR_ERR(pmu_ctx);
goto err_locked;
}
+ event->pmu_ctx = pmu_ctx;
- if (!perf_event_validate_size(event)) {
- err = -E2BIG;
- goto err_locked;
+ if (output_event) {
+ err = perf_event_set_output(event, output_event);
+ if (err)
+ goto err_context;
}
- if (!task) {
- /*
- * Check if the @cpu we're creating an event for is online.
- *
- * We use the perf_cpu_context::ctx::mutex to serialize against
- * the hotplug notifiers. See perf_event_{init,exit}_cpu().
- */
- struct perf_cpu_context *cpuctx =
- container_of(ctx, struct perf_cpu_context, ctx);
-
- if (!cpuctx->online) {
- err = -ENODEV;
- goto err_locked;
- }
+ if (!perf_event_validate_size(event)) {
+ err = -E2BIG;
+ goto err_context;
}
if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
err = -EINVAL;
- goto err_locked;
+ goto err_context;
}
/*
@@ -12547,36 +12599,33 @@ not_move_group:
*/
if (!exclusive_event_installable(event, ctx)) {
err = -EBUSY;
- goto err_locked;
+ goto err_context;
}
WARN_ON_ONCE(ctx->parent_ctx);
+ event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags);
+ if (IS_ERR(event_file)) {
+ err = PTR_ERR(event_file);
+ event_file = NULL;
+ goto err_context;
+ }
+
/*
* This is the point on no return; we cannot fail hereafter. This is
* where we start modifying current state.
*/
if (move_group) {
- /*
- * See perf_event_ctx_lock() for comments on the details
- * of swizzling perf_event::ctx.
- */
perf_remove_from_context(group_leader, 0);
- put_ctx(gctx);
+ put_pmu_ctx(group_leader->pmu_ctx);
for_each_sibling_event(sibling, group_leader) {
perf_remove_from_context(sibling, 0);
- put_ctx(gctx);
+ put_pmu_ctx(sibling->pmu_ctx);
}
/*
- * Wait for everybody to stop referencing the events through
- * the old lists, before installing it on new lists.
- */
- synchronize_rcu();
-
- /*
* Install the group siblings before the group leader.
*
* Because a group leader will try and install the entire group
@@ -12587,9 +12636,10 @@ not_move_group:
* reachable through the group lists.
*/
for_each_sibling_event(sibling, group_leader) {
+ sibling->pmu_ctx = pmu_ctx;
+ get_pmu_ctx(pmu_ctx);
perf_event__state_init(sibling);
perf_install_in_context(ctx, sibling, sibling->cpu);
- get_ctx(ctx);
}
/*
@@ -12597,9 +12647,10 @@ not_move_group:
* event. What we want here is event in the initial
* startup state, ready to be add into new context.
*/
+ group_leader->pmu_ctx = pmu_ctx;
+ get_pmu_ctx(pmu_ctx);
perf_event__state_init(group_leader);
perf_install_in_context(ctx, group_leader, group_leader->cpu);
- get_ctx(ctx);
}
/*
@@ -12616,8 +12667,6 @@ not_move_group:
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
- if (move_group)
- perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
if (task) {
@@ -12639,25 +12688,17 @@ not_move_group:
fd_install(event_fd, event_file);
return event_fd;
+err_context:
+ /* event->pmu_ctx freed by free_event() */
err_locked:
- if (move_group)
- perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
err_cred:
if (task)
up_read(&task->signal->exec_update_lock);
-err_file:
- fput(event_file);
-err_context:
- perf_unpin_context(ctx);
- put_ctx(ctx);
err_alloc:
- /*
- * If event_file is set, the fput() above will have called ->release()
- * and that will take care of freeing the event.
- */
- if (!event_file)
- free_event(event);
+ free_event(event);
err_task:
if (task)
put_task_struct(task);
@@ -12683,8 +12724,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
perf_overflow_handler_t overflow_handler,
void *context)
{
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event_context *ctx;
struct perf_event *event;
+ struct pmu *pmu;
int err;
/*
@@ -12703,14 +12746,18 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
/* Mark owner so we could distinguish it from user events. */
event->owner = TASK_TOMBSTONE;
+ pmu = event->pmu;
+
+ if (pmu->task_ctx_nr == perf_sw_context)
+ event->event_caps |= PERF_EV_CAP_SOFTWARE;
/*
* Get the target context (task or percpu):
*/
- ctx = find_get_context(event->pmu, task, event);
+ ctx = find_get_context(task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
- goto err_free;
+ goto err_alloc;
}
WARN_ON_ONCE(ctx->parent_ctx);
@@ -12720,6 +12767,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err_unlock;
}
+ pmu_ctx = find_get_pmu_context(pmu, ctx, event);
+ if (IS_ERR(pmu_ctx)) {
+ err = PTR_ERR(pmu_ctx);
+ goto err_unlock;
+ }
+ event->pmu_ctx = pmu_ctx;
+
if (!task) {
/*
* Check if the @cpu we're creating an event for is online.
@@ -12731,13 +12785,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
container_of(ctx, struct perf_cpu_context, ctx);
if (!cpuctx->online) {
err = -ENODEV;
- goto err_unlock;
+ goto err_pmu_ctx;
}
}
if (!exclusive_event_installable(event, ctx)) {
err = -EBUSY;
- goto err_unlock;
+ goto err_pmu_ctx;
}
perf_install_in_context(ctx, event, event->cpu);
@@ -12746,44 +12800,61 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
return event;
+err_pmu_ctx:
+ put_pmu_ctx(pmu_ctx);
err_unlock:
mutex_unlock(&ctx->mutex);
perf_unpin_context(ctx);
put_ctx(ctx);
-err_free:
+err_alloc:
free_event(event);
err:
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
-void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+static void __perf_pmu_remove(struct perf_event_context *ctx,
+ int cpu, struct pmu *pmu,
+ struct perf_event_groups *groups,
+ struct list_head *events)
{
- struct perf_event_context *src_ctx;
- struct perf_event_context *dst_ctx;
- struct perf_event *event, *tmp;
- LIST_HEAD(events);
-
- src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
- dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
+ struct perf_event *event, *sibling;
- /*
- * See perf_event_ctx_lock() for comments on the details
- * of swizzling perf_event::ctx.
- */
- mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
- list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
- event_entry) {
+ perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
perf_remove_from_context(event, 0);
- unaccount_event_cpu(event, src_cpu);
- put_ctx(src_ctx);
- list_add(&event->migrate_entry, &events);
+ unaccount_event_cpu(event, cpu);
+ put_pmu_ctx(event->pmu_ctx);
+ list_add(&event->migrate_entry, events);
+
+ for_each_sibling_event(sibling, event) {
+ perf_remove_from_context(sibling, 0);
+ unaccount_event_cpu(sibling, cpu);
+ put_pmu_ctx(sibling->pmu_ctx);
+ list_add(&sibling->migrate_entry, events);
+ }
}
+}
- /*
- * Wait for the events to quiesce before re-instating them.
- */
- synchronize_rcu();
+static void __perf_pmu_install_event(struct pmu *pmu,
+ struct perf_event_context *ctx,
+ int cpu, struct perf_event *event)
+{
+ struct perf_event_pmu_context *epc;
+
+ event->cpu = cpu;
+ epc = find_get_pmu_context(pmu, ctx, event);
+ event->pmu_ctx = epc;
+
+ if (event->state >= PERF_EVENT_STATE_OFF)
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ account_event_cpu(event, cpu);
+ perf_install_in_context(ctx, event, cpu);
+}
+
+static void __perf_pmu_install(struct perf_event_context *ctx,
+ int cpu, struct pmu *pmu, struct list_head *events)
+{
+ struct perf_event *event, *tmp;
/*
* Re-instate events in 2 passes.
@@ -12793,30 +12864,48 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
* leader will enable its siblings, even if those are still on the old
* context.
*/
- list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ list_for_each_entry_safe(event, tmp, events, migrate_entry) {
if (event->group_leader == event)
continue;
list_del(&event->migrate_entry);
- if (event->state >= PERF_EVENT_STATE_OFF)
- event->state = PERF_EVENT_STATE_INACTIVE;
- account_event_cpu(event, dst_cpu);
- perf_install_in_context(dst_ctx, event, dst_cpu);
- get_ctx(dst_ctx);
+ __perf_pmu_install_event(pmu, ctx, cpu, event);
}
/*
* Once all the siblings are setup properly, install the group leaders
* to make it go.
*/
- list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ list_for_each_entry_safe(event, tmp, events, migrate_entry) {
list_del(&event->migrate_entry);
- if (event->state >= PERF_EVENT_STATE_OFF)
- event->state = PERF_EVENT_STATE_INACTIVE;
- account_event_cpu(event, dst_cpu);
- perf_install_in_context(dst_ctx, event, dst_cpu);
- get_ctx(dst_ctx);
+ __perf_pmu_install_event(pmu, ctx, cpu, event);
}
+}
+
+void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+{
+ struct perf_event_context *src_ctx, *dst_ctx;
+ LIST_HEAD(events);
+
+ src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
+ dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;
+
+ /*
+ * See perf_event_ctx_lock() for comments on the details
+ * of swizzling perf_event::ctx.
+ */
+ mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
+
+ __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events);
+ __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events);
+
+ /*
+ * Wait for the events to quiesce before re-instating them.
+ */
+ synchronize_rcu();
+
+ __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
+
mutex_unlock(&dst_ctx->mutex);
mutex_unlock(&src_ctx->mutex);
}
@@ -12896,14 +12985,14 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
perf_event_wakeup(event);
}
-static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
+static void perf_event_exit_task_context(struct task_struct *child)
{
struct perf_event_context *child_ctx, *clone_ctx = NULL;
struct perf_event *child_event, *next;
WARN_ON_ONCE(child != current);
- child_ctx = perf_pin_task_context(child, ctxn);
+ child_ctx = perf_pin_task_context(child);
if (!child_ctx)
return;
@@ -12925,13 +13014,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
* in.
*/
raw_spin_lock_irq(&child_ctx->lock);
- task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
+ task_ctx_sched_out(child_ctx, EVENT_ALL);
/*
* Now that the context is inactive, destroy the task <-> ctx relation
* and mark the context dead.
*/
- RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
+ RCU_INIT_POINTER(child->perf_event_ctxp, NULL);
put_ctx(child_ctx); /* cannot be last */
WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
put_task_struct(current); /* cannot be last */
@@ -12966,7 +13055,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
void perf_event_exit_task(struct task_struct *child)
{
struct perf_event *event, *tmp;
- int ctxn;
mutex_lock(&child->perf_event_mutex);
list_for_each_entry_safe(event, tmp, &child->perf_event_list,
@@ -12982,8 +13070,7 @@ void perf_event_exit_task(struct task_struct *child)
}
mutex_unlock(&child->perf_event_mutex);
- for_each_task_context_nr(ctxn)
- perf_event_exit_task_context(child, ctxn);
+ perf_event_exit_task_context(child);
/*
* The perf_event_exit_task_context calls perf_event_task
@@ -13026,56 +13113,51 @@ void perf_event_free_task(struct task_struct *task)
{
struct perf_event_context *ctx;
struct perf_event *event, *tmp;
- int ctxn;
- for_each_task_context_nr(ctxn) {
- ctx = task->perf_event_ctxp[ctxn];
- if (!ctx)
- continue;
+ ctx = rcu_access_pointer(task->perf_event_ctxp);
+ if (!ctx)
+ return;
- mutex_lock(&ctx->mutex);
- raw_spin_lock_irq(&ctx->lock);
- /*
- * Destroy the task <-> ctx relation and mark the context dead.
- *
- * This is important because even though the task hasn't been
- * exposed yet the context has been (through child_list).
- */
- RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
- WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
- put_task_struct(task); /* cannot be last */
- raw_spin_unlock_irq(&ctx->lock);
+ mutex_lock(&ctx->mutex);
+ raw_spin_lock_irq(&ctx->lock);
+ /*
+ * Destroy the task <-> ctx relation and mark the context dead.
+ *
+ * This is important because even though the task hasn't been
+ * exposed yet the context has been (through child_list).
+ */
+ RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
+ WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
+ put_task_struct(task); /* cannot be last */
+ raw_spin_unlock_irq(&ctx->lock);
- list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
- perf_free_event(event, ctx);
- mutex_unlock(&ctx->mutex);
+ list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
+ perf_free_event(event, ctx);
- /*
- * perf_event_release_kernel() could've stolen some of our
- * child events and still have them on its free_list. In that
- * case we must wait for these events to have been freed (in
- * particular all their references to this task must've been
- * dropped).
- *
- * Without this copy_process() will unconditionally free this
- * task (irrespective of its reference count) and
- * _free_event()'s put_task_struct(event->hw.target) will be a
- * use-after-free.
- *
- * Wait for all events to drop their context reference.
- */
- wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
- put_ctx(ctx); /* must be last */
- }
+ mutex_unlock(&ctx->mutex);
+
+ /*
+ * perf_event_release_kernel() could've stolen some of our
+ * child events and still have them on its free_list. In that
+ * case we must wait for these events to have been freed (in
+ * particular all their references to this task must've been
+ * dropped).
+ *
+ * Without this copy_process() will unconditionally free this
+ * task (irrespective of its reference count) and
+ * _free_event()'s put_task_struct(event->hw.target) will be a
+ * use-after-free.
+ *
+ * Wait for all events to drop their context reference.
+ */
+ wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
+ put_ctx(ctx); /* must be last */
}
void perf_event_delayed_put(struct task_struct *task)
{
- int ctxn;
-
- for_each_task_context_nr(ctxn)
- WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
+ WARN_ON_ONCE(task->perf_event_ctxp);
}
struct file *perf_event_get(unsigned int fd)
@@ -13125,6 +13207,7 @@ inherit_event(struct perf_event *parent_event,
struct perf_event_context *child_ctx)
{
enum perf_event_state parent_state = parent_event->state;
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event *child_event;
unsigned long flags;
@@ -13145,17 +13228,12 @@ inherit_event(struct perf_event *parent_event,
if (IS_ERR(child_event))
return child_event;
-
- if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
- !child_ctx->task_ctx_data) {
- struct pmu *pmu = child_event->pmu;
-
- child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
- if (!child_ctx->task_ctx_data) {
- free_event(child_event);
- return ERR_PTR(-ENOMEM);
- }
+ pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
+ if (IS_ERR(pmu_ctx)) {
+ free_event(child_event);
+ return NULL;
}
+ child_event->pmu_ctx = pmu_ctx;
/*
* is_orphaned_event() and list_add_tail(&parent_event->child_list)
@@ -13278,11 +13356,11 @@ static int inherit_group(struct perf_event *parent_event,
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
struct perf_event_context *parent_ctx,
- struct task_struct *child, int ctxn,
+ struct task_struct *child,
u64 clone_flags, int *inherited_all)
{
- int ret;
struct perf_event_context *child_ctx;
+ int ret;
if (!event->attr.inherit ||
(event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
@@ -13292,7 +13370,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
return 0;
}
- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = child->perf_event_ctxp;
if (!child_ctx) {
/*
* This is executed from the parent task context, so
@@ -13300,16 +13378,14 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
* First allocate and initialize a context for the
* child.
*/
- child_ctx = alloc_perf_context(parent_ctx->pmu, child);
+ child_ctx = alloc_perf_context(child);
if (!child_ctx)
return -ENOMEM;
- child->perf_event_ctxp[ctxn] = child_ctx;
+ child->perf_event_ctxp = child_ctx;
}
- ret = inherit_group(event, parent, parent_ctx,
- child, child_ctx);
-
+ ret = inherit_group(event, parent, parent_ctx, child, child_ctx);
if (ret)
*inherited_all = 0;
@@ -13319,8 +13395,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
/*
* Initialize the perf_event context in task_struct
*/
-static int perf_event_init_context(struct task_struct *child, int ctxn,
- u64 clone_flags)
+static int perf_event_init_context(struct task_struct *child, u64 clone_flags)
{
struct perf_event_context *child_ctx, *parent_ctx;
struct perf_event_context *cloned_ctx;
@@ -13330,14 +13405,14 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
unsigned long flags;
int ret = 0;
- if (likely(!parent->perf_event_ctxp[ctxn]))
+ if (likely(!parent->perf_event_ctxp))
return 0;
/*
* If the parent's context is a clone, pin it so it won't get
* swapped under us.
*/
- parent_ctx = perf_pin_task_context(parent, ctxn);
+ parent_ctx = perf_pin_task_context(parent);
if (!parent_ctx)
return 0;
@@ -13360,8 +13435,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
*/
perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
- child, ctxn, clone_flags,
- &inherited_all);
+ child, clone_flags, &inherited_all);
if (ret)
goto out_unlock;
}
@@ -13377,8 +13451,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
- child, ctxn, clone_flags,
- &inherited_all);
+ child, clone_flags, &inherited_all);
if (ret)
goto out_unlock;
}
@@ -13386,7 +13459,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
raw_spin_lock_irqsave(&parent_ctx->lock, flags);
parent_ctx->rotate_disable = 0;
- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = child->perf_event_ctxp;
if (child_ctx && inherited_all) {
/*
@@ -13422,18 +13495,16 @@ out_unlock:
*/
int perf_event_init_task(struct task_struct *child, u64 clone_flags)
{
- int ctxn, ret;
+ int ret;
- memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
+ child->perf_event_ctxp = NULL;
mutex_init(&child->perf_event_mutex);
INIT_LIST_HEAD(&child->perf_event_list);
- for_each_task_context_nr(ctxn) {
- ret = perf_event_init_context(child, ctxn, clone_flags);
- if (ret) {
- perf_event_free_task(child);
- return ret;
- }
+ ret = perf_event_init_context(child, clone_flags);
+ if (ret) {
+ perf_event_free_task(child);
+ return ret;
}
return 0;
@@ -13442,6 +13513,7 @@ int perf_event_init_task(struct task_struct *child, u64 clone_flags)
static void __init perf_event_init_all_cpus(void)
{
struct swevent_htable *swhash;
+ struct perf_cpu_context *cpuctx;
int cpu;
zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
@@ -13449,15 +13521,19 @@ static void __init perf_event_init_all_cpus(void)
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
mutex_init(&swhash->hlist_mutex);
- INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
-#ifdef CONFIG_CGROUP_PERF
- INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
-#endif
INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
+
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ __perf_event_init_context(&cpuctx->ctx);
+ lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
+ lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
+ cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
+ cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
+ cpuctx->heap = cpuctx->heap_default;
}
}
@@ -13479,12 +13555,12 @@ static void perf_swevent_init_cpu(unsigned int cpu)
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *ctx = __info;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event *event;
raw_spin_lock(&ctx->lock);
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry)
__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
raw_spin_unlock(&ctx->lock);
@@ -13494,18 +13570,16 @@ static void perf_event_exit_cpu_context(int cpu)
{
struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
- struct pmu *pmu;
+ // XXX simplify cpuctx->online
mutex_lock(&pmus_lock);
- list_for_each_entry(pmu, &pmus, entry) {
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- ctx = &cpuctx->ctx;
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
- mutex_lock(&ctx->mutex);
- smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
- cpuctx->online = 0;
- mutex_unlock(&ctx->mutex);
- }
+ mutex_lock(&ctx->mutex);
+ smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+ cpuctx->online = 0;
+ mutex_unlock(&ctx->mutex);
cpumask_clear_cpu(cpu, perf_online_mask);
mutex_unlock(&pmus_lock);
}
@@ -13519,20 +13593,17 @@ int perf_event_init_cpu(unsigned int cpu)
{
struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
- struct pmu *pmu;
perf_swevent_init_cpu(cpu);
mutex_lock(&pmus_lock);
cpumask_set_cpu(cpu, perf_online_mask);
- list_for_each_entry(pmu, &pmus, entry) {
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- ctx = &cpuctx->ctx;
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
- mutex_lock(&ctx->mutex);
- cpuctx->online = 1;
- mutex_unlock(&ctx->mutex);
- }
+ mutex_lock(&ctx->mutex);
+ cpuctx->online = 1;
+ mutex_unlock(&ctx->mutex);
mutex_unlock(&pmus_lock);
return 0;
@@ -13669,9 +13740,12 @@ static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
static int __perf_cgroup_move(void *info)
{
struct task_struct *task = info;
- rcu_read_lock();
- perf_cgroup_switch(task);
- rcu_read_unlock();
+
+ preempt_disable();
+ if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
+ perf_cgroup_switch(task);
+ preempt_enable();
+
return 0;
}