diff options
27 files changed, 1128 insertions, 59 deletions
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 459abc5d1819..ea128e32e8ca 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -708,6 +708,7 @@ static_assert(offsetof(siginfo_t, si_upper) == 0x18); static_assert(offsetof(siginfo_t, si_pkey) == 0x14); static_assert(offsetof(siginfo_t, si_perf_data) == 0x10); static_assert(offsetof(siginfo_t, si_perf_type) == 0x14); +static_assert(offsetof(siginfo_t, si_perf_flags) == 0x18); static_assert(offsetof(siginfo_t, si_band) == 0x0c); static_assert(offsetof(siginfo_t, si_fd) == 0x10); static_assert(offsetof(siginfo_t, si_call_addr) == 0x0c); diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 4a4122ef6f39..41b5d9d3672a 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -1011,6 +1011,7 @@ static_assert(offsetof(siginfo_t, si_upper) == 0x28); static_assert(offsetof(siginfo_t, si_pkey) == 0x20); static_assert(offsetof(siginfo_t, si_perf_data) == 0x18); static_assert(offsetof(siginfo_t, si_perf_type) == 0x20); +static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24); static_assert(offsetof(siginfo_t, si_band) == 0x10); static_assert(offsetof(siginfo_t, si_fd) == 0x18); static_assert(offsetof(siginfo_t, si_call_addr) == 0x10); diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index d984282b979f..4700f8522d27 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -487,6 +487,7 @@ static_assert(offsetof(compat_siginfo_t, si_upper) == 0x18); static_assert(offsetof(compat_siginfo_t, si_pkey) == 0x14); static_assert(offsetof(compat_siginfo_t, si_perf_data) == 0x10); static_assert(offsetof(compat_siginfo_t, si_perf_type) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_perf_flags) == 0x18); static_assert(offsetof(compat_siginfo_t, si_band) == 0x0c); static_assert(offsetof(compat_siginfo_t, si_fd) == 0x10); static_assert(offsetof(compat_siginfo_t, si_call_addr) == 0x0c); diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 49533f65958a..b9f6908a31bc 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -625,6 +625,7 @@ static inline void siginfo_build_tests(void) /* _sigfault._perf */ BUILD_BUG_ON(offsetof(siginfo_t, si_perf_data) != 0x10); BUILD_BUG_ON(offsetof(siginfo_t, si_perf_type) != 0x14); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf_flags) != 0x18); /* _sigpoll */ BUILD_BUG_ON(offsetof(siginfo_t, si_band) != 0x0c); diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index f9fe502b81c6..dad38960d1a8 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -779,5 +779,6 @@ static_assert(offsetof(compat_siginfo_t, si_upper) == 0x18); static_assert(offsetof(compat_siginfo_t, si_pkey) == 0x14); static_assert(offsetof(compat_siginfo_t, si_perf_data) == 0x10); static_assert(offsetof(compat_siginfo_t, si_perf_type) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_perf_flags) == 0x18); static_assert(offsetof(compat_siginfo_t, si_band) == 0x0c); static_assert(offsetof(compat_siginfo_t, si_fd) == 0x10); diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index 8b9fc76cd3e0..570e43e6fda5 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -590,5 +590,6 @@ static_assert(offsetof(siginfo_t, si_upper) == 0x28); static_assert(offsetof(siginfo_t, si_pkey) == 0x20); static_assert(offsetof(siginfo_t, si_perf_data) == 0x18); static_assert(offsetof(siginfo_t, si_perf_type) == 0x20); +static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24); static_assert(offsetof(siginfo_t, si_band) == 0x10); static_assert(offsetof(siginfo_t, si_fd) == 0x14); diff --git a/arch/x86/events/Kconfig b/arch/x86/events/Kconfig index d6cdfe631674..09c56965750a 100644 --- a/arch/x86/events/Kconfig +++ b/arch/x86/events/Kconfig @@ -44,4 +44,12 @@ config PERF_EVENTS_AMD_UNCORE To compile this driver as a module, choose M here: the module will be called 'amd-uncore'. + +config PERF_EVENTS_AMD_BRS + depends on PERF_EVENTS && CPU_SUP_AMD + bool "AMD Zen3 Branch Sampling support" + help + Enable AMD Zen3 branch sampling support (BRS) which samples up to + 16 consecutive taken branches in registers. + endmenu diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile index 6cbe38d5fd9d..b9f5d4610256 100644 --- a/arch/x86/events/amd/Makefile +++ b/arch/x86/events/amd/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_CPU_SUP_AMD) += core.o +obj-$(CONFIG_PERF_EVENTS_AMD_BRS) += brs.o obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o obj-$(CONFIG_PERF_EVENTS_AMD_UNCORE) += amd-uncore.o diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c new file mode 100644 index 000000000000..895c82165d85 --- /dev/null +++ b/arch/x86/events/amd/brs.c @@ -0,0 +1,364 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implement support for AMD Fam19h Branch Sampling feature + * Based on specifications published in AMD PPR Fam19 Model 01 + * + * Copyright 2021 Google LLC + * Contributed by Stephane Eranian <eranian@google.com> + */ +#include <linux/kernel.h> +#include <linux/jump_label.h> +#include <asm/msr.h> +#include <asm/cpufeature.h> + +#include "../perf_event.h" + +#define BRS_POISON 0xFFFFFFFFFFFFFFFEULL /* mark limit of valid entries */ + +/* Debug Extension Configuration register layout */ +union amd_debug_extn_cfg { + __u64 val; + struct { + __u64 rsvd0:2, /* reserved */ + brsmen:1, /* branch sample enable */ + rsvd4_3:2,/* reserved - must be 0x3 */ + vb:1, /* valid branches recorded */ + rsvd2:10, /* reserved */ + msroff:4, /* index of next entry to write */ + rsvd3:4, /* reserved */ + pmc:3, /* #PMC holding the sampling event */ + rsvd4:37; /* reserved */ + }; +}; + +static inline unsigned int brs_from(int idx) +{ + return MSR_AMD_SAMP_BR_FROM + 2 * idx; +} + +static inline unsigned int brs_to(int idx) +{ + return MSR_AMD_SAMP_BR_FROM + 2 * idx + 1; +} + +static inline void set_debug_extn_cfg(u64 val) +{ + /* bits[4:3] must always be set to 11b */ + wrmsrl(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3); +} + +static inline u64 get_debug_extn_cfg(void) +{ + u64 val; + + rdmsrl(MSR_AMD_DBG_EXTN_CFG, val); + return val; +} + +static bool __init amd_brs_detect(void) +{ + if (!boot_cpu_has(X86_FEATURE_BRS)) + return false; + + switch (boot_cpu_data.x86) { + case 0x19: /* AMD Fam19h (Zen3) */ + x86_pmu.lbr_nr = 16; + + /* No hardware filtering supported */ + x86_pmu.lbr_sel_map = NULL; + x86_pmu.lbr_sel_mask = 0; + break; + default: + return false; + } + + return true; +} + +/* + * Current BRS implementation does not support branch type or privilege level + * filtering. Therefore, this function simply enforces these limitations. No need for + * a br_sel_map. Software filtering is not supported because it would not correlate well + * with a sampling period. + */ +int amd_brs_setup_filter(struct perf_event *event) +{ + u64 type = event->attr.branch_sample_type; + + /* No BRS support */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + /* Can only capture all branches, i.e., no filtering */ + if ((type & ~PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_ANY) + return -EINVAL; + + return 0; +} + +/* tos = top of stack, i.e., last valid entry written */ +static inline int amd_brs_get_tos(union amd_debug_extn_cfg *cfg) +{ + /* + * msroff: index of next entry to write so top-of-stack is one off + * if BRS is full then msroff is set back to 0. + */ + return (cfg->msroff ? cfg->msroff : x86_pmu.lbr_nr) - 1; +} + +/* + * make sure we have a sane BRS offset to begin with + * especially with kexec + */ +void amd_brs_reset(void) +{ + /* + * Reset config + */ + set_debug_extn_cfg(0); + + /* + * Mark first entry as poisoned + */ + wrmsrl(brs_to(0), BRS_POISON); +} + +int __init amd_brs_init(void) +{ + if (!amd_brs_detect()) + return -EOPNOTSUPP; + + pr_cont("%d-deep BRS, ", x86_pmu.lbr_nr); + + return 0; +} + +void amd_brs_enable(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* Activate only on first user */ + if (++cpuc->brs_active > 1) + return; + + cfg.val = 0; /* reset all fields */ + cfg.brsmen = 1; /* enable branch sampling */ + + /* Set enable bit */ + set_debug_extn_cfg(cfg.val); +} + +void amd_brs_enable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (cpuc->lbr_users) + amd_brs_enable(); +} + +void amd_brs_disable(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* Check if active (could be disabled via x86_pmu_disable_all()) */ + if (!cpuc->brs_active) + return; + + /* Only disable for last user */ + if (--cpuc->brs_active) + return; + + /* + * Clear the brsmen bit but preserve the others as they contain + * useful state such as vb and msroff + */ + cfg.val = get_debug_extn_cfg(); + + /* + * When coming in on interrupt and BRS is full, then hw will have + * already stopped BRS, no need to issue wrmsr again + */ + if (cfg.brsmen) { + cfg.brsmen = 0; + set_debug_extn_cfg(cfg.val); + } +} + +void amd_brs_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (cpuc->lbr_users) + amd_brs_disable(); +} + +static bool amd_brs_match_plm(struct perf_event *event, u64 to) +{ + int type = event->attr.branch_sample_type; + int plm_k = PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_HV; + int plm_u = PERF_SAMPLE_BRANCH_USER; + + if (!(type & plm_k) && kernel_ip(to)) + return 0; + + if (!(type & plm_u) && !kernel_ip(to)) + return 0; + + return 1; +} + +/* + * Caller must ensure amd_brs_inuse() is true before calling + * return: + */ +void amd_brs_drain(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event = cpuc->events[0]; + struct perf_branch_entry *br = cpuc->lbr_entries; + union amd_debug_extn_cfg cfg; + u32 i, nr = 0, num, tos, start; + u32 shift = 64 - boot_cpu_data.x86_virt_bits; + + /* + * BRS event forced on PMC0, + * so check if there is an event. + * It is possible to have lbr_users > 0 but the event + * not yet scheduled due to long latency PMU irq + */ + if (!event) + goto empty; + + cfg.val = get_debug_extn_cfg(); + + /* Sanity check [0-x86_pmu.lbr_nr] */ + if (WARN_ON_ONCE(cfg.msroff >= x86_pmu.lbr_nr)) + goto empty; + + /* No valid branch */ + if (cfg.vb == 0) + goto empty; + + /* + * msr.off points to next entry to be written + * tos = most recent entry index = msr.off - 1 + * BRS register buffer saturates, so we know we have + * start < tos and that we have to read from start to tos + */ + start = 0; + tos = amd_brs_get_tos(&cfg); + + num = tos - start + 1; + + /* + * BRS is only one pass (saturation) from MSROFF to depth-1 + * MSROFF wraps to zero when buffer is full + */ + for (i = 0; i < num; i++) { + u32 brs_idx = tos - i; + u64 from, to; + + rdmsrl(brs_to(brs_idx), to); + + /* Entry does not belong to us (as marked by kernel) */ + if (to == BRS_POISON) + break; + + /* + * Sign-extend SAMP_BR_TO to 64 bits, bits 61-63 are reserved. + * Necessary to generate proper virtual addresses suitable for + * symbolization + */ + to = (u64)(((s64)to << shift) >> shift); + + if (!amd_brs_match_plm(event, to)) + continue; + + rdmsrl(brs_from(brs_idx), from); + + perf_clear_branch_entry_bitfields(br+nr); + + br[nr].from = from; + br[nr].to = to; + + nr++; + } +empty: + /* Record number of sampled branches */ + cpuc->lbr_stack.nr = nr; +} + +/* + * Poison most recent entry to prevent reuse by next task + * required because BRS entry are not tagged by PID + */ +static void amd_brs_poison_buffer(void) +{ + union amd_debug_extn_cfg cfg; + unsigned int idx; + + /* Get current state */ + cfg.val = get_debug_extn_cfg(); + + /* idx is most recently written entry */ + idx = amd_brs_get_tos(&cfg); + + /* Poison target of entry */ + wrmsrl(brs_to(idx), BRS_POISON); +} + +/* + * On context switch in, we need to make sure no samples from previous user + * are left in the BRS. + * + * On ctxswin, sched_in = true, called after the PMU has started + * On ctxswout, sched_in = false, called before the PMU is stopped + */ +void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* no active users */ + if (!cpuc->lbr_users) + return; + + /* + * On context switch in, we need to ensure we do not use entries + * from previous BRS user on that CPU, so we poison the buffer as + * a faster way compared to resetting all entries. + */ + if (sched_in) + amd_brs_poison_buffer(); +} + +/* + * called from ACPI processor_idle.c or acpi_pad.c + * with interrupts disabled + */ +void perf_amd_brs_lopwr_cb(bool lopwr_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* + * on mwait in, we may end up in non C0 state. + * we must disable branch sampling to avoid holding the NMI + * for too long. We disable it in hardware but we + * keep the state in cpuc, so we can re-enable. + * + * The hardware will deliver the NMI if needed when brsmen cleared + */ + if (cpuc->brs_active) { + cfg.val = get_debug_extn_cfg(); + cfg.brsmen = !lopwr_in; + set_debug_extn_cfg(cfg.val); + } +} + +DEFINE_STATIC_CALL_NULL(perf_lopwr_cb, perf_amd_brs_lopwr_cb); +EXPORT_STATIC_CALL_TRAMP_GPL(perf_lopwr_cb); + +void __init amd_brs_lopwr_init(void) +{ + static_call_update(perf_lopwr_cb, perf_amd_brs_lopwr_cb); +} diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 9687a8aef01c..262e39a85031 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/perf_event.h> +#include <linux/jump_label.h> #include <linux/export.h> #include <linux/types.h> #include <linux/init.h> @@ -7,6 +8,7 @@ #include <linux/delay.h> #include <linux/jiffies.h> #include <asm/apicdef.h> +#include <asm/apic.h> #include <asm/nmi.h> #include "../perf_event.h" @@ -18,6 +20,9 @@ static unsigned long perf_nmi_window; #define AMD_MERGE_EVENT ((0xFULL << 32) | 0xFFULL) #define AMD_MERGE_EVENT_ENABLE (AMD_MERGE_EVENT | ARCH_PERFMON_EVENTSEL_ENABLE) +/* PMC Enable and Overflow bits for PerfCntrGlobal* registers */ +static u64 amd_pmu_global_cntr_mask __read_mostly; + static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -325,8 +330,16 @@ static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc) } } +#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ +static inline int amd_is_brs_event(struct perf_event *e) +{ + return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT; +} + static int amd_core_hw_config(struct perf_event *event) { + int ret = 0; + if (event->attr.exclude_host && event->attr.exclude_guest) /* * When HO == GO == 1 the hardware treats that as GO == HO == 0 @@ -343,7 +356,66 @@ static int amd_core_hw_config(struct perf_event *event) if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw)) event->hw.flags |= PERF_X86_EVENT_PAIR; - return 0; + /* + * if branch stack is requested + */ + if (has_branch_stack(event)) { + /* + * Due to interrupt holding, BRS is not recommended in + * counting mode. + */ + if (!is_sampling_event(event)) + return -EINVAL; + + /* + * Due to the way BRS operates by holding the interrupt until + * lbr_nr entries have been captured, it does not make sense + * to allow sampling on BRS with an event that does not match + * what BRS is capturing, i.e., retired taken branches. + * Otherwise the correlation with the event's period is even + * more loose: + * + * With retired taken branch: + * Effective P = P + 16 + X + * With any other event: + * Effective P = P + Y + X + * + * Where X is the number of taken branches due to interrupt + * skid. Skid is large. + * + * Where Y is the occurences of the event while BRS is + * capturing the lbr_nr entries. + * + * By using retired taken branches, we limit the impact on the + * Y variable. We know it cannot be more than the depth of + * BRS. + */ + if (!amd_is_brs_event(event)) + return -EINVAL; + + /* + * BRS implementation does not work with frequency mode + * reprogramming of the period. + */ + if (event->attr.freq) + return -EINVAL; + /* + * The kernel subtracts BRS depth from period, so it must + * be big enough. + */ + if (event->attr.sample_period <= x86_pmu.lbr_nr) + return -EINVAL; + + /* + * Check if we can allow PERF_SAMPLE_BRANCH_STACK + */ + ret = amd_brs_setup_filter(event); + + /* only set in case of success */ + if (!ret) + event->hw.flags |= PERF_X86_EVENT_AMD_BRS; + } + return ret; } static inline int amd_is_nb_event(struct hw_perf_event *hwc) @@ -366,7 +438,7 @@ static int amd_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip && get_ibs_caps()) return -ENOENT; - if (has_branch_stack(event)) + if (has_branch_stack(event) && !x86_pmu.lbr_nr) return -EOPNOTSUPP; ret = x86_pmu_hw_config(event); @@ -510,6 +582,18 @@ static struct amd_nb *amd_alloc_nb(int cpu) return nb; } +static void amd_pmu_cpu_reset(int cpu) +{ + if (x86_pmu.version < 2) + return; + + /* Clear enable bits i.e. PerfCntrGlobalCtl.PerfCntrEn */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); + + /* Clear overflow bits i.e. PerfCntrGLobalStatus.PerfCntrOvfl */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, amd_pmu_global_cntr_mask); +} + static int amd_pmu_cpu_prepare(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); @@ -555,6 +639,9 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; + + amd_brs_reset(); + amd_pmu_cpu_reset(cpu); } static void amd_pmu_cpu_dead(int cpu) @@ -574,8 +661,54 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw->amd_nb = NULL; } + + amd_pmu_cpu_reset(cpu); +} + +static inline void amd_pmu_set_global_ctl(u64 ctl) +{ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl); +} + +static inline u64 amd_pmu_get_global_status(void) +{ + u64 status; + + /* PerfCntrGlobalStatus is read-only */ + rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, status); + + return status & amd_pmu_global_cntr_mask; +} + +static inline void amd_pmu_ack_global_status(u64 status) +{ + /* + * PerfCntrGlobalStatus is read-only but an overflow acknowledgment + * mechanism exists; writing 1 to a bit in PerfCntrGlobalStatusClr + * clears the same bit in PerfCntrGlobalStatus + */ + + /* Only allow modifications to PerfCntrGlobalStatus.PerfCntrOvfl */ + status &= amd_pmu_global_cntr_mask; + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, status); +} + +static bool amd_pmu_test_overflow_topbit(int idx) +{ + u64 counter; + + rdmsrl(x86_pmu_event_addr(idx), counter); + + return !(counter & BIT_ULL(x86_pmu.cntval_bits - 1)); +} + +static bool amd_pmu_test_overflow_status(int idx) +{ + return amd_pmu_get_global_status() & BIT_ULL(idx); } +DEFINE_STATIC_CALL(amd_pmu_test_overflow, amd_pmu_test_overflow_topbit); + /* * When a PMC counter overflows, an NMI is used to process the event and * reset the counter. NMI latency can result in the counter being updated @@ -588,7 +721,6 @@ static void amd_pmu_cpu_dead(int cpu) static void amd_pmu_wait_on_overflow(int idx) { unsigned int i; - u64 counter; /* * Wait for the counter to be reset if it has overflowed. This loop @@ -596,8 +728,7 @@ static void amd_pmu_wait_on_overflow(int idx) * forever... */ for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) { - rdmsrl(x86_pmu_event_addr(idx), counter); - if (counter & (1ULL << (x86_pmu.cntval_bits - 1))) + if (!static_call(amd_pmu_test_overflow)(idx)) break; /* Might be in IRQ context, so can't sleep */ @@ -605,13 +736,11 @@ static void amd_pmu_wait_on_overflow(int idx) } } -static void amd_pmu_disable_all(void) +static void amd_pmu_check_overflow(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; - x86_pmu_disable_all(); - /* * This shouldn't be called from NMI context, but add a safeguard here * to return, since if we're in NMI context we can't wait for an NMI @@ -634,6 +763,50 @@ static void amd_pmu_disable_all(void) } } +static void amd_pmu_enable_event(struct perf_event *event) +{ + x86_pmu_enable_event(event); +} + +static void amd_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc; + int idx; + + amd_brs_enable_all(); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + hwc = &cpuc->events[idx]->hw; + + /* only activate events which are marked as active */ + if (!test_bit(idx, cpuc->active_mask)) + continue; + + amd_pmu_enable_event(cpuc->events[idx]); + } +} + +static void amd_pmu_v2_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + /* + * Testing cpu_hw_events.enabled should be skipped in this case unlike + * in x86_pmu_enable_event(). + * + * Since cpu_hw_events.enabled is set only after returning from + * x86_pmu_start(), the PMCs must be programmed and kept ready. + * Counting starts only after x86_pmu_enable_all() is called. + */ + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); +} + +static void amd_pmu_v2_enable_all(int added) +{ + amd_pmu_set_global_ctl(amd_pmu_global_cntr_mask); +} + static void amd_pmu_disable_event(struct perf_event *event) { x86_pmu_disable_event(event); @@ -651,6 +824,32 @@ static void amd_pmu_disable_event(struct perf_event *event) amd_pmu_wait_on_overflow(event->hw.idx); } +static void amd_pmu_disable_all(void) +{ + amd_brs_disable_all(); + x86_pmu_disable_all(); + amd_pmu_check_overflow(); +} + +static void amd_pmu_v2_disable_all(void) +{ + /* Disable all PMCs */ + amd_pmu_set_global_ctl(0); + amd_pmu_check_overflow(); +} + +static void amd_pmu_add_event(struct perf_event *event) +{ + if (needs_branch_stack(event)) + amd_pmu_brs_add(event); +} + +static void amd_pmu_del_event(struct perf_event *event) +{ + if (needs_branch_stack(event)) + amd_pmu_brs_del(event); +} + /* * Because of NMI latency, if multiple PMC counters are active or other sources * of NMIs are received, the perf NMI handler can handle one or more overflowed @@ -669,13 +868,8 @@ static void amd_pmu_disable_event(struct perf_event *event) * handled a counter. When an un-handled NMI is received, it will be claimed * only if arriving within that window. */ -static int amd_pmu_handle_irq(struct pt_regs *regs) +static inline int amd_pmu_adjust_nmi_window(int handled) { - int handled; - - /* Process any counter overflows */ - handled = x86_pmu_handle_irq(regs); - /* * If a counter was handled, record a timestamp such that un-handled * NMIs will be claimed if arriving within that window. @@ -692,6 +886,113 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) return NMI_HANDLED; } +static int amd_pmu_handle_irq(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int handled; + int pmu_enabled; + + /* + * Save the PMU state. + * It needs to be restored when leaving the handler. + */ + pmu_enabled = cpuc->enabled; + cpuc->enabled = 0; + + /* stop everything (includes BRS) */ + amd_pmu_disable_all(); + + /* Drain BRS is in use (could be inactive) */ + if (cpuc->lbr_users) + amd_brs_drain(); + + /* Process any counter overflows */ + handled = x86_pmu_handle_irq(regs); + + cpuc->enabled = pmu_enabled; + if (pmu_enabled) + amd_pmu_enable_all(0); + + return amd_pmu_adjust_nmi_window(handled); +} + +static int amd_pmu_v2_handle_irq(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_sample_data data; + struct hw_perf_event *hwc; + struct perf_event *event; + int handled = 0, idx; + u64 status, mask; + bool pmu_enabled; + + /* + * Save the PMU state as it needs to be restored when leaving the + * handler + */ + pmu_enabled = cpuc->enabled; + cpuc->enabled = 0; + + /* Stop counting */ + amd_pmu_v2_disable_all(); + + status = amd_pmu_get_global_status(); + + /* Check if any overflows are pending */ + if (!status) + goto done; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + event = cpuc->events[idx]; + hwc = &event->hw; + x86_perf_event_update(event); + mask = BIT_ULL(idx); + + if (!(status & mask)) + continue; + + /* Event overflow */ + handled++; + perf_sample_data_init(&data, 0, hwc->last_period); + + if (!x86_perf_event_set_period(event)) + continue; + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + + status &= ~mask; + } + + /* + * It should never be the case that some overflows are not handled as + * the corresponding PMCs are expected to be inactive according to the + * active_mask + */ + WARN_ON(status > 0); + + /* Clear overflow bits */ + amd_pmu_ack_global_status(~status); + + /* + * Unmasking the LVTPC is not required as the Mask (M) bit of the LVT + * PMI entry is not set by the local APIC when a PMC overflow occurs + */ + inc_irq_stat(apic_perf_irqs); + +done: + cpuc->enabled = pmu_enabled; + + /* Resume counting only if PMU is active */ + if (pmu_enabled) + amd_pmu_v2_enable_all(0); + + return amd_pmu_adjust_nmi_window(handled); +} + static struct event_constraint * amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) @@ -897,6 +1198,51 @@ static void amd_put_event_constraints_f17h(struct cpu_hw_events *cpuc, --cpuc->n_pair; } +/* + * Because of the way BRS operates with an inactive and active phases, and + * the link to one counter, it is not possible to have two events using BRS + * scheduled at the same time. There would be an issue with enforcing the + * period of each one and given that the BRS saturates, it would not be possible + * to guarantee correlated content for all events. Therefore, in situations + * where multiple events want to use BRS, the kernel enforces mutual exclusion. + * Exclusion is enforced by chosing only one counter for events using BRS. + * The event scheduling logic will then automatically multiplex the + * events and ensure that at most one event is actively using BRS. + * + * The BRS counter could be any counter, but there is no constraint on Fam19h, + * therefore all counters are equal and thus we pick the first one: PMC0 + */ +static struct event_constraint amd_fam19h_brs_cntr0_constraint = + EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK); + +static struct event_constraint amd_fam19h_brs_pair_cntr0_constraint = + __EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK, 1, 0, PERF_X86_EVENT_PAIR); + +static struct event_constraint * +amd_get_event_constraints_f19h(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + bool has_brs = has_amd_brs(hwc); + + /* + * In case BRS is used with an event requiring a counter pair, + * the kernel allows it but only on counter 0 & 1 to enforce + * multiplexing requiring to protect BRS in case of multiple + * BRS users + */ + if (amd_is_pair_event_code(hwc)) { + return has_brs ? &amd_fam19h_brs_pair_cntr0_constraint + : &pair_constraint; + } + + if (has_brs) + return &amd_fam19h_brs_cntr0_constraint; + + return &unconstrained; +} + + static ssize_t amd_event_sysfs_show(char *page, u64 config) { u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | @@ -905,12 +1251,19 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config) return x86_event_sysfs_show(page, config, event); } +static void amd_pmu_sched_task(struct perf_event_context *ctx, + bool sched_in) +{ + if (sched_in && x86_pmu.lbr_nr) + amd_pmu_brs_sched_task(ctx, sched_in); +} + static __initconst const struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, .disable_all = amd_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, + .enable_all = amd_pmu_enable_all, + .enable = amd_pmu_enable_event, .disable = amd_pmu_disable_event, .hw_config = amd_pmu_hw_config, .schedule_events = x86_schedule_events, @@ -920,6 +1273,8 @@ static __initconst const struct x86_pmu amd_pmu = { .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), .num_counters = AMD64_NUM_COUNTERS, + .add = amd_pmu_add_event, + .del = amd_pmu_del_event, .cntval_bits = 48, .cntval_mask = (1ULL << 48) - 1, .apic = 1, @@ -938,8 +1293,55 @@ static __initconst const struct x86_pmu amd_pmu = { .amd_nb_constraints = 1, }; +static ssize_t branches_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr); +} + +static DEVICE_ATTR_RO(branches); + +static struct attribute *amd_pmu_brs_attrs[] = { + &dev_attr_branches.attr, + NULL, +}; + +static umode_t +amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return x86_pmu.lbr_nr ? attr->mode : 0; +} + +static struct attribute_group group_caps_amd_brs = { + .name = "caps", + .attrs = amd_pmu_brs_attrs, + .is_visible = amd_brs_is_visible, +}; + +EVENT_ATTR_STR(branch-brs, amd_branch_brs, + "event=" __stringify(AMD_FAM19H_BRS_EVENT)"\n"); + +static struct attribute *amd_brs_events_attrs[] = { + EVENT_PTR(amd_branch_brs), + NULL, +}; + +static struct attribute_group group_events_amd_brs = { + .name = "events", + .attrs = amd_brs_events_attrs, + .is_visible = amd_brs_is_visible, +}; + +static const struct attribute_group *amd_attr_update[] = { + &group_caps_amd_brs, + &group_events_amd_brs, + NULL, +}; + static int __init amd_core_pmu_init(void) { + union cpuid_0x80000022_ebx ebx; u64 even_ctr_mask = 0ULL; int i; @@ -957,6 +1359,27 @@ static int __init amd_core_pmu_init(void) x86_pmu.eventsel = MSR_F15H_PERF_CTL; x86_pmu.perfctr = MSR_F15H_PERF_CTR; x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; + + /* Check for Performance Monitoring v2 support */ + if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) { + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); + + /* Update PMU version for later usage */ + x86_pmu.version = 2; + + /* Find the number of available Core PMCs */ + x86_pmu.num_counters = ebx.split.num_core_pmc; + + amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1; + + /* Update PMC handling functions */ + x86_pmu.enable_all = amd_pmu_v2_enable_all; + x86_pmu.disable_all = amd_pmu_v2_disable_all; + x86_pmu.enable = amd_pmu_v2_enable_event; + x86_pmu.handle_irq = amd_pmu_v2_handle_irq; + static_call_update(amd_pmu_test_overflow, amd_pmu_test_overflow_status); + } + /* * AMD Core perfctr has separate MSRs for the NB events, see * the amd/uncore.c driver. @@ -989,6 +1412,22 @@ static int __init amd_core_pmu_init(void) x86_pmu.flags |= PMU_FL_PAIR; } + /* + * BRS requires special event constraints and flushing on ctxsw. + */ + if (boot_cpu_data.x86 >= 0x19 && !amd_brs_init()) { + x86_pmu.get_event_constraints = amd_get_event_constraints_f19h; + x86_pmu.sched_task = amd_pmu_sched_task; + /* + * put_event_constraints callback same as Fam17h, set above + */ + + /* branch sampling must be stopped when entering low power */ + amd_brs_lopwr_init(); + } + + x86_pmu.attr_update = amd_attr_update; + pr_cont("core perfctr, "); return 0; } diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 9739019d4b67..11e8b493e015 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -304,6 +304,16 @@ static int perf_ibs_init(struct perf_event *event) hwc->config_base = perf_ibs->msr; hwc->config = config; + /* + * rip recorded by IbsOpRip will not be consistent with rsp and rbp + * recorded as part of interrupt regs. Thus we need to use rip from + * interrupt regs while unwinding call stack. Setting _EARLY flag + * makes sure we unwind call-stack before perf sample rip is set to + * IbsOpRip. + */ + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; + return 0; } @@ -687,6 +697,14 @@ fail: data.raw = &raw; } + /* + * rip recorded by IbsOpRip will not be consistent with rsp and rbp + * recorded as part of interrupt regs. Thus we need to use rip from + * interrupt regs while unwinding call stack. + */ + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + data.callchain = perf_callchain(event, iregs); + throttle = perf_event_overflow(event, &data, ®s); out: if (throttle) { diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index eef816fc216d..b08052b05db6 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1338,6 +1338,10 @@ static void x86_pmu_enable(struct pmu *pmu) if (hwc->state & PERF_HES_ARCH) continue; + /* + * if cpuc->enabled = 0, then no wrmsr as + * per x86_pmu_enable_event() + */ x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; @@ -1371,6 +1375,13 @@ int x86_perf_event_set_period(struct perf_event *event) return x86_pmu.set_topdown_event_period(event); /* + * decrease period by the depth of the BRS feature to get + * the last N taken branches and approximate the desired period + */ + if (has_branch_stack(event)) + period = amd_brs_adjust_period(period); + + /* * If we are way outside a reasonable range then just skip forward: */ if (unlikely(left <= -period)) { @@ -1704,11 +1715,15 @@ int x86_pmu_handle_irq(struct pt_regs *regs) * event overflow */ handled++; - perf_sample_data_init(&data, 0, event->hw.last_period); if (!x86_perf_event_set_period(event)) continue; + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } @@ -1837,7 +1852,7 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, cha /* string trumps id */ if (pmu_attr->event_str) - return sprintf(page, "%s", pmu_attr->event_str); + return sprintf(page, "%s\n", pmu_attr->event_str); return x86_pmu.events_sysfs_show(page, config); } diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index fe1742c4ca49..13179f31fe10 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -769,6 +769,7 @@ void intel_pmu_lbr_disable_all(void) void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) { unsigned long mask = x86_pmu.lbr_nr - 1; + struct perf_branch_entry *br = cpuc->lbr_entries; u64 tos = intel_pmu_lbr_tos(); int i; @@ -784,15 +785,11 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); - cpuc->lbr_entries[i].from = msr_lastbranch.from; - cpuc->lbr_entries[i].to = msr_lastbranch.to; - cpuc->lbr_entries[i].mispred = 0; - cpuc->lbr_entries[i].predicted = 0; - cpuc->lbr_entries[i].in_tx = 0; - cpuc->lbr_entries[i].abort = 0; - cpuc->lbr_entries[i].cycles = 0; - cpuc->lbr_entries[i].type = 0; - cpuc->lbr_entries[i].reserved = 0; + perf_clear_branch_entry_bitfields(br); + + br->from = msr_lastbranch.from; + br->to = msr_lastbranch.to; + br++; } cpuc->lbr_stack.nr = i; cpuc->lbr_stack.hw_idx = tos; @@ -807,6 +804,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) { bool need_info = false, call_stack = false; unsigned long mask = x86_pmu.lbr_nr - 1; + struct perf_branch_entry *br = cpuc->lbr_entries; u64 tos = intel_pmu_lbr_tos(); int i; int out = 0; @@ -878,15 +876,14 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (abort && x86_pmu.lbr_double_abort && out > 0) out--; - cpuc->lbr_entries[out].from = from; - cpuc->lbr_entries[out].to = to; - cpuc->lbr_entries[out].mispred = mis; - cpuc->lbr_entries[out].predicted = pred; - cpuc->lbr_entries[out].in_tx = in_tx; - cpuc->lbr_entries[out].abort = abort; - cpuc->lbr_entries[out].cycles = cycles; - cpuc->lbr_entries[out].type = 0; - cpuc->lbr_entries[out].reserved = 0; + perf_clear_branch_entry_bitfields(br+out); + br[out].from = from; + br[out].to = to; + br[out].mispred = mis; + br[out].predicted = pred; + br[out].in_tx = in_tx; + br[out].abort = abort; + br[out].cycles = cycles; out++; } cpuc->lbr_stack.nr = out; @@ -951,6 +948,8 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, to = rdlbr_to(i, lbr); info = rdlbr_info(i, lbr); + perf_clear_branch_entry_bitfields(e); + e->from = from; e->to = to; e->mispred = get_lbr_mispred(info); @@ -959,7 +958,6 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, e->abort = !!(info & LBR_INFO_ABORT); e->cycles = get_lbr_cycles(info); e->type = get_lbr_br_type(info); - e->reserved = 0; } cpuc->lbr_stack.nr = i; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 150261d929b9..3b0324584da3 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -67,22 +67,23 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) /* * struct hw_perf_event.flags flags */ -#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */ -#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */ -#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */ -#define PERF_X86_EVENT_PEBS_LD_HSW 0x0008 /* haswell style datala, load */ -#define PERF_X86_EVENT_PEBS_NA_HSW 0x0010 /* haswell style datala, unknown */ -#define PERF_X86_EVENT_EXCL 0x0020 /* HT exclusivity on counter */ -#define PERF_X86_EVENT_DYNAMIC 0x0040 /* dynamic alloc'd constraint */ - -#define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */ -#define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */ -#define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */ -#define PERF_X86_EVENT_PEBS_VIA_PT 0x0800 /* use PT buffer for PEBS */ -#define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */ -#define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */ -#define PERF_X86_EVENT_TOPDOWN 0x4000 /* Count Topdown slots/metrics events */ -#define PERF_X86_EVENT_PEBS_STLAT 0x8000 /* st+stlat data address sampling */ +#define PERF_X86_EVENT_PEBS_LDLAT 0x00001 /* ld+ldlat data address sampling */ +#define PERF_X86_EVENT_PEBS_ST 0x00002 /* st data address sampling */ +#define PERF_X86_EVENT_PEBS_ST_HSW 0x00004 /* haswell style datala, store */ +#define PERF_X86_EVENT_PEBS_LD_HSW 0x00008 /* haswell style datala, load */ +#define PERF_X86_EVENT_PEBS_NA_HSW 0x00010 /* haswell style datala, unknown */ +#define PERF_X86_EVENT_EXCL 0x00020 /* HT exclusivity on counter */ +#define PERF_X86_EVENT_DYNAMIC 0x00040 /* dynamic alloc'd constraint */ + +#define PERF_X86_EVENT_EXCL_ACCT 0x00100 /* accounted EXCL event */ +#define PERF_X86_EVENT_AUTO_RELOAD 0x00200 /* use PEBS auto-reload */ +#define PERF_X86_EVENT_LARGE_PEBS 0x00400 /* use large PEBS */ +#define PERF_X86_EVENT_PEBS_VIA_PT 0x00800 /* use PT buffer for PEBS */ +#define PERF_X86_EVENT_PAIR 0x01000 /* Large Increment per Cycle */ +#define PERF_X86_EVENT_LBR_SELECT 0x02000 /* Save/Restore MSR_LBR_SELECT */ +#define PERF_X86_EVENT_TOPDOWN 0x04000 /* Count Topdown slots/metrics events */ +#define PERF_X86_EVENT_PEBS_STLAT 0x08000 /* st+stlat data address sampling */ +#define PERF_X86_EVENT_AMD_BRS 0x10000 /* AMD Branch Sampling */ static inline bool is_topdown_count(struct perf_event *event) { @@ -325,6 +326,8 @@ struct cpu_hw_events { * AMD specific bits */ struct amd_nb *amd_nb; + int brs_active; /* BRS is enabled */ + /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ u64 perf_ctr_virt_mask; int n_pair; /* Large increment events */ @@ -1105,6 +1108,11 @@ int x86_pmu_hw_config(struct perf_event *event); void x86_pmu_disable_all(void); +static inline bool has_amd_brs(struct hw_perf_event *hwc) +{ + return hwc->flags & PERF_X86_EVENT_AMD_BRS; +} + static inline bool is_counter_pair(struct hw_perf_event *hwc) { return hwc->flags & PERF_X86_EVENT_PAIR; @@ -1211,6 +1219,88 @@ static inline bool fixed_counter_disabled(int i, struct pmu *pmu) int amd_pmu_init(void); +#ifdef CONFIG_PERF_EVENTS_AMD_BRS +int amd_brs_init(void); +void amd_brs_disable(void); +void amd_brs_enable(void); +void amd_brs_enable_all(void); +void amd_brs_disable_all(void); +void amd_brs_drain(void); +void amd_brs_lopwr_init(void); +void amd_brs_disable_all(void); +int amd_brs_setup_filter(struct perf_event *event); +void amd_brs_reset(void); + +static inline void amd_pmu_brs_add(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + perf_sched_cb_inc(event->ctx->pmu); + cpuc->lbr_users++; + /* + * No need to reset BRS because it is reset + * on brs_enable() and it is saturating + */ +} + +static inline void amd_pmu_brs_del(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + cpuc->lbr_users--; + WARN_ON_ONCE(cpuc->lbr_users < 0); + + perf_sched_cb_dec(event->ctx->pmu); +} + +void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in); + +static inline s64 amd_brs_adjust_period(s64 period) +{ + if (period > x86_pmu.lbr_nr) + return period - x86_pmu.lbr_nr; + + return period; +} +#else +static inline int amd_brs_init(void) +{ + return 0; +} +static inline void amd_brs_disable(void) {} +static inline void amd_brs_enable(void) {} +static inline void amd_brs_drain(void) {} +static inline void amd_brs_lopwr_init(void) {} +static inline void amd_brs_disable_all(void) {} +static inline int amd_brs_setup_filter(struct perf_event *event) +{ + return 0; +} +static inline void amd_brs_reset(void) {} + +static inline void amd_pmu_brs_add(struct perf_event *event) +{ +} + +static inline void amd_pmu_brs_del(struct perf_event *event) +{ +} + +static inline void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ +} + +static inline s64 amd_brs_adjust_period(s64 period) +{ + return period; +} + +static inline void amd_brs_enable_all(void) +{ +} + +#endif + #else /* CONFIG_CPU_SUP_AMD */ static inline int amd_pmu_init(void) @@ -1218,6 +1308,27 @@ static inline int amd_pmu_init(void) return 0; } +static inline int amd_brs_init(void) +{ + return -EOPNOTSUPP; +} + +static inline void amd_brs_drain(void) +{ +} + +static inline void amd_brs_enable_all(void) +{ +} + +static inline void amd_brs_disable_all(void) +{ +} + +static inline s64 amd_brs_adjust_period(s64 period) +{ + return period; +} #endif /* CONFIG_CPU_SUP_AMD */ static inline int is_pebs_pt(struct perf_event *event) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 73e643ae94b6..b50e0872ad1e 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -211,7 +211,7 @@ #define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ -/* FREE! ( 7*32+20) */ +#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* AMD Performance Monitoring Version 2 */ #define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ #define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */ @@ -315,6 +315,7 @@ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ #define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */ +#define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ee15311b6be1..cf4ce6aeba67 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -524,6 +524,11 @@ #define AMD_CPPC_DES_PERF(x) (((x) & 0xff) << 16) #define AMD_CPPC_ENERGY_PERF_PREF(x) (((x) & 0xff) << 24) +/* AMD Performance Counter Global Status and Control MSRs */ +#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300 +#define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 +#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302 + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 @@ -688,6 +693,10 @@ #define MSR_IA32_PERF_CTL 0x00000199 #define INTEL_PERF_CTL_MASK 0xffff +/* AMD Branch Sampling configuration */ +#define MSR_AMD_DBG_EXTN_CFG 0xc000010f +#define MSR_AMD_SAMP_BR_FROM 0xc0010300 + #define MSR_IA32_MPERF 0x000000e7 #define MSR_IA32_APERF 0x000000e8 diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index b06e4c573add..7aa1d420c779 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_PERF_EVENT_H #define _ASM_X86_PERF_EVENT_H +#include <linux/static_call.h> + /* * Performance event hw details: */ @@ -184,6 +186,18 @@ union cpuid28_ecx { unsigned int full; }; +/* + * AMD "Extended Performance Monitoring and Debug" CPUID + * detection/enumeration details: + */ +union cpuid_0x80000022_ebx { + struct { + /* Number of Core Performance Counters */ + unsigned int num_core_pmc:4; + } split; + unsigned int full; +}; + struct x86_pmu_capability { int version; int num_counters_gp; @@ -371,6 +385,11 @@ struct pebs_xmm { }; /* + * AMD Extended Performance Monitoring and Debug cpuid feature detection + */ +#define EXT_PERFMON_DEBUG_FEATURES 0x80000022 + +/* * IBS cpuid feature detection */ @@ -518,6 +537,27 @@ static inline void intel_pt_handle_vmx(int on) #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) extern void amd_pmu_enable_virt(void); extern void amd_pmu_disable_virt(void); + +#if defined(CONFIG_PERF_EVENTS_AMD_BRS) + +#define PERF_NEEDS_LOPWR_CB 1 + +/* + * architectural low power callback impacts + * drivers/acpi/processor_idle.c + * drivers/acpi/acpi_pad.c + */ +extern void perf_amd_brs_lopwr_cb(bool lopwr_in); + +DECLARE_STATIC_CALL(perf_lopwr_cb, perf_amd_brs_lopwr_cb); + +static inline void perf_lopwr_cb(bool lopwr_in) +{ + static_call_mod(perf_lopwr_cb)(lopwr_in); +} + +#endif /* PERF_NEEDS_LOPWR_CB */ + #else static inline void amd_pmu_enable_virt(void) { } static inline void amd_pmu_disable_virt(void) { } diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 4143b1e4c5c6..dbaa8326d6f2 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -43,6 +43,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index b52407c56000..879ef8c72f5c 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -149,8 +149,10 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(offsetof(siginfo_t, si_perf_data) != 0x18); BUILD_BUG_ON(offsetof(siginfo_t, si_perf_type) != 0x20); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf_flags) != 0x24); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_data) != 0x10); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_type) != 0x14); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_flags) != 0x18); CHECK_CSI_OFFSET(_sigpoll); CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int)); diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c index f45979aa2d64..ec0e22a1e25d 100644 --- a/drivers/acpi/acpi_pad.c +++ b/drivers/acpi/acpi_pad.c @@ -17,6 +17,7 @@ #include <linux/tick.h> #include <linux/slab.h> #include <linux/acpi.h> +#include <linux/perf_event.h> #include <asm/mwait.h> #include <xen/xen.h> @@ -164,6 +165,9 @@ static int power_saving_thread(void *data) tsc_marked_unstable = 1; } local_irq_disable(); + + perf_lopwr_cb(true); + tick_broadcast_enable(); tick_broadcast_enter(); stop_critical_timings(); @@ -172,6 +176,9 @@ static int power_saving_thread(void *data) start_critical_timings(); tick_broadcast_exit(); + + perf_lopwr_cb(false); + local_irq_enable(); if (time_before(expire_time, jiffies)) { diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index eb95e188d62b..e9c84d0ac55b 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -21,6 +21,7 @@ #include <linux/cpuidle.h> #include <linux/cpu.h> #include <linux/minmax.h> +#include <linux/perf_event.h> #include <acpi/processor.h> /* @@ -544,6 +545,8 @@ static void wait_for_freeze(void) */ static void __cpuidle acpi_idle_do_entry(struct acpi_processor_cx *cx) { + perf_lopwr_cb(true); + if (cx->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cx); @@ -554,6 +557,8 @@ static void __cpuidle acpi_idle_do_entry(struct acpi_processor_cx *cx) inb(cx->address); wait_for_freeze(); } + + perf_lopwr_cb(false); } /** diff --git a/include/linux/compat.h b/include/linux/compat.h index 1c758b0e0359..01fddf72a81f 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -235,6 +235,7 @@ typedef struct compat_siginfo { struct { compat_ulong_t _data; u32 _type; + u32 _flags; } _perf; }; } _sigfault; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index af97dd427501..da759560eec5 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1063,6 +1063,22 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->txn = 0; } +/* + * Clear all bitfields in the perf_branch_entry. + * The to and from fields are not cleared because they are + * systematically modified by caller. + */ +static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br) +{ + br->mispred = 0; + br->predicted = 0; + br->in_tx = 0; + br->abort = 0; + br->cycles = 0; + br->type = 0; + br->reserved = 0; +} + extern void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, @@ -1660,4 +1676,10 @@ typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_entry *entries, unsigned int cnt); DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t); +#ifndef PERF_NEEDS_LOPWR_CB +static inline void perf_lopwr_cb(bool mode) +{ +} +#endif + #endif /* _LINUX_PERF_EVENT_H */ diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 3c8b34876744..bab7cc56b13a 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -320,7 +320,7 @@ int send_sig_mceerr(int code, void __user *, short, struct task_struct *); int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper); int force_sig_pkuerr(void __user *addr, u32 pkey); -int force_sig_perf(void __user *addr, u32 type, u64 sig_data); +int send_sig_perf(void __user *addr, u32 type, u64 sig_data); int force_sig_ptrace_errno_trap(int errno, void __user *addr); int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno); diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index 3ba180f550d7..ffbe4cec9f32 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -99,6 +99,7 @@ union __sifields { struct { unsigned long _data; __u32 _type; + __u32 _flags; } _perf; }; } _sigfault; @@ -164,6 +165,7 @@ typedef struct siginfo { #define si_pkey _sifields._sigfault._addr_pkey._pkey #define si_perf_data _sifields._sigfault._perf._data #define si_perf_type _sifields._sigfault._perf._type +#define si_perf_flags _sifields._sigfault._perf._flags #define si_band _sifields._sigpoll._band #define si_fd _sifields._sigpoll._fd #define si_call_addr _sifields._sigsys._call_addr @@ -271,6 +273,11 @@ typedef struct siginfo { */ /* + * Flags for si_perf_flags if SIGTRAP si_code is TRAP_PERF. + */ +#define TRAP_PERF_FLAG_ASYNC (1u << 0) + +/* * SIGCHLD si_codes */ #define CLD_EXITED 1 /* child has exited */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 7858bafffa9d..7699be46f3a1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6428,8 +6428,8 @@ static void perf_sigtrap(struct perf_event *event) if (current->flags & PF_EXITING) return; - force_sig_perf((void __user *)event->pending_addr, - event->attr.type, event->attr.sig_data); + send_sig_perf((void __user *)event->pending_addr, + event->attr.type, event->attr.sig_data); } static void perf_pending_event_disable(struct perf_event *event) diff --git a/kernel/signal.c b/kernel/signal.c index 30cd1ca43bcd..e43bc2a692f5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1805,7 +1805,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey) } #endif -int force_sig_perf(void __user *addr, u32 type, u64 sig_data) +int send_sig_perf(void __user *addr, u32 type, u64 sig_data) { struct kernel_siginfo info; @@ -1817,7 +1817,18 @@ int force_sig_perf(void __user *addr, u32 type, u64 sig_data) info.si_perf_data = sig_data; info.si_perf_type = type; - return force_sig_info(&info); + /* + * Signals generated by perf events should not terminate the whole + * process if SIGTRAP is blocked, however, delivering the signal + * asynchronously is better than not delivering at all. But tell user + * space if the signal was asynchronous, so it can clearly be + * distinguished from normal synchronous ones. + */ + info.si_perf_flags = sigismember(¤t->blocked, info.si_signo) ? + TRAP_PERF_FLAG_ASYNC : + 0; + + return send_sig_info(info.si_signo, &info, current); } /** @@ -3432,6 +3443,7 @@ void copy_siginfo_to_external32(struct compat_siginfo *to, to->si_addr = ptr_to_compat(from->si_addr); to->si_perf_data = from->si_perf_data; to->si_perf_type = from->si_perf_type; + to->si_perf_flags = from->si_perf_flags; break; case SIL_CHLD: to->si_pid = from->si_pid; @@ -3509,6 +3521,7 @@ static int post_copy_siginfo_from_user32(kernel_siginfo_t *to, to->si_addr = compat_ptr(from->si_addr); to->si_perf_data = from->si_perf_data; to->si_perf_type = from->si_perf_type; + to->si_perf_flags = from->si_perf_flags; break; case SIL_CHLD: to->si_pid = from->si_pid; @@ -4722,6 +4735,7 @@ static inline void siginfo_buildtime_checks(void) CHECK_OFFSET(si_pkey); CHECK_OFFSET(si_perf_data); CHECK_OFFSET(si_perf_type); + CHECK_OFFSET(si_perf_flags); /* sigpoll */ CHECK_OFFSET(si_band); |