From c243cecb58e3905baeace8827201c14df8481e2a Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 31 Jan 2022 09:24:49 +0200 Subject: perf/x86/intel/pt: Relax address filter validation The requirement for 64-bit address filters is that they are canonical addresses. In other respects any address range is allowed which would include user space addresses. That can be useful for tracing virtual machine guests because address filtering can be used to advantage in place of current privilege level (CPL) filtering. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220131072453.2839535-2-adrian.hunter@intel.com --- arch/x86/events/intel/pt.c | 63 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 13 deletions(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 7f406c14715f..0ebcf9a56f97 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -13,6 +13,8 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include +#include #include #include @@ -1347,11 +1349,37 @@ static void pt_addr_filters_fini(struct perf_event *event) event->hw.addr_filters = NULL; } -static inline bool valid_kernel_ip(unsigned long ip) +#ifdef CONFIG_X86_64 +static u64 canonical_address(u64 vaddr, u8 vaddr_bits) { - return virt_addr_valid(ip) && kernel_ip(ip); + return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); } +static u64 is_canonical_address(u64 vaddr, u8 vaddr_bits) +{ + return canonical_address(vaddr, vaddr_bits) == vaddr; +} + +/* Clamp to a canonical address greater-than-or-equal-to the address given */ +static u64 clamp_to_ge_canonical_addr(u64 vaddr, u8 vaddr_bits) +{ + return is_canonical_address(vaddr, vaddr_bits) ? + vaddr : + -BIT_ULL(vaddr_bits - 1); +} + +/* Clamp to a canonical address less-than-or-equal-to the address given */ +static u64 clamp_to_le_canonical_addr(u64 vaddr, u8 vaddr_bits) +{ + return is_canonical_address(vaddr, vaddr_bits) ? + vaddr : + BIT_ULL(vaddr_bits - 1) - 1; +} +#else +#define clamp_to_ge_canonical_addr(x, y) (x) +#define clamp_to_le_canonical_addr(x, y) (x) +#endif + static int pt_event_addr_filters_validate(struct list_head *filters) { struct perf_addr_filter *filter; @@ -1366,14 +1394,6 @@ static int pt_event_addr_filters_validate(struct list_head *filters) filter->action == PERF_ADDR_FILTER_ACTION_START) return -EOPNOTSUPP; - if (!filter->path.dentry) { - if (!valid_kernel_ip(filter->offset)) - return -EINVAL; - - if (!valid_kernel_ip(filter->offset + filter->size)) - return -EINVAL; - } - if (++range > intel_pt_validate_hw_cap(PT_CAP_num_address_ranges)) return -EOPNOTSUPP; } @@ -1397,9 +1417,26 @@ static void pt_event_addr_filters_sync(struct perf_event *event) if (filter->path.dentry && !fr[range].start) { msr_a = msr_b = 0; } else { - /* apply the offset */ - msr_a = fr[range].start; - msr_b = msr_a + fr[range].size - 1; + unsigned long n = fr[range].size - 1; + unsigned long a = fr[range].start; + unsigned long b; + + if (a > ULONG_MAX - n) + b = ULONG_MAX; + else + b = a + n; + /* + * Apply the offset. 64-bit addresses written to the + * MSRs must be canonical, but the range can encompass + * non-canonical addresses. Since software cannot + * execute at non-canonical addresses, adjusting to + * canonical addresses does not affect the result of the + * address filter. 
+ */ + msr_a = clamp_to_ge_canonical_addr(a, boot_cpu_data.x86_virt_bits); + msr_b = clamp_to_le_canonical_addr(b, boot_cpu_data.x86_virt_bits); + if (msr_b < msr_a) + msr_a = msr_b = 0; } filters->filter[range].msr_a = msr_a; -- cgit v1.2.3 From 1fb85d06ad6754796cd1b920639ca9d8840abefd Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 31 Jan 2022 09:24:50 +0200 Subject: x86: Share definition of __is_canonical_address() Reduce code duplication by moving canonical address code to a common header file. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220131072453.2839535-3-adrian.hunter@intel.com --- arch/x86/events/intel/pt.c | 14 ++------------ arch/x86/include/asm/page.h | 10 ++++++++++ arch/x86/kvm/emulate.c | 4 ++-- arch/x86/kvm/x86.c | 2 +- arch/x86/kvm/x86.h | 7 +------ arch/x86/mm/maccess.c | 7 +------ 6 files changed, 17 insertions(+), 27 deletions(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 0ebcf9a56f97..93676a5b66db 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -1350,20 +1350,10 @@ static void pt_addr_filters_fini(struct perf_event *event) } #ifdef CONFIG_X86_64 -static u64 canonical_address(u64 vaddr, u8 vaddr_bits) -{ - return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); -} - -static u64 is_canonical_address(u64 vaddr, u8 vaddr_bits) -{ - return canonical_address(vaddr, vaddr_bits) == vaddr; -} - /* Clamp to a canonical address greater-than-or-equal-to the address given */ static u64 clamp_to_ge_canonical_addr(u64 vaddr, u8 vaddr_bits) { - return is_canonical_address(vaddr, vaddr_bits) ? + return __is_canonical_address(vaddr, vaddr_bits) ? vaddr : -BIT_ULL(vaddr_bits - 1); } @@ -1371,7 +1361,7 @@ static u64 clamp_to_ge_canonical_addr(u64 vaddr, u8 vaddr_bits) /* Clamp to a canonical address less-than-or-equal-to the address given */ static u64 clamp_to_le_canonical_addr(u64 vaddr, u8 vaddr_bits) { - return is_canonical_address(vaddr, vaddr_bits) ? + return __is_canonical_address(vaddr, vaddr_bits) ? 
vaddr : BIT_ULL(vaddr_bits - 1) - 1; } diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 4d5810c8fab7..9cc82f305f4b 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -71,6 +71,16 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, extern bool __virt_addr_valid(unsigned long kaddr); #define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr)) +static __always_inline u64 __canonical_address(u64 vaddr, u8 vaddr_bits) +{ + return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); +} + +static __always_inline u64 __is_canonical_address(u64 vaddr, u8 vaddr_bits) +{ + return __canonical_address(vaddr, vaddr_bits) == vaddr; +} + #endif /* __ASSEMBLY__ */ #include diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5719d8cfdbd9..40da8c7f3019 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -665,7 +665,7 @@ static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt) static inline bool emul_is_noncanonical_address(u64 la, struct x86_emulate_ctxt *ctxt) { - return get_canonical(la, ctxt_virt_addr_bits(ctxt)) != la; + return !__is_canonical_address(la, ctxt_virt_addr_bits(ctxt)); } /* @@ -715,7 +715,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, case X86EMUL_MODE_PROT64: *linear = la; va_bits = ctxt_virt_addr_bits(ctxt); - if (get_canonical(la, va_bits) != la) + if (!__is_canonical_address(la, va_bits)) goto bad; *max_size = min_t(u64, ~0u, (1ull << va_bits) - la); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 74b53a16f38a..197209f456a6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1735,7 +1735,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, * value, and that something deterministic happens if the guest * invokes 64-bit SYSENTER. */ - data = get_canonical(data, vcpu_virt_addr_bits(vcpu)); + data = __canonical_address(data, vcpu_virt_addr_bits(vcpu)); break; case MSR_TSC_AUX: if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 635b75f9e145..fc4b68ab8d71 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -211,14 +211,9 @@ static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu) return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48; } -static inline u64 get_canonical(u64 la, u8 vaddr_bits) -{ - return ((int64_t)la << (64 - vaddr_bits)) >> (64 - vaddr_bits); -} - static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu) { - return get_canonical(la, vcpu_virt_addr_bits(vcpu)) != la; + return !__is_canonical_address(la, vcpu_virt_addr_bits(vcpu)); } static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c index 92ec176a7293..5a53c2cc169c 100644 --- a/arch/x86/mm/maccess.c +++ b/arch/x86/mm/maccess.c @@ -4,11 +4,6 @@ #include #ifdef CONFIG_X86_64 -static __always_inline u64 canonical_address(u64 vaddr, u8 vaddr_bits) -{ - return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); -} - bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) { unsigned long vaddr = (unsigned long)unsafe_src; @@ -19,7 +14,7 @@ bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) * we also need to include the userspace guard page. 
*/ return vaddr >= TASK_SIZE_MAX + PAGE_SIZE && - canonical_address(vaddr, boot_cpu_data.x86_virt_bits) == vaddr; + __is_canonical_address(vaddr, boot_cpu_data.x86_virt_bits); } #else bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) -- cgit v1.2.3 From d680ff24e9e14444c63945b43a37ede7cd6958f9 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 31 Jan 2022 09:24:51 +0200 Subject: perf/core: Fix address filter parser for multiple filters Reset appropriate variables in the parser loop between parsing separate filters, so that they do not interfere with parsing the next filter. Fixes: 375637bc524952 ("perf/core: Introduce address range filtering") Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220131072453.2839535-4-adrian.hunter@intel.com --- kernel/events/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index 76c754e45d01..2889b82fb75e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10558,8 +10558,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, } /* ready to consume more filters */ + kfree(filename); + filename = NULL; state = IF_STATE_ACTION; filter = NULL; + kernel = 0; } } -- cgit v1.2.3 From e5524bf1047eb3b3f3f33b5f59897ba67b3ade87 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 31 Jan 2022 09:24:52 +0200 Subject: perf/x86/intel/pt: Fix address filter config for 32-bit kernel Change from shifting 'unsigned long' to 'u64' to prevent the config bits being lost on a 32-bit kernel. Fixes: eadf48cab4b6b0 ("perf/x86/intel/pt: Add support for address range filtering in PT") Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220131072453.2839535-5-adrian.hunter@intel.com --- arch/x86/events/intel/pt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 93676a5b66db..f061dc2bd5f0 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -474,7 +474,7 @@ static u64 pt_config_filters(struct perf_event *event) pt->filters.filter[range].msr_b = filter->msr_b; } - rtit_ctl |= filter->config << pt_address_ranges[range].reg_off; + rtit_ctl |= (u64)filter->config << pt_address_ranges[range].reg_off; } return rtit_ctl; -- cgit v1.2.3 From 58b2ff2c18b1e1d7232b8007a5698ec4ee7a7a0d Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 31 Jan 2022 09:24:53 +0200 Subject: perf/core: Allow kernel address filter when not filtering the kernel The so-called 'kernel' address filter can also be useful for filtering fixed addresses in user space. Allow that. 
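As an illustration of what this enables, a user space tool can now attach a fixed-address filter to an event that excludes the kernel. A minimal sketch using the PERF_EVENT_IOC_SET_FILTER ioctl; the PMU type and the filter address/size below are arbitrary placeholders, and the filter string follows the addr/size "kernel filter" syntax (no object file) described in the perf documentation:

    #include <linux/perf_event.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <sys/types.h>
    #include <unistd.h>

    static int set_fixed_addr_filter(pid_t pid, __u32 pmu_type)
    {
        struct perf_event_attr attr;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = pmu_type;           /* e.g. the intel_pt type read from sysfs */
        attr.exclude_kernel = 1;        /* no longer causes the filter below to be rejected */

        fd = syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
        if (fd < 0)
            return -1;

        /* no object file given, so this is a "kernel" (fixed address) filter */
        if (ioctl(fd, PERF_EVENT_IOC_SET_FILTER, "filter 0x400000/0x1000") < 0) {
            close(fd);
            return -1;
        }
        return fd;
    }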
Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220131072453.2839535-6-adrian.hunter@intel.com --- kernel/events/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 2889b82fb75e..afbf388a5176 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10515,8 +10515,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, */ if (state == IF_STATE_END) { ret = -EINVAL; - if (kernel && event->attr.exclude_kernel) - goto fail; /* * ACTION "filter" must have a non-zero length region -- cgit v1.2.3 From 2145e77fecfb3965b1dc299bac203b167238bd0b Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 1 Feb 2022 13:23:21 -0800 Subject: perf/x86/intel: Enable PEBS format 5 The new PEBS Record Format 5 is similar to the PEBS Record Format 4. The only difference is the layout of the Counter Reset fields of the PEBS Config Buffer in the DS area. For the PEBS format 4, the Counter Reset fields allocation is for 8 general-purpose counters followed by 4 fixed-function counters. For the PEBS format 5, the Counter Reset fields allocation is for 32 general-purpose counters followed by 16 fixed-function counters. Extend the MAX_PEBS_EVENTS to 32. Add MAX_PEBS_EVENTS_FMT4 for the previous platform. Except for the DS auto-reload code, other places already assume 32 counters. Only check the PEBS_FMT in the DS auto-reload code. Extend the MAX_FIXED_PEBS_EVENTS to 16, which only impacts the size of struct debug_store and some local temporary variables. The size of struct debug_store increases 288B, which is small and should be acceptable. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1643750603-100733-1-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/ds.c | 14 +++++++++++--- arch/x86/include/asm/intel_ds.h | 5 +++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 2e215369df4a..376cc3d66094 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1203,7 +1203,10 @@ static void intel_pmu_pebs_via_pt_enable(struct perf_event *event) if (hwc->idx >= INTEL_PMC_IDX_FIXED) { base = MSR_RELOAD_FIXED_CTR0; idx = hwc->idx - INTEL_PMC_IDX_FIXED; - value = ds->pebs_event_reset[MAX_PEBS_EVENTS + idx]; + if (x86_pmu.intel_cap.pebs_format < 5) + value = ds->pebs_event_reset[MAX_PEBS_EVENTS_FMT4 + idx]; + else + value = ds->pebs_event_reset[MAX_PEBS_EVENTS + idx]; } wrmsrl(base + idx, value); } @@ -1232,8 +1235,12 @@ void intel_pmu_pebs_enable(struct perf_event *event) } } - if (idx >= INTEL_PMC_IDX_FIXED) - idx = MAX_PEBS_EVENTS + (idx - INTEL_PMC_IDX_FIXED); + if (idx >= INTEL_PMC_IDX_FIXED) { + if (x86_pmu.intel_cap.pebs_format < 5) + idx = MAX_PEBS_EVENTS_FMT4 + (idx - INTEL_PMC_IDX_FIXED); + else + idx = MAX_PEBS_EVENTS + (idx - INTEL_PMC_IDX_FIXED); + } /* * Use auto-reload if possible to save a MSR write in the PMI. 
@@ -2204,6 +2211,7 @@ void __init intel_ds_init(void) break; case 4: + case 5: x86_pmu.drain_pebs = intel_pmu_drain_pebs_icl; x86_pmu.pebs_record_size = sizeof(struct pebs_basic); if (x86_pmu.intel_cap.pebs_baseline) { diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h index 8380c3ddd4b2..2f9eeb5c3069 100644 --- a/arch/x86/include/asm/intel_ds.h +++ b/arch/x86/include/asm/intel_ds.h @@ -7,8 +7,9 @@ #define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) /* The maximal number of PEBS events: */ -#define MAX_PEBS_EVENTS 8 -#define MAX_FIXED_PEBS_EVENTS 4 +#define MAX_PEBS_EVENTS_FMT4 8 +#define MAX_PEBS_EVENTS 32 +#define MAX_FIXED_PEBS_EVENTS 16 /* * A debug store configuration. -- cgit v1.2.3 From 0144ba0c5bd3176647bb4d49a697d231610c78b7 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 1 Feb 2022 13:23:22 -0800 Subject: KVM: x86: use the KVM side max supported fixed counter KVM vPMU doesn't support emulating all the fixed counters that the host PMU driver supports, e.g. the fixed counter 3 used by Topdown metrics hasn't been supported by KVM so far. Rename MAX_FIXED_COUNTERS to KVM_PMC_MAX_FIXED for a more straightforward naming convention matching INTEL_PMC_MAX_FIXED used by the host PMU driver, and fix vPMU to use the KVM side KVM_PMC_MAX_FIXED for the virtual fixed counter emulation, instead of the host side INTEL_PMC_MAX_FIXED. Signed-off-by: Wei Wang Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1643750603-100733-2-git-send-email-kan.liang@linux.intel.com --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/cpuid.c | 3 ++- arch/x86/kvm/pmu.h | 2 -- arch/x86/kvm/vmx/pmu_intel.c | 4 ++-- arch/x86/kvm/x86.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6e7c545bc7ee..7d8f9e952a89 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -498,6 +498,7 @@ struct kvm_pmc { bool intr; }; +#define KVM_PMC_MAX_FIXED 3 struct kvm_pmu { unsigned nr_arch_gp_counters; unsigned nr_arch_fixed_counters; @@ -511,7 +512,7 @@ struct kvm_pmu { u64 reserved_bits; u8 version; struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; - struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED]; + struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED]; struct irq_work irq_work; DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX); DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 28be02adc669..b4ebf6216ad4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -875,7 +875,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) eax.split.bit_width = cap.bit_width_gp; eax.split.mask_length = cap.events_mask_len; - edx.split.num_counters_fixed = min(cap.num_counters_fixed, MAX_FIXED_COUNTERS); + edx.split.num_counters_fixed = + min(cap.num_counters_fixed, KVM_PMC_MAX_FIXED); edx.split.bit_width_fixed = cap.bit_width_fixed; if (cap.version) edx.split.anythread_deprecated = 1; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 7a7b8d5b775e..9e66fba1d6a3 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -15,8 +15,6 @@ #define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001 #define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002 -#define MAX_FIXED_COUNTERS 3 - struct kvm_event_hw_type_mapping { u8 eventsel; u8 unit_mask; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 466d18fc0c5d..9b26596099a1 100644 ---
a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -565,7 +565,7 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) pmu->gp_counters[i].current_config = 0; } - for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { + for (i = 0; i < KVM_PMC_MAX_FIXED; i++) { pmu->fixed_counters[i].type = KVM_PMC_FIXED; pmu->fixed_counters[i].vcpu = vcpu; pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; @@ -591,7 +591,7 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu) pmc->counter = pmc->eventsel = 0; } - for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { + for (i = 0; i < KVM_PMC_MAX_FIXED; i++) { pmc = &pmu->fixed_counters[i]; pmc_stop_counter(pmc); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 197209f456a6..d3fb2bc37771 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6514,7 +6514,7 @@ static void kvm_init_msr_list(void) u32 dummy[2]; unsigned i; - BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, + BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3, "Please update the fixed PMCs in msrs_to_saved_all[]"); perf_get_x86_pmu_capability(&x86_pmu); -- cgit v1.2.3 From ee28855a54493ce83bc2a3fbe30210be61b57bc7 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 1 Feb 2022 13:23:23 -0800 Subject: perf/x86/intel: Increase max number of the fixed counters The new PEBS format 5 implies that the number of the fixed counters can be up to 16. The current INTEL_PMC_MAX_FIXED is still 4. If the current kernel runs on a future platform which has more than 4 fixed counters, a warning will be triggered. The number of the fixed counters will be clipped to 4. Users have to upgrade the kernel to access the new fixed counters. Add a new default constraint for PerfMon v5 and up, which can support up to 16 fixed counters. The pseudo-encoding is applied for the fixed counters 4 and later. The user can have generic support for the new fixed counters on the future platforms without updating the kernel. Increase the INTEL_PMC_MAX_FIXED to 16.
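For orientation, the pseudo-encodings in the new constraint table in the diff that follows form a simple pattern: the constraint for fixed counter idx pairs event select 0x00 with unit mask idx + 1, e.g. 0x0500 for fixed counter 4 up to 0x1000 for fixed counter 15. Expressed as an illustrative helper (not part of the patch):

    /* pseudo-encoding applied for fixed counter 'idx' (4 and later) */
    static inline unsigned int fixed_counter_pseudo_enc(unsigned int idx)
    {
        return (idx + 1) << 8;  /* event select 0x00, unit mask idx + 1 */
    }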
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1643750603-100733-3-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 40 ++++++++++++++++++++++++++++++++++++++- arch/x86/include/asm/perf_event.h | 2 +- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index c91434056c29..88dcfb46c797 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -181,6 +181,27 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly = EVENT_CONSTRAINT_END }; +static struct event_constraint intel_v5_gen_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ + FIXED_EVENT_CONSTRAINT(0x0500, 4), + FIXED_EVENT_CONSTRAINT(0x0600, 5), + FIXED_EVENT_CONSTRAINT(0x0700, 6), + FIXED_EVENT_CONSTRAINT(0x0800, 7), + FIXED_EVENT_CONSTRAINT(0x0900, 8), + FIXED_EVENT_CONSTRAINT(0x0a00, 9), + FIXED_EVENT_CONSTRAINT(0x0b00, 10), + FIXED_EVENT_CONSTRAINT(0x0c00, 11), + FIXED_EVENT_CONSTRAINT(0x0d00, 12), + FIXED_EVENT_CONSTRAINT(0x0e00, 13), + FIXED_EVENT_CONSTRAINT(0x0f00, 14), + FIXED_EVENT_CONSTRAINT(0x1000, 15), + EVENT_CONSTRAINT_END +}; + static struct event_constraint intel_slm_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ @@ -6295,7 +6316,9 @@ __init int intel_pmu_init(void) pr_cont("generic architected perfmon v1, "); name = "generic_arch_v1"; break; - default: + case 2: + case 3: + case 4: /* * default constraints for v2 and up */ @@ -6303,6 +6326,21 @@ __init int intel_pmu_init(void) pr_cont("generic architected perfmon, "); name = "generic_arch_v2+"; break; + default: + /* + * The default constraints for v5 and up can support up to + * 16 fixed counters. For the fixed counters 4 and later, + * the pseudo-encoding is applied. + * The constraints may be cut according to the CPUID enumeration + * by inserting the EVENT_CONSTRAINT_END. + */ + if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) + x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; + intel_v5_gen_event_constraints[x86_pmu.num_counters_fixed].weight = -1; + x86_pmu.event_constraints = intel_v5_gen_event_constraints; + pr_cont("generic architected perfmon, "); + name = "generic_arch_v5+"; + break; } } diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 8fc1b5003713..58d9e4b1fa0a 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -7,7 +7,7 @@ */ #define INTEL_PMC_MAX_GENERIC 32 -#define INTEL_PMC_MAX_FIXED 4 +#define INTEL_PMC_MAX_FIXED 16 #define INTEL_PMC_IDX_FIXED 32 #define X86_PMC_IDX_MAX 64 -- cgit v1.2.3 From 28c24ded649cf068ca518f2a3d78f5e7e06d41d8 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 26 Jan 2022 12:48:14 +0200 Subject: perf/x86/intel/pt: Add a capability and config bit for event tracing As of Intel SDM (https://www.intel.com/sdm) version 076, there is a new Intel PT feature called Event Trace which is enabled by config bit 31. Event Trace exposes details about asynchronous events such as interrupts and VM-Entry/Exit. Add a capability and config bit for Event Trace.
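Per the capability table added below (PT_CAP(event_trace, 0, CPUID_EBX, BIT(7))), the feature is enumerated in CPUID leaf 0x14, sub-leaf 0, EBX bit 7, so user space can probe it directly. A minimal sketch using the compiler-provided cpuid.h helper:

    #include <cpuid.h>
    #include <stdbool.h>

    /* probe Intel PT Event Trace support: CPUID 0x14.0, EBX bit 7 */
    static bool pt_has_event_trace(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid_count(0x14, 0, &eax, &ebx, &ecx, &edx))
            return false;
        return ebx & (1u << 7);
    }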
Signed-off-by: Alexander Shishkin Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Adrian Hunter Link: https://lore.kernel.org/r/20220126104815.2807416-2-adrian.hunter@intel.com --- arch/x86/events/intel/pt.c | 8 ++++++++ arch/x86/include/asm/intel_pt.h | 1 + arch/x86/include/asm/msr-index.h | 1 + 3 files changed, 10 insertions(+) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index f061dc2bd5f0..f339c88d17f9 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -59,6 +59,7 @@ static struct pt_cap_desc { PT_CAP(mtc, 0, CPUID_EBX, BIT(3)), PT_CAP(ptwrite, 0, CPUID_EBX, BIT(4)), PT_CAP(power_event_trace, 0, CPUID_EBX, BIT(5)), + PT_CAP(event_trace, 0, CPUID_EBX, BIT(7)), PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)), PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)), PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)), @@ -110,6 +111,7 @@ PMU_FORMAT_ATTR(tsc, "config:10" ); PMU_FORMAT_ATTR(noretcomp, "config:11" ); PMU_FORMAT_ATTR(ptw, "config:12" ); PMU_FORMAT_ATTR(branch, "config:13" ); +PMU_FORMAT_ATTR(event, "config:31" ); PMU_FORMAT_ATTR(mtc_period, "config:14-17" ); PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" ); PMU_FORMAT_ATTR(psb_period, "config:24-27" ); @@ -118,6 +120,7 @@ static struct attribute *pt_formats_attr[] = { &format_attr_pt.attr, &format_attr_cyc.attr, &format_attr_pwr_evt.attr, + &format_attr_event.attr, &format_attr_fup_on_ptw.attr, &format_attr_mtc.attr, &format_attr_tsc.attr, @@ -298,6 +301,7 @@ fail: RTIT_CTL_CYC_PSB | \ RTIT_CTL_MTC | \ RTIT_CTL_PWR_EVT_EN | \ + RTIT_CTL_EVENT_EN | \ RTIT_CTL_FUP_ON_PTW | \ RTIT_CTL_PTW_EN) @@ -352,6 +356,10 @@ static bool pt_event_valid(struct perf_event *event) !intel_pt_validate_hw_cap(PT_CAP_power_event_trace)) return false; + if (config & RTIT_CTL_EVENT_EN && + !intel_pt_validate_hw_cap(PT_CAP_event_trace)) + return false; + if (config & RTIT_CTL_PTW) { if (!intel_pt_validate_hw_cap(PT_CAP_ptwrite)) return false; diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index ebe8d2ea44fe..d1ef9cb58847 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -13,6 +13,7 @@ enum pt_capabilities { PT_CAP_mtc, PT_CAP_ptwrite, PT_CAP_power_event_trace, + PT_CAP_event_trace, PT_CAP_topa_output, PT_CAP_topa_multiple_entries, PT_CAP_single_range_output, diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3faf0f97edb1..79b392d893e3 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -205,6 +205,7 @@ #define RTIT_CTL_DISRETC BIT(11) #define RTIT_CTL_PTW_EN BIT(12) #define RTIT_CTL_BRANCH_EN BIT(13) +#define RTIT_CTL_EVENT_EN BIT(31) #define RTIT_CTL_MTC_RANGE_OFFSET 14 #define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) #define RTIT_CTL_CYC_THRESH_OFFSET 19 -- cgit v1.2.3 From 161a9a33702a2e65a4118dacb449505ac8ce3122 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 26 Jan 2022 12:48:15 +0200 Subject: perf/x86/intel/pt: Add a capability and config bit for disabling TNTs As of Intel SDM (https://www.intel.com/sdm) version 076, there is a new Intel PT feature called TNT-Disable which is enabled by config bit 55. TNT-Disable disables Taken-Not-Taken packets to reduce the tracing overhead, but with the result that exact control flow information is lost. Add a capability and config bit for TNT-Disable.
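Both of the new controls are plain config bits at the positions given by the PMU_FORMAT_ATTR entries in these two patches (event at config:31, notnt at config:55); with the perf tool this corresponds to event terms such as intel_pt/event/ and intel_pt/notnt/. A tool setting perf_event_attr.config directly could use a sketch like this, with macro names of our own choosing:

    #include <linux/types.h>

    /* bit positions from the intel_pt format attributes; names are illustrative */
    #define PT_CONFIG_EVENT_EN  (1ULL << 31)    /* event trace */
    #define PT_CONFIG_NOTNT     (1ULL << 55)    /* disable TNT packets */

    static __u64 pt_config_bits(int event_trace, int no_tnt)
    {
        __u64 config = 0;

        if (event_trace)
            config |= PT_CONFIG_EVENT_EN;
        if (no_tnt)
            config |= PT_CONFIG_NOTNT;
        return config;
    }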
Signed-off-by: Alexander Shishkin Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Adrian Hunter Link: https://lore.kernel.org/r/20220126104815.2807416-3-adrian.hunter@intel.com --- arch/x86/events/intel/pt.c | 8 ++++++++ arch/x86/include/asm/intel_pt.h | 1 + arch/x86/include/asm/msr-index.h | 1 + 3 files changed, 10 insertions(+) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index f339c88d17f9..aa66c0c7b18b 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -60,6 +60,7 @@ static struct pt_cap_desc { PT_CAP(ptwrite, 0, CPUID_EBX, BIT(4)), PT_CAP(power_event_trace, 0, CPUID_EBX, BIT(5)), PT_CAP(event_trace, 0, CPUID_EBX, BIT(7)), + PT_CAP(tnt_disable, 0, CPUID_EBX, BIT(8)), PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)), PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)), PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)), @@ -112,6 +113,7 @@ PMU_FORMAT_ATTR(noretcomp, "config:11" ); PMU_FORMAT_ATTR(ptw, "config:12" ); PMU_FORMAT_ATTR(branch, "config:13" ); PMU_FORMAT_ATTR(event, "config:31" ); +PMU_FORMAT_ATTR(notnt, "config:55" ); PMU_FORMAT_ATTR(mtc_period, "config:14-17" ); PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" ); PMU_FORMAT_ATTR(psb_period, "config:24-27" ); @@ -121,6 +123,7 @@ static struct attribute *pt_formats_attr[] = { &format_attr_cyc.attr, &format_attr_pwr_evt.attr, &format_attr_event.attr, + &format_attr_notnt.attr, &format_attr_fup_on_ptw.attr, &format_attr_mtc.attr, &format_attr_tsc.attr, @@ -302,6 +305,7 @@ fail: RTIT_CTL_MTC | \ RTIT_CTL_PWR_EVT_EN | \ RTIT_CTL_EVENT_EN | \ + RTIT_CTL_NOTNT | \ RTIT_CTL_FUP_ON_PTW | \ RTIT_CTL_PTW_EN) @@ -360,6 +364,10 @@ static bool pt_event_valid(struct perf_event *event) !intel_pt_validate_hw_cap(PT_CAP_event_trace)) return false; + if (config & RTIT_CTL_NOTNT && + !intel_pt_validate_hw_cap(PT_CAP_tnt_disable)) + return false; + if (config & RTIT_CTL_PTW) { if (!intel_pt_validate_hw_cap(PT_CAP_ptwrite)) return false; diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index d1ef9cb58847..c796e9bc98b6 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -14,6 +14,7 @@ enum pt_capabilities { PT_CAP_ptwrite, PT_CAP_power_event_trace, PT_CAP_event_trace, + PT_CAP_tnt_disable, PT_CAP_topa_output, PT_CAP_topa_multiple_entries, PT_CAP_single_range_output, diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 79b392d893e3..efd34cfa1720 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -206,6 +206,7 @@ #define RTIT_CTL_PTW_EN BIT(12) #define RTIT_CTL_BRANCH_EN BIT(13) #define RTIT_CTL_EVENT_EN BIT(31) +#define RTIT_CTL_NOTNT BIT_ULL(55) #define RTIT_CTL_MTC_RANGE_OFFSET 14 #define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) #define RTIT_CTL_CYC_THRESH_OFFSET 19 -- cgit v1.2.3 From 71a412ed4c104bcc239b1a8e06f90b58a4aee0bb Mon Sep 17 00:00:00 2001 From: Steve Wahl Date: Fri, 18 Feb 2022 11:54:18 -0600 Subject: perf/x86/intel/uncore: Make uncore_discovery clean for 64 bit addresses Support 64-bit BAR size for discovery, and do not truncate return from generic_uncore_mmio_box_ctl() to 32 bits. 
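The BAR handling in the diff that follows uses the standard PCI scheme for 64-bit memory BARs. The same logic as a standalone sketch, with the constant values mirroring include/uapi/linux/pci_regs.h:

    #include <stdint.h>

    #define PCI_BASE_ADDRESS_MEM_TYPE_MASK  0x06
    #define PCI_BASE_ADDRESS_MEM_TYPE_64    0x04
    #define PCI_BASE_ADDRESS_MEM_MASK       (~0x0fULL)

    /* combine the low and (optional) high BAR dwords into a 64-bit address */
    static uint64_t decode_mem_bar(uint32_t lo, uint32_t hi)
    {
        uint64_t addr = lo & PCI_BASE_ADDRESS_MEM_MASK;

        if ((lo & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64)
            addr |= (uint64_t)hi << 32;
        return addr;
    }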
Signed-off-by: Steve Wahl Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20220218175418.421268-1-steve.wahl@hpe.com --- arch/x86/events/intel/uncore_discovery.c | 16 +++++++++++----- arch/x86/events/intel/uncore_discovery.h | 2 -- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index 6ddadb482f68..61185d16fba2 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -210,15 +210,21 @@ static int parse_discovery_table(struct pci_dev *dev, int die, void __iomem *io_addr; resource_size_t addr; unsigned long size; - u32 val; + u32 val, val2; int i; pci_read_config_dword(dev, bar_offset, &val); - if (val & UNCORE_DISCOVERY_MASK) + if (val & ~PCI_BASE_ADDRESS_MEM_MASK & ~PCI_BASE_ADDRESS_MEM_TYPE_64) return -EINVAL; - addr = (resource_size_t)(val & ~UNCORE_DISCOVERY_MASK); + addr = (resource_size_t)(val & PCI_BASE_ADDRESS_MEM_MASK); +#ifdef CONFIG_PHYS_ADDR_T_64BIT + if ((val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64) { + pci_read_config_dword(dev, bar_offset + 4, &val2); + addr |= ((resource_size_t)val2) << 32; + } +#endif size = UNCORE_DISCOVERY_GLOBAL_MAP_SIZE; io_addr = ioremap(addr, size); if (!io_addr) @@ -444,7 +450,7 @@ static struct intel_uncore_ops generic_uncore_pci_ops = { #define UNCORE_GENERIC_MMIO_SIZE 0x4000 -static unsigned int generic_uncore_mmio_box_ctl(struct intel_uncore_box *box) +static u64 generic_uncore_mmio_box_ctl(struct intel_uncore_box *box) { struct intel_uncore_type *type = box->pmu->type; @@ -456,7 +462,7 @@ static unsigned int generic_uncore_mmio_box_ctl(struct intel_uncore_box *box) void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box) { - unsigned int box_ctl = generic_uncore_mmio_box_ctl(box); + u64 box_ctl = generic_uncore_mmio_box_ctl(box); struct intel_uncore_type *type = box->pmu->type; resource_size_t addr; diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index cfaf558bdb6b..f4439357779a 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -18,8 +18,6 @@ #define UNCORE_DISCOVERY_BIR_BASE 0x10 /* Discovery table BAR step */ #define UNCORE_DISCOVERY_BIR_STEP 0x4 -/* Mask of the discovery table offset */ -#define UNCORE_DISCOVERY_MASK 0xf /* Global discovery table size */ #define UNCORE_DISCOVERY_GLOBAL_MAP_SIZE 0x20 -- cgit v1.2.3 From cedd3614e5d9c80908099c19f8716714ce0610b1 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 24 Feb 2022 11:06:54 +0530 Subject: perf: Add irq and exception return branch types This expands generic branch type classification by adding two more entries therein, i.e. irq and exception return. Also updates the x86 implementation to process X86_BR_IRET and X86_BR_IRQ records as appropriate. This changes branch types reported to user space on the x86 platform but it should not be a problem. The possible scenarios and impacts are enumerated here.
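On the tooling side, a consumer of PERF_SAMPLE_BRANCH_STACK data can now distinguish these records. An illustrative lookup against the new enum values, assuming a uapi perf_event.h that already contains the additions below:

    #include <linux/perf_event.h>

    /* name the two new branch types in a sample consumer */
    static const char *br_type_str(__u64 type)
    {
        switch (type) {
        case PERF_BR_ERET: return "eret";  /* e.g. iret on x86 */
        case PERF_BR_IRQ:  return "irq";
        default:           return "other";
        }
    }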
Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1645681014-3346-1-git-send-email-anshuman.khandual@arm.com --- arch/x86/events/intel/lbr.c | 4 ++-- include/uapi/linux/perf_event.h | 2 ++ tools/include/uapi/linux/perf_event.h | 2 ++ tools/perf/util/branch.c | 4 +++- 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 669c2be14784..fe1742c4ca49 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1329,10 +1329,10 @@ static int branch_map[X86_BR_TYPE_MAP_MAX] = { PERF_BR_SYSCALL, /* X86_BR_SYSCALL */ PERF_BR_SYSRET, /* X86_BR_SYSRET */ PERF_BR_UNKNOWN, /* X86_BR_INT */ - PERF_BR_UNKNOWN, /* X86_BR_IRET */ + PERF_BR_ERET, /* X86_BR_IRET */ PERF_BR_COND, /* X86_BR_JCC */ PERF_BR_UNCOND, /* X86_BR_JMP */ - PERF_BR_UNKNOWN, /* X86_BR_IRQ */ + PERF_BR_IRQ, /* X86_BR_IRQ */ PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ PERF_BR_UNKNOWN, /* X86_BR_ABORT */ PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1b65042ab1db..7dc71768749d 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -251,6 +251,8 @@ enum { PERF_BR_SYSRET = 8, /* syscall return */ PERF_BR_COND_CALL = 9, /* conditional function call */ PERF_BR_COND_RET = 10, /* conditional function return */ + PERF_BR_ERET = 11, /* exception return */ + PERF_BR_IRQ = 12, /* irq */ PERF_BR_MAX, }; diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 4cd39aaccbe7..d1324c427e8b 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -251,6 +251,8 @@ enum { PERF_BR_SYSRET = 8, /* syscall return */ PERF_BR_COND_CALL = 9, /* conditional function call */ PERF_BR_COND_RET = 10, /* conditional function return */ + PERF_BR_ERET = 11, /* exception return */ + PERF_BR_IRQ = 12, /* irq */ PERF_BR_MAX, }; diff --git a/tools/perf/util/branch.c b/tools/perf/util/branch.c index 2285b1eb3128..a9a909db8cc7 100644 --- a/tools/perf/util/branch.c +++ b/tools/perf/util/branch.c @@ -49,7 +49,9 @@ const char *branch_type_name(int type) "SYSCALL", "SYSRET", "COND_CALL", - "COND_RET" + "COND_RET", + "ERET", + "IRQ" }; if (type >= 0 && type < PERF_BR_MAX) -- cgit v1.2.3 From 02a08d78f5c429c7dc8e5b9417b4efb518b3d041 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 3 Mar 2022 08:57:08 +0100 Subject: perf/x86/intel/uncore: Fix the build on !CONFIG_PHYS_ADDR_T_64BIT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'val2' is unused if !CONFIG_PHYS_ADDR_T_64BIT: arch/x86/events/intel/uncore_discovery.c:213:18: error: unused variable ‘val2’ [-Werror=unused-variable] Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore_discovery.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index 61185d16fba2..5fd72d4b8bbb 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -210,7 +210,7 @@ static int parse_discovery_table(struct pci_dev *dev, int die, void __iomem *io_addr; resource_size_t addr; unsigned long size; - u32 val, val2; + u32 val; int i; pci_read_config_dword(dev, bar_offset, &val); @@ -221,6 +221,8 @@ static int parse_discovery_table(struct pci_dev *dev, int die, addr = (resource_size_t)(val & PCI_BASE_ADDRESS_MEM_MASK); #ifdef CONFIG_PHYS_ADDR_T_64BIT if ((val & 
PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64) { + u32 val2; + pci_read_config_dword(dev, bar_offset + 4, &val2); addr |= ((resource_size_t)val2) << 32; } -- cgit v1.2.3
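The underlying pattern is the usual cure for config-dependent unused variables: declare the variable in the narrowest scope that actually uses it, so every configuration builds warning-free. In sketch form, with a made-up config switch and helper:

    #include <stdint.h>

    #define CONFIG_WIDE_ADDR 1  /* hypothetical config option */

    static uint32_t read_high_dword(void) { return 0; }  /* hypothetical helper */

    uint64_t parse(uint32_t lo)
    {
        uint64_t addr = lo;
    #if CONFIG_WIDE_ADDR
        uint32_t hi;  /* declared only in the scope that uses it */

        hi = read_high_dword();
        addr |= (uint64_t)hi << 32;
    #endif
        return addr;
    }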