diff options
Diffstat (limited to 'arch/x86/kernel')
114 files changed, 1797 insertions, 27030 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b1b78ffe01d0..616ebd22ef9a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -16,9 +16,21 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg endif -KASAN_SANITIZE_head$(BITS).o := n -KASAN_SANITIZE_dumpstack.o := n -KASAN_SANITIZE_dumpstack_$(BITS).o := n +KASAN_SANITIZE_head$(BITS).o := n +KASAN_SANITIZE_dumpstack.o := n +KASAN_SANITIZE_dumpstack_$(BITS).o := n +KASAN_SANITIZE_stacktrace.o := n + +OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y +OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y +OBJECT_FILES_NON_STANDARD_mcount_$(BITS).o := y +OBJECT_FILES_NON_STANDARD_test_nx.o := y + +# If instrumentation of this dir is enabled, boot hangs during first second. +# Probably could be more selective here, but note that files related to irqs, +# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to +# non-deterministic coverage. +KCOV_INSTRUMENT := n CFLAGS_irq.o := -I$(src)/../include/asm/trace diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index e75907601a41..8c2f1ef6ca23 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -956,7 +956,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) /* * Note that the LAPIC address is obtained from the MADT (32-bit value) - * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value). + * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value). */ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, @@ -984,7 +984,7 @@ static int __init acpi_parse_madt_lapic_entries(void) /* * Note that the LAPIC address is obtained from the MADT (32-bit value) - * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value). + * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value). */ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index d1daead5fcdd..adb3eaf8fe2a 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -16,6 +16,7 @@ #include <asm/cacheflush.h> #include <asm/realmode.h> +#include <linux/ftrace.h> #include "../../realmode/rm/wakeup.h" #include "sleep.h" @@ -107,7 +108,13 @@ int x86_acpi_suspend_lowlevel(void) saved_magic = 0x123456789abcdef0L; #endif /* CONFIG_64BIT */ + /* + * Pause/unpause graph tracing around do_suspend_lowlevel as it has + * inconsistent call/return info after it jumps to the wakeup vector. + */ + pause_graph_tracing(); do_suspend_lowlevel(); + unpause_graph_tracing(); return 0; } diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 8c35df468104..169963f471bb 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -5,6 +5,7 @@ #include <asm/page_types.h> #include <asm/msr.h> #include <asm/asm-offsets.h> +#include <asm/frame.h> # Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2 @@ -39,6 +40,7 @@ bogus_64_magic: jmp bogus_64_magic ENTRY(do_suspend_lowlevel) + FRAME_BEGIN subq $8, %rsp xorl %eax, %eax call save_processor_state @@ -109,6 +111,7 @@ ENTRY(do_suspend_lowlevel) xorl %eax, %eax addq $8, %rsp + FRAME_END jmp restore_processor_state ENDPROC(do_suspend_lowlevel) diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 222a57076039..cefacbad1531 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -221,7 +221,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n, unsigned long cpu = (unsigned long)hcpu; struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu); - switch (action & 0xf) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_DEAD: dw_apb_clockevent_pause(adev->timer); if (system_state == SYSTEM_RUNNING) { diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 6e85f713641d..0a2bb1f62e72 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -227,19 +227,11 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp) return 0; } -static int gart_fix_e820 __initdata = 1; +static bool gart_fix_e820 __initdata = true; static int __init parse_gart_mem(char *p) { - if (!p) - return -EINVAL; - - if (!strncmp(p, "off", 3)) - gart_fix_e820 = 0; - else if (!strncmp(p, "on", 2)) - gart_fix_e820 = 1; - - return 0; + return kstrtobool(p, &gart_fix_e820); } early_param("gart_fix_e820", parse_gart_mem); diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 8bb12ddc5db8..8e63ebdcbd0b 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -2,6 +2,10 @@ # Makefile for local APIC drivers and for the IO-APIC code # +# Leads to non-deterministic coverage that is not a function of syscall inputs. +# In particualr, smp_apic_timer_interrupt() is called in random places. +KCOV_INSTRUMENT := n + obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o vector.o obj-y += hw_nmi.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 8a5cddac7d44..d356987a04e9 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1611,7 +1611,7 @@ void __init enable_IR_x2apic(void) legacy_pic->mask_all(); mask_ioapic_entries(); - /* If irq_remapping_prepare() succeded, try to enable it */ + /* If irq_remapping_prepare() succeeded, try to enable it */ if (ir_stat >= 0) ir_stat = try_to_enable_IR(); /* ir_stat contains the remap mode or an error code */ @@ -2078,6 +2078,20 @@ int generic_processor_info(int apicid, int version) cpu = cpumask_next_zero(-1, cpu_present_mask); /* + * This can happen on physical hotplug. The sanity check at boot time + * is done from native_smp_prepare_cpus() after num_possible_cpus() is + * established. + */ + if (topology_update_package_map(apicid, cpu) < 0) { + int thiscpu = max + disabled_cpus; + + pr_warning("ACPI: Package limit reached. Processor %d/0x%x ignored.\n", + thiscpu, apicid); + disabled_cpus++; + return -ENOSPC; + } + + /* * Validate version */ if (version == 0x0) { diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 9968f30cca3e..76f89e2b245a 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -53,7 +53,7 @@ void flat_init_apic_ldr(void) apic_write(APIC_LDR, val); } -static inline void _flat_send_IPI_mask(unsigned long mask, int vector) +static void _flat_send_IPI_mask(unsigned long mask, int vector) { unsigned long flags; diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index c80c02c6ec49..ab5c2c685a3c 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -30,7 +30,7 @@ static unsigned int numachip1_get_apic_id(unsigned long x) unsigned long value; unsigned int id = (x >> 24) & 0xff; - if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) { + if (static_cpu_has(X86_FEATURE_NODEID_MSR)) { rdmsrl(MSR_FAM10H_NODE_ID, value); id |= (value << 2) & 0xff00; } @@ -178,7 +178,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) this_cpu_write(cpu_llc_id, node); /* Account for nodes per socket in multi-core-module processors */ - if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) { + if (static_cpu_has(X86_FEATURE_NODEID_MSR)) { rdmsrl(MSR_FAM10H_NODE_ID, val); nodes = ((val >> 3) & 7) + 1; } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index eb45fc9b6124..28bde88b0085 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -18,6 +18,66 @@ #include <asm/proto.h> #include <asm/ipi.h> +void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest) +{ + /* + * Subtle. In the case of the 'never do double writes' workaround + * we have to lock out interrupts to be safe. As we don't care + * of the value read we use an atomic rmw access to avoid costly + * cli/sti. Otherwise we use an even cheaper single atomic write + * to the APIC. + */ + unsigned int cfg; + + /* + * Wait for idle. + */ + __xapic_wait_icr_idle(); + + /* + * No need to touch the target chip field + */ + cfg = __prepare_ICR(shortcut, vector, dest); + + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + native_apic_mem_write(APIC_ICR, cfg); +} + +/* + * This is used to send an IPI with no shorthand notation (the destination is + * specified in bits 56 to 63 of the ICR). + */ +void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest) +{ + unsigned long cfg; + + /* + * Wait for idle. + */ + if (unlikely(vector == NMI_VECTOR)) + safe_apic_wait_icr_idle(); + else + __xapic_wait_icr_idle(); + + /* + * prepare target chip field + */ + cfg = __prepare_ICR2(mask); + native_apic_mem_write(APIC_ICR2, cfg); + + /* + * program the ICR + */ + cfg = __prepare_ICR(0, vector, dest); + + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + native_apic_mem_write(APIC_ICR, cfg); +} + void default_send_IPI_single_phys(int cpu, int vector) { unsigned long flags; diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 3b670df4ba7b..ad59d70bcb1a 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -213,6 +213,7 @@ update: */ cpumask_and(d->old_domain, d->old_domain, cpu_online_mask); d->move_in_progress = !cpumask_empty(d->old_domain); + d->cfg.old_vector = d->move_in_progress ? d->cfg.vector : 0; d->cfg.vector = vector; cpumask_copy(d->domain, vector_cpumask); success: @@ -655,46 +656,97 @@ void irq_complete_move(struct irq_cfg *cfg) } /* - * Called with @desc->lock held and interrupts disabled. + * Called from fixup_irqs() with @desc->lock held and interrupts disabled. */ void irq_force_complete_move(struct irq_desc *desc) { struct irq_data *irqdata = irq_desc_get_irq_data(desc); struct apic_chip_data *data = apic_chip_data(irqdata); struct irq_cfg *cfg = data ? &data->cfg : NULL; + unsigned int cpu; if (!cfg) return; - __irq_complete_move(cfg, cfg->vector); - /* * This is tricky. If the cleanup of @data->old_domain has not been * done yet, then the following setaffinity call will fail with * -EBUSY. This can leave the interrupt in a stale state. * - * The cleanup cannot make progress because we hold @desc->lock. So in - * case @data->old_domain is not yet cleaned up, we need to drop the - * lock and acquire it again. @desc cannot go away, because the - * hotplug code holds the sparse irq lock. + * All CPUs are stuck in stop machine with interrupts disabled so + * calling __irq_complete_move() would be completely pointless. */ raw_spin_lock(&vector_lock); - /* Clean out all offline cpus (including ourself) first. */ + /* + * Clean out all offline cpus (including the outgoing one) from the + * old_domain mask. + */ cpumask_and(data->old_domain, data->old_domain, cpu_online_mask); - while (!cpumask_empty(data->old_domain)) { + + /* + * If move_in_progress is cleared and the old_domain mask is empty, + * then there is nothing to cleanup. fixup_irqs() will take care of + * the stale vectors on the outgoing cpu. + */ + if (!data->move_in_progress && cpumask_empty(data->old_domain)) { raw_spin_unlock(&vector_lock); - raw_spin_unlock(&desc->lock); - cpu_relax(); - raw_spin_lock(&desc->lock); + return; + } + + /* + * 1) The interrupt is in move_in_progress state. That means that we + * have not seen an interrupt since the io_apic was reprogrammed to + * the new vector. + * + * 2) The interrupt has fired on the new vector, but the cleanup IPIs + * have not been processed yet. + */ + if (data->move_in_progress) { /* - * Reevaluate apic_chip_data. It might have been cleared after - * we dropped @desc->lock. + * In theory there is a race: + * + * set_ioapic(new_vector) <-- Interrupt is raised before update + * is effective, i.e. it's raised on + * the old vector. + * + * So if the target cpu cannot handle that interrupt before + * the old vector is cleaned up, we get a spurious interrupt + * and in the worst case the ioapic irq line becomes stale. + * + * But in case of cpu hotplug this should be a non issue + * because if the affinity update happens right before all + * cpus rendevouz in stop machine, there is no way that the + * interrupt can be blocked on the target cpu because all cpus + * loops first with interrupts enabled in stop machine, so the + * old vector is not yet cleaned up when the interrupt fires. + * + * So the only way to run into this issue is if the delivery + * of the interrupt on the apic/system bus would be delayed + * beyond the point where the target cpu disables interrupts + * in stop machine. I doubt that it can happen, but at least + * there is a theroretical chance. Virtualization might be + * able to expose this, but AFAICT the IOAPIC emulation is not + * as stupid as the real hardware. + * + * Anyway, there is nothing we can do about that at this point + * w/o refactoring the whole fixup_irq() business completely. + * We print at least the irq number and the old vector number, + * so we have the necessary information when a problem in that + * area arises. */ - data = apic_chip_data(irqdata); - if (!data) - return; - raw_spin_lock(&vector_lock); + pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n", + irqdata->irq, cfg->old_vector); } + /* + * If old_domain is not empty, then other cpus still have the irq + * descriptor set in their vector array. Clean it up. + */ + for_each_cpu(cpu, data->old_domain) + per_cpu(vector_irq, cpu)[cfg->old_vector] = VECTOR_UNUSED; + + /* Cleanup the left overs of the (half finished) move */ + cpumask_clear(data->old_domain); + data->move_in_progress = 0; raw_spin_unlock(&vector_lock); } #endif diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 624db00583f4..8f4942e2bcbb 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -792,7 +792,8 @@ static int uv_scir_cpu_notify(struct notifier_block *self, unsigned long action, { long cpu = (long)hcpu; - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_FAILED: case CPU_ONLINE: uv_heartbeat_enable(cpu); break; @@ -860,7 +861,7 @@ int uv_set_vga_state(struct pci_dev *pdev, bool decode, */ void uv_cpu_init(void) { - /* CPU 0 initilization will be done via uv_system_init. */ + /* CPU 0 initialization will be done via uv_system_init. */ if (!uv_blade_info) return; diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 052c9c3026cc..9307f182fe30 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1088,7 +1088,7 @@ static int apm_get_battery_status(u_short which, u_short *status, * @device: identity of device * @enable: on/off * - * Activate or deactive power management on either a specific device + * Activate or deactivate power management on either a specific device * or the entire system (%APM_DEVICE_ALL). */ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 84a7524b202c..5c042466f274 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -59,7 +59,6 @@ void common(void) { #ifdef CONFIG_PARAVIRT BLANK(); - OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 6ce39025f467..ecdc1d217dc0 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -7,7 +7,7 @@ #include <linux/lguest.h> #include "../../../drivers/lguest/lg.h" -#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include <asm/syscalls_32.h> }; @@ -52,6 +52,11 @@ void foo(void) DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - offsetofend(struct tss_struct, SYSENTER_stack)); + /* Offset from cpu_tss to SYSENTER_stack */ + OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); + /* Size of SYSENTER_stack */ + DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); + #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index f2edafb5f24e..d875f97d4e0b 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -4,17 +4,11 @@ #include <asm/ia32.h> -#define __SYSCALL_64(nr, sym, compat) [nr] = 1, -#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1, -#ifdef CONFIG_X86_X32_ABI -# define __SYSCALL_X32(nr, sym, compat) [nr] = 1, -#else -# define __SYSCALL_X32(nr, sym, compat) /* nothing */ -#endif +#define __SYSCALL_64(nr, sym, qual) [nr] = 1, static char syscalls_64[] = { #include <asm/syscalls_64.h> }; -#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls_ia32[] = { #include <asm/syscalls_32.h> }; diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 58031303e304..4a8697f7d4ef 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -8,6 +8,10 @@ CFLAGS_REMOVE_common.o = -pg CFLAGS_REMOVE_perf_event.o = -pg endif +# If these files are instrumented, boot hangs during the first second. +KCOV_INSTRUMENT_common.o := n +KCOV_INSTRUMENT_perf_event.o := n + # Make sure load_percpu_segment has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_common.o := $(nostackp) @@ -30,33 +34,11 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_PERF_EVENTS) += perf_event.o - -ifdef CONFIG_PERF_EVENTS -obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o perf_event_amd_uncore.o -ifdef CONFIG_AMD_IOMMU -obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o -endif -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o perf_event_intel_bts.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_cstate.o - -obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ - perf_event_intel_uncore_snb.o \ - perf_event_intel_uncore_snbep.o \ - perf_event_intel_uncore_nhmex.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o -obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o -endif - - obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_MICROCODE) += microcode/ -obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o +obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o @@ -64,7 +46,7 @@ ifdef CONFIG_X86_FEATURE_NAMES quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@ -cpufeature = $(src)/../../include/asm/cpufeature.h +cpufeature = $(src)/../../include/asm/cpufeatures.h targets += capflags.c $(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a07956a08936..6e47e3a916f1 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -75,14 +75,17 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) */ extern __visible void vide(void); -__asm__(".globl vide\n\t.align 4\nvide: ret"); +__asm__(".globl vide\n" + ".type vide, @function\n" + ".align 4\n" + "vide: ret\n"); static void init_amd_k5(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_32 /* * General Systems BIOSen alias the cpu frequency registers - * of the Elan at 0x000df000. Unfortuantly, one of the Linux + * of the Elan at 0x000df000. Unfortunately, one of the Linux * drivers subsequently pokes it, and changes the CPU speed. * Workaround : Remove the unneeded alias. */ @@ -117,7 +120,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) void (*f_vide)(void); u64 d, d2; - printk(KERN_INFO "AMD K6 stepping B detected - "); + pr_info("AMD K6 stepping B detected - "); /* * It looks like AMD fixed the 2.6.2 bug and improved indirect @@ -133,10 +136,9 @@ static void init_amd_k6(struct cpuinfo_x86 *c) d = d2-d; if (d > 20*K6_BUG_LOOP) - printk(KERN_CONT - "system stability may be impaired when more than 32 MB are used.\n"); + pr_cont("system stability may be impaired when more than 32 MB are used.\n"); else - printk(KERN_CONT "probably OK (after B9730xxxx).\n"); + pr_cont("probably OK (after B9730xxxx).\n"); } /* K6 with old style WHCR */ @@ -154,7 +156,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) wbinvd(); wrmsr(MSR_K6_WHCR, l, h); local_irq_restore(flags); - printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n", + pr_info("Enabling old style K6 write allocation for %d Mb\n", mbytes); } return; @@ -175,7 +177,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) wbinvd(); wrmsr(MSR_K6_WHCR, l, h); local_irq_restore(flags); - printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n", + pr_info("Enabling new style K6 write allocation for %d Mb\n", mbytes); } @@ -202,7 +204,7 @@ static void init_amd_k7(struct cpuinfo_x86 *c) */ if (c->x86_model >= 6 && c->x86_model <= 10) { if (!cpu_has(c, X86_FEATURE_XMM)) { - printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); + pr_info("Enabling disabled K7/SSE Support.\n"); msr_clear_bit(MSR_K7_HWCR, 15); set_cpu_cap(c, X86_FEATURE_XMM); } @@ -216,9 +218,8 @@ static void init_amd_k7(struct cpuinfo_x86 *c) if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { rdmsr(MSR_K7_CLK_CTL, l, h); if ((l & 0xfff00000) != 0x20000000) { - printk(KERN_INFO - "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", - l, ((l & 0x000fffff)|0x20000000)); + pr_info("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", + l, ((l & 0x000fffff)|0x20000000)); wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); } } @@ -308,7 +309,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c) u32 eax, ebx, ecx, edx; cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - nodes_per_socket = ((ecx >> 8) & 7) + 1; node_id = ecx & 7; /* get compute unit information */ @@ -319,7 +319,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c) u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - nodes_per_socket = ((value >> 3) & 7) + 1; node_id = value & 7; } else return; @@ -485,7 +484,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { unsigned long pfn = tseg >> PAGE_SHIFT; - printk(KERN_DEBUG "tseg: %010llx\n", tseg); + pr_debug("tseg: %010llx\n", tseg); if (pfn_range_is_mapped(pfn, pfn + 1)) set_memory_4k((unsigned long)__va(tseg), 1); } @@ -500,8 +499,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) rdmsrl(MSR_K7_HWCR, val); if (!(val & BIT(24))) - printk(KERN_WARNING FW_BUG "TSC doesn't count " - "with P0 frequency!\n"); + pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n"); } } @@ -522,6 +520,18 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_MWAITX)) use_mwaitx_delay(); + + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + u32 ecx; + + ecx = cpuid_ecx(0x8000001e); + nodes_per_socket = ((ecx >> 8) & 7) + 1; + } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { + u64 value; + + rdmsrl(MSR_FAM10H_NODE_ID, value); + nodes_per_socket = ((value >> 3) & 7) + 1; + } } static void early_init_amd(struct cpuinfo_x86 *c) @@ -539,6 +549,10 @@ static void early_init_amd(struct cpuinfo_x86 *c) set_sched_clock_stable(); } + /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */ + if (c->x86_power & BIT(12)) + set_cpu_cap(c, X86_FEATURE_ACC_POWER); + #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSCALL32); #else diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c index 04f0fe5af83e..a972ac4c7e7d 100644 --- a/arch/x86/kernel/cpu/bugs_64.c +++ b/arch/x86/kernel/cpu/bugs_64.c @@ -15,7 +15,7 @@ void __init check_bugs(void) { identify_boot_cpu(); #if !defined(CONFIG_SMP) - printk(KERN_INFO "CPU: "); + pr_info("CPU: "); print_cpu_info(&boot_cpu_data); #endif alternative_instructions(); diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index ae20be6e483c..1661d8ec9280 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -1,7 +1,7 @@ #include <linux/bitops.h> #include <linux/kernel.h> -#include <asm/processor.h> +#include <asm/cpufeature.h> #include <asm/e820.h> #include <asm/mtrr.h> #include <asm/msr.h> @@ -29,7 +29,7 @@ static void init_c3(struct cpuinfo_x86 *c) rdmsr(MSR_VIA_FCR, lo, hi); lo |= ACE_FCR; /* enable ACE unit */ wrmsr(MSR_VIA_FCR, lo, hi); - printk(KERN_INFO "CPU: Enabled ACE h/w crypto\n"); + pr_info("CPU: Enabled ACE h/w crypto\n"); } /* enable RNG unit, if present and disabled */ @@ -37,7 +37,7 @@ static void init_c3(struct cpuinfo_x86 *c) rdmsr(MSR_VIA_RNG, lo, hi); lo |= RNG_ENABLE; /* enable RNG unit */ wrmsr(MSR_VIA_RNG, lo, hi); - printk(KERN_INFO "CPU: Enabled h/w RNG\n"); + pr_info("CPU: Enabled h/w RNG\n"); } /* store Centaur Extended Feature Flags as @@ -130,7 +130,7 @@ static void init_centaur(struct cpuinfo_x86 *c) name = "C6"; fcr_set = ECX8|DSMC|EDCTLB|EMMX|ERETSTK; fcr_clr = DPDC; - printk(KERN_NOTICE "Disabling bugged TSC.\n"); + pr_notice("Disabling bugged TSC.\n"); clear_cpu_cap(c, X86_FEATURE_TSC); break; case 8: @@ -163,11 +163,11 @@ static void init_centaur(struct cpuinfo_x86 *c) newlo = (lo|fcr_set) & (~fcr_clr); if (newlo != lo) { - printk(KERN_INFO "Centaur FCR was 0x%X now 0x%X\n", + pr_info("Centaur FCR was 0x%X now 0x%X\n", lo, newlo); wrmsr(MSR_IDT_FCR1, newlo, hi); } else { - printk(KERN_INFO "Centaur FCR is 0x%X\n", lo); + pr_info("Centaur FCR is 0x%X\n", lo); } /* Emulate MTRRs using Centaur's MCR. */ set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 37830de8f60a..8394b3d1f94f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -162,6 +162,22 @@ static int __init x86_mpx_setup(char *s) } __setup("nompx", x86_mpx_setup); +static int __init x86_noinvpcid_setup(char *s) +{ + /* noinvpcid doesn't accept parameters */ + if (s) + return -EINVAL; + + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_INVPCID)) + return 0; + + setup_clear_cpu_cap(X86_FEATURE_INVPCID); + pr_info("noinvpcid: INVPCID feature disabled\n"); + return 0; +} +early_param("noinvpcid", x86_noinvpcid_setup); + #ifdef CONFIG_X86_32 static int cachesize_override = -1; static int disable_x86_serial_nr = 1; @@ -228,7 +244,7 @@ static void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) lo |= 0x200000; wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); - printk(KERN_NOTICE "CPU serial number disabled.\n"); + pr_notice("CPU serial number disabled.\n"); clear_cpu_cap(c, X86_FEATURE_PN); /* Disabling the serial number may affect the cpuid level */ @@ -288,6 +304,48 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) } /* + * Protection Keys are not available in 32-bit mode. + */ +static bool pku_disabled; + +static __always_inline void setup_pku(struct cpuinfo_x86 *c) +{ + if (!cpu_has(c, X86_FEATURE_PKU)) + return; + if (pku_disabled) + return; + + cr4_set_bits(X86_CR4_PKE); + /* + * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE + * cpuid bit to be set. We need to ensure that we + * update that bit in this CPU's "cpu_info". + */ + get_cpu_cap(c); +} + +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +static __init int setup_disable_pku(char *arg) +{ + /* + * Do not clear the X86_FEATURE_PKU bit. All of the + * runtime checks are against OSPKE so clearing the + * bit does nothing. + * + * This way, we will see "pku" in cpuinfo, but not + * "ospke", which is exactly what we want. It shows + * that the CPU has PKU, but the OS has not enabled it. + * This happens to be exactly how a system would look + * if we disabled the config option. + */ + pr_info("x86: 'nopku' specified, disabling Memory Protection Keys\n"); + pku_disabled = true; + return 1; +} +__setup("nopku", setup_disable_pku); +#endif /* CONFIG_X86_64 */ + +/* * Some CPU features depend on higher CPUID levels, which may not always * be available due to CPUID level capping or broken virtualization * software. Add those features to this table to auto-disable them. @@ -329,9 +387,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) if (!warn) continue; - printk(KERN_WARNING - "CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n", - x86_cap_flag(df->feature), df->level); + pr_warn("CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n", + x86_cap_flag(df->feature), df->level); } } @@ -510,7 +567,7 @@ void detect_ht(struct cpuinfo_x86 *c) smp_num_siblings = (ebx & 0xff0000) >> 16; if (smp_num_siblings == 1) { - printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n"); + pr_info_once("CPU0: Hyper-Threading is disabled\n"); goto out; } @@ -531,10 +588,10 @@ void detect_ht(struct cpuinfo_x86 *c) out: if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) { - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); + pr_info("CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + pr_info("CPU: Processor Core ID: %d\n", + c->cpu_core_id); printed = 1; } #endif @@ -559,9 +616,8 @@ static void get_cpu_vendor(struct cpuinfo_x86 *c) } } - printk_once(KERN_ERR - "CPU: vendor_id '%s' unknown, using generic init.\n" \ - "CPU: Your system may be unstable.\n", v); + pr_err_once("CPU: vendor_id '%s' unknown, using generic init.\n" \ + "CPU: Your system may be unstable.\n", v); c->x86_vendor = X86_VENDOR_UNKNOWN; this_cpu = &default_cpu; @@ -611,6 +667,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_7_0_EBX] = ebx; c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006); + c->x86_capability[CPUID_7_ECX] = ecx; } /* Extended state features: level 0x0000000d */ @@ -635,7 +692,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c) cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx); c->x86_capability[CPUID_F_1_EDX] = edx; - if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) { + if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) || + ((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL)) || + (cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)))) { c->x86_cache_max_rmid = ecx; c->x86_cache_occ_scale = ebx; } @@ -760,7 +819,7 @@ void __init early_cpu_init(void) int count = 0; #ifdef CONFIG_PROCESSOR_SELECT - printk(KERN_INFO "KERNEL supported cpus:\n"); + pr_info("KERNEL supported cpus:\n"); #endif for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { @@ -778,7 +837,7 @@ void __init early_cpu_init(void) for (j = 0; j < 2; j++) { if (!cpudev->c_ident[j]) continue; - printk(KERN_INFO " %s %s\n", cpudev->c_vendor, + pr_info(" %s %s\n", cpudev->c_vendor, cpudev->c_ident[j]); } } @@ -803,6 +862,31 @@ static void detect_nopl(struct cpuinfo_x86 *c) #else set_cpu_cap(c, X86_FEATURE_NOPL); #endif + + /* + * ESPFIX is a strange bug. All real CPUs have it. Paravirt + * systems that run Linux at CPL > 0 may or may not have the + * issue, but, even if they have the issue, there's absolutely + * nothing we can do about it because we can't use the real IRET + * instruction. + * + * NB: For the time being, only 32-bit kernels support + * X86_BUG_ESPFIX as such. 64-bit kernels directly choose + * whether to apply espfix using paravirt hooks. If any + * non-paravirt system ever shows up that does *not* have the + * ESPFIX issue, we can change this. + */ +#ifdef CONFIG_X86_32 +#ifdef CONFIG_PARAVIRT + do { + extern void native_iret(void); + if (pv_cpu_ops.iret == native_iret) + set_cpu_bug(c, X86_BUG_ESPFIX); + } while (0); +#else + set_cpu_bug(c, X86_BUG_ESPFIX); +#endif +#endif } static void generic_identify(struct cpuinfo_x86 *c) @@ -886,7 +970,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_identify) this_cpu->c_identify(c); - /* Clear/Set all flags overriden by options, after probe */ + /* Clear/Set all flags overridden by options, after probe */ for (i = 0; i < NCAPINTS; i++) { c->x86_capability[i] &= ~cpu_caps_cleared[i]; c->x86_capability[i] |= cpu_caps_set[i]; @@ -943,9 +1027,10 @@ static void identify_cpu(struct cpuinfo_x86 *c) init_hypervisor(c); x86_init_rdrand(c); x86_init_cache_qos(c); + setup_pku(c); /* - * Clear/Set all flags overriden by options, need do it + * Clear/Set all flags overridden by options, need do it * before following smp all cpus cap AND. */ for (i = 0; i < NCAPINTS; i++) { @@ -977,6 +1062,8 @@ static void identify_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); #endif + /* The boot/hotplug time assigment got cleared, restore it */ + c->logical_proc_id = topology_phys_to_logical_pkg(c->phys_proc_id); } /* @@ -1061,7 +1148,7 @@ static void __print_cpu_msr(void) for (index = index_min; index < index_max; index++) { if (rdmsrl_safe(index, &val)) continue; - printk(KERN_INFO " MSR%08x: %016llx\n", index, val); + pr_info(" MSR%08x: %016llx\n", index, val); } } } @@ -1100,19 +1187,19 @@ void print_cpu_info(struct cpuinfo_x86 *c) } if (vendor && !strstr(c->x86_model_id, vendor)) - printk(KERN_CONT "%s ", vendor); + pr_cont("%s ", vendor); if (c->x86_model_id[0]) - printk(KERN_CONT "%s", c->x86_model_id); + pr_cont("%s", c->x86_model_id); else - printk(KERN_CONT "%d86", c->x86); + pr_cont("%d86", c->x86); - printk(KERN_CONT " (family: 0x%x, model: 0x%x", c->x86, c->x86_model); + pr_cont(" (family: 0x%x, model: 0x%x", c->x86, c->x86_model); if (c->x86_mask || c->cpuid_level >= 0) - printk(KERN_CONT ", stepping: 0x%x)\n", c->x86_mask); + pr_cont(", stepping: 0x%x)\n", c->x86_mask); else - printk(KERN_CONT ")\n"); + pr_cont(")\n"); print_cpu_msr(c); } @@ -1438,7 +1525,7 @@ void cpu_init(void) show_ucode_info_early(); - printk(KERN_INFO "Initializing CPU#%d\n", cpu); + pr_info("Initializing CPU#%d\n", cpu); if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || @@ -1475,20 +1562,6 @@ void cpu_init(void) } #endif -#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS -void warn_pre_alternatives(void) -{ - WARN(1, "You're using static_cpu_has before alternatives have run!\n"); -} -EXPORT_SYMBOL_GPL(warn_pre_alternatives); -#endif - -inline bool __static_cpu_has_safe(u16 bit) -{ - return boot_cpu_has(bit); -} -EXPORT_SYMBOL_GPL(__static_cpu_has_safe); - static void bsp_resume(void) { if (this_cpu->c_bsp_resume) diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index aaf152e79637..6adef9cac23e 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -8,6 +8,7 @@ #include <linux/timer.h> #include <asm/pci-direct.h> #include <asm/tsc.h> +#include <asm/cpufeature.h> #include "cpu.h" @@ -103,7 +104,7 @@ static void check_cx686_slop(struct cpuinfo_x86 *c) local_irq_restore(flags); if (ccr5 & 2) { /* possible wrong calibration done */ - printk(KERN_INFO "Recalibrating delay loop with SLOP bit reset\n"); + pr_info("Recalibrating delay loop with SLOP bit reset\n"); calibrate_delay(); c->loops_per_jiffy = loops_per_jiffy; } @@ -115,7 +116,7 @@ static void set_cx86_reorder(void) { u8 ccr3; - printk(KERN_INFO "Enable Memory access reorder on Cyrix/NSC processor.\n"); + pr_info("Enable Memory access reorder on Cyrix/NSC processor.\n"); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ @@ -128,7 +129,7 @@ static void set_cx86_reorder(void) static void set_cx86_memwb(void) { - printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); + pr_info("Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); /* CCR2 bit 2: unlock NW bit */ setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04); @@ -268,7 +269,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) * VSA1 we work around however. */ - printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n"); + pr_info("Working around Cyrix MediaGX virtual DMA bugs.\n"); isa_dma_bridge_buggy = 2; /* We do this before the PCI layer is running. However we @@ -426,7 +427,7 @@ static void cyrix_identify(struct cpuinfo_x86 *c) if (dir0 == 5 || dir0 == 3) { unsigned char ccr3; unsigned long flags; - printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n"); + pr_info("Enabling CPUID on Cyrix processor.\n"); local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); /* enable MAPEN */ diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index d820d8eae96b..73d391ae452f 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -56,7 +56,7 @@ detect_hypervisor_vendor(void) } if (max_pri) - printk(KERN_INFO "Hypervisor detected: %s\n", x86_hyper->name); + pr_info("Hypervisor detected: %s\n", x86_hyper->name); } void init_hypervisor(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 565648bc1a0a..1f7fdb91a818 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -8,7 +8,7 @@ #include <linux/module.h> #include <linux/uaccess.h> -#include <asm/processor.h> +#include <asm/cpufeature.h> #include <asm/pgtable.h> #include <asm/msr.h> #include <asm/bugs.h> @@ -61,7 +61,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) */ if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 && c->microcode < 0x20e) { - printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n"); + pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n"); clear_cpu_cap(c, X86_FEATURE_PSE); } @@ -140,7 +140,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) { - printk(KERN_INFO "Disabled fast string operations\n"); + pr_info("Disabled fast string operations\n"); setup_clear_cpu_cap(X86_FEATURE_REP_GOOD); setup_clear_cpu_cap(X86_FEATURE_ERMS); } @@ -160,6 +160,19 @@ static void early_init_intel(struct cpuinfo_x86 *c) pr_info("Disabling PGE capability bit\n"); setup_clear_cpu_cap(X86_FEATURE_PGE); } + + if (c->cpuid_level >= 0x00000001) { + u32 eax, ebx, ecx, edx; + + cpuid(0x00000001, &eax, &ebx, &ecx, &edx); + /* + * If HTT (EDX[28]) is set EBX[16:23] contain the number of + * apicids which are reserved per package. Store the resulting + * shift value for the package management code. + */ + if (edx & (1U << 28)) + c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff); + } } #ifdef CONFIG_X86_32 @@ -176,7 +189,7 @@ int ppro_with_ram_bug(void) boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 1 && boot_cpu_data.x86_mask < 8) { - printk(KERN_INFO "Pentium Pro with Errata#50 detected. Taking evasive action.\n"); + pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n"); return 1; } return 0; @@ -225,7 +238,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_F00F); if (!f00f_workaround_enabled) { - printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); + pr_notice("Intel Pentium with F0 0F bug - workaround enabled.\n"); f00f_workaround_enabled = 1; } } @@ -244,7 +257,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * Forcefully enable PAE if kernel parameter "forcepae" is present. */ if (forcepae) { - printk(KERN_WARNING "PAE forced!\n"); + pr_warn("PAE forced!\n"); set_cpu_cap(c, X86_FEATURE_PAE); add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE); } diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 0b6c52388cf4..de6626c18e42 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -14,7 +14,7 @@ #include <linux/sysfs.h> #include <linux/pci.h> -#include <asm/processor.h> +#include <asm/cpufeature.h> #include <asm/amd_nb.h> #include <asm/smp.h> @@ -444,7 +444,7 @@ static ssize_t store_cache_disable(struct cacheinfo *this_leaf, err = amd_set_l3_disable_slot(nb, cpu, slot, val); if (err) { if (err == -EEXIST) - pr_warning("L3 slot %d in use/index already disabled!\n", + pr_warn("L3 slot %d in use/index already disabled!\n", slot); return err; } diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h deleted file mode 100644 index 336878a5d205..000000000000 --- a/arch/x86/kernel/cpu/intel_pt.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Intel(R) Processor Trace PMU driver for perf - * Copyright (c) 2013-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * Intel PT is specified in the Intel Architecture Instruction Set Extensions - * Programming Reference: - * http://software.intel.com/en-us/intel-isa-extensions - */ - -#ifndef __INTEL_PT_H__ -#define __INTEL_PT_H__ - -/* - * Single-entry ToPA: when this close to region boundary, switch - * buffers to avoid losing data. - */ -#define TOPA_PMI_MARGIN 512 - -#define TOPA_SHIFT 12 - -static inline unsigned int sizes(unsigned int tsz) -{ - return 1 << (tsz + TOPA_SHIFT); -}; - -struct topa_entry { - u64 end : 1; - u64 rsvd0 : 1; - u64 intr : 1; - u64 rsvd1 : 1; - u64 stop : 1; - u64 rsvd2 : 1; - u64 size : 4; - u64 rsvd3 : 2; - u64 base : 36; - u64 rsvd4 : 16; -}; - -#define PT_CPUID_LEAVES 2 -#define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ - -enum pt_capabilities { - PT_CAP_max_subleaf = 0, - PT_CAP_cr3_filtering, - PT_CAP_psb_cyc, - PT_CAP_mtc, - PT_CAP_topa_output, - PT_CAP_topa_multiple_entries, - PT_CAP_single_range_output, - PT_CAP_payloads_lip, - PT_CAP_mtc_periods, - PT_CAP_cycle_thresholds, - PT_CAP_psb_periods, -}; - -struct pt_pmu { - struct pmu pmu; - u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; -}; - -/** - * struct pt_buffer - buffer configuration; one buffer per task_struct or - * cpu, depending on perf event configuration - * @cpu: cpu for per-cpu allocation - * @tables: list of ToPA tables in this buffer - * @first: shorthand for first topa table - * @last: shorthand for last topa table - * @cur: current topa table - * @nr_pages: buffer size in pages - * @cur_idx: current output region's index within @cur table - * @output_off: offset within the current output region - * @data_size: running total of the amount of data in this buffer - * @lost: if data was lost/truncated - * @head: logical write offset inside the buffer - * @snapshot: if this is for a snapshot/overwrite counter - * @stop_pos: STOP topa entry in the buffer - * @intr_pos: INT topa entry in the buffer - * @data_pages: array of pages from perf - * @topa_index: table of topa entries indexed by page offset - */ -struct pt_buffer { - int cpu; - struct list_head tables; - struct topa *first, *last, *cur; - unsigned int cur_idx; - size_t output_off; - unsigned long nr_pages; - local_t data_size; - local_t lost; - local64_t head; - bool snapshot; - unsigned long stop_pos, intr_pos; - void **data_pages; - struct topa_entry *topa_index[0]; -}; - -/** - * struct pt - per-cpu pt context - * @handle: perf output handle - * @handle_nmi: do handle PT PMI on this cpu, there's an active event - */ -struct pt { - struct perf_output_handle handle; - int handle_nmi; -}; - -#endif /* __INTEL_PT_H__ */ diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index afa9f0d487ea..fbb5e90557a5 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -1,5 +1,5 @@ #include <asm/cpu_device_id.h> -#include <asm/processor.h> +#include <asm/cpufeature.h> #include <linux/cpu.h> #include <linux/module.h> #include <linux/slab.h> diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 4cfba4371a71..517619ea6498 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -115,7 +115,7 @@ static int raise_local(void) int cpu = m->extcpu; if (m->inject_flags & MCJ_EXCEPTION) { - printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); + pr_info("Triggering MCE exception on CPU %d\n", cpu); switch (context) { case MCJ_CTX_IRQ: /* @@ -128,15 +128,15 @@ static int raise_local(void) raise_exception(m, NULL); break; default: - printk(KERN_INFO "Invalid MCE context\n"); + pr_info("Invalid MCE context\n"); ret = -EINVAL; } - printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); + pr_info("MCE exception done on CPU %d\n", cpu); } else if (m->status) { - printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); + pr_info("Starting machine check poll CPU %d\n", cpu); raise_poll(m); mce_notify_irq(); - printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu); + pr_info("Machine check poll done on CPU %d\n", cpu); } else m->finished = 0; @@ -183,8 +183,7 @@ static void raise_mce(struct mce *m) start = jiffies; while (!cpumask_empty(mce_inject_cpumask)) { if (!time_before(jiffies, start + 2*HZ)) { - printk(KERN_ERR - "Timeout waiting for mce inject %lx\n", + pr_err("Timeout waiting for mce inject %lx\n", *cpumask_bits(mce_inject_cpumask)); break; } @@ -241,7 +240,7 @@ static int inject_init(void) { if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) return -ENOMEM; - printk(KERN_INFO "Machine check injector initialized\n"); + pr_info("Machine check injector initialized\n"); register_mce_write_callback(mce_write); register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 9c682c222071..5119766d9889 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/debugfs.h> #include <asm/mce.h> +#include <asm/uaccess.h> #include "mce-internal.h" @@ -29,7 +30,7 @@ * panic situations) */ -enum context { IN_KERNEL = 1, IN_USER = 2 }; +enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 }; enum ser { SER_REQUIRED = 1, NO_SER = 2 }; enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 }; @@ -48,6 +49,7 @@ static struct severity { #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } #define KERNEL .context = IN_KERNEL #define USER .context = IN_USER +#define KERNEL_RECOV .context = IN_KERNEL_RECOV #define SER .ser = SER_REQUIRED #define NOSER .ser = NO_SER #define EXCP .excp = EXCP_CONTEXT @@ -87,6 +89,10 @@ static struct severity { EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) ), MCESEV( + PANIC, "In kernel and no restart IP", + EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0) + ), + MCESEV( DEFERRED, "Deferred error", NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) ), @@ -123,6 +129,11 @@ static struct severity { MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV) ), MCESEV( + AR, "Action required: data load in error recoverable area of kernel", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + KERNEL_RECOV + ), + MCESEV( AR, "Action required: data load error in a user process", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), USER @@ -170,6 +181,9 @@ static struct severity { ) /* always matches. keep at end */ }; +#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \ + (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) + /* * If mcgstatus indicated that ip/cs on the stack were * no good, then "m->cs" will be zero and we will have @@ -183,7 +197,11 @@ static struct severity { */ static int error_context(struct mce *m) { - return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; + if ((m->cs & 3) == 3) + return IN_USER; + if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip)) + return IN_KERNEL_RECOV; + return IN_KERNEL; } /* diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a006f4cd792b..f0c921b03e42 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -961,6 +961,20 @@ static void mce_clear_state(unsigned long *toclear) } } +static int do_memory_failure(struct mce *m) +{ + int flags = MF_ACTION_REQUIRED; + int ret; + + pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr); + if (!(m->mcgstatus & MCG_STATUS_RIPV)) + flags |= MF_MUST_KILL; + ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags); + if (ret) + pr_err("Memory error not recovered"); + return ret; +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -998,8 +1012,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) DECLARE_BITMAP(toclear, MAX_NR_BANKS); DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); char *msg = "Unknown"; - u64 recover_paddr = ~0ull; - int flags = MF_ACTION_REQUIRED; int lmce = 0; /* If this CPU is offline, just bail out. */ @@ -1136,22 +1148,13 @@ void do_machine_check(struct pt_regs *regs, long error_code) } /* - * At insane "tolerant" levels we take no action. Otherwise - * we only die if we have no other choice. For less serious - * issues we try to recover, or limit damage to the current - * process. + * If tolerant is at an insane level we drop requests to kill + * processes and continue even when there is no way out. */ - if (cfg->tolerant < 3) { - if (no_way_out) - mce_panic("Fatal machine check on current CPU", &m, msg); - if (worst == MCE_AR_SEVERITY) { - recover_paddr = m.addr; - if (!(m.mcgstatus & MCG_STATUS_RIPV)) - flags |= MF_MUST_KILL; - } else if (kill_it) { - force_sig(SIGBUS, current); - } - } + if (cfg->tolerant == 3) + kill_it = 0; + else if (no_way_out) + mce_panic("Fatal machine check on current CPU", &m, msg); if (worst > 0) mce_report_event(regs); @@ -1159,25 +1162,24 @@ void do_machine_check(struct pt_regs *regs, long error_code) out: sync_core(); - if (recover_paddr == ~0ull) - goto done; + if (worst != MCE_AR_SEVERITY && !kill_it) + goto out_ist; - pr_err("Uncorrected hardware memory error in user-access at %llx", - recover_paddr); - /* - * We must call memory_failure() here even if the current process is - * doomed. We still need to mark the page as poisoned and alert any - * other users of the page. - */ - ist_begin_non_atomic(regs); - local_irq_enable(); - if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) { - pr_err("Memory error not recovered"); - force_sig(SIGBUS, current); + /* Fault was in user mode and we need to take some action */ + if ((m.cs & 3) == 3) { + ist_begin_non_atomic(regs); + local_irq_enable(); + + if (kill_it || do_memory_failure(&m)) + force_sig(SIGBUS, current); + local_irq_disable(); + ist_end_non_atomic(); + } else { + if (!fixup_exception(regs, X86_TRAP_MC)) + mce_panic("Failed kernel mode recovery", &m, NULL); } - local_irq_disable(); - ist_end_non_atomic(); -done: + +out_ist: ist_exit(regs); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -1576,6 +1578,17 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model == 45) quirk_no_way_out = quirk_sandybridge_ifu; + /* + * MCG_CAP.MCG_SER_P is necessary but not sufficient to know + * whether this processor will actually generate recoverable + * machine checks. Check to see if this is an E7 model Xeon. + * We can't do a model number check because E5 and E7 use the + * same model number. E5 doesn't support recovery, E7 does. + */ + if (mca_cfg.recovery || (mca_cfg.ser && + !strncmp(c->x86_model_id, + "Intel(R) Xeon(R) CPU E7-", 24))) + set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY); } if (cfg->monarch_timeout < 0) cfg->monarch_timeout = 0; @@ -1617,10 +1630,10 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) case X86_VENDOR_AMD: { u32 ebx = cpuid_ebx(0x80000007); - mce_amd_feature_init(c); mce_flags.overflow_recov = !!(ebx & BIT(0)); mce_flags.succor = !!(ebx & BIT(1)); mce_flags.smca = !!(ebx & BIT(3)); + mce_amd_feature_init(c); break; } @@ -2028,6 +2041,8 @@ static int __init mcheck_enable(char *str) cfg->bootlog = (str[0] == 'b'); else if (!strcmp(str, "bios_cmci_threshold")) cfg->bios_cmci_threshold = true; + else if (!strcmp(str, "recovery")) + cfg->recovery = true; else if (isdigit(str[0])) { if (get_option(&str, &cfg->tolerant) == 2) get_option(&str, &(cfg->monarch_timeout)); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index e99b15077e94..9d656fd436ef 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1,5 +1,5 @@ /* - * (c) 2005-2015 Advanced Micro Devices, Inc. + * (c) 2005-2016 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. See "COPYING" or * http://www.gnu.org/licenses/gpl.html @@ -28,7 +28,7 @@ #include <asm/msr.h> #include <asm/trace/irq_vectors.h> -#define NR_BLOCKS 9 +#define NR_BLOCKS 5 #define THRESHOLD_MAX 0xFFF #define INT_TYPE_APIC 0x00020000 #define MASK_VALID_HI 0x80000000 @@ -49,6 +49,19 @@ #define DEF_LVT_OFF 0x2 #define DEF_INT_TYPE_APIC 0x2 +/* Scalable MCA: */ + +/* Threshold LVT offset is at MSR0xC0000410[15:12] */ +#define SMCA_THR_LVT_OFF 0xF000 + +/* + * OS is required to set the MCAX bit to acknowledge that it is now using the + * new MSR ranges and new registers under each bank. It also means that the OS + * will configure deferred errors in the new MCx_CONFIG register. If the bit is + * not set, uncorrectable errors will cause a system panic. + */ +#define SMCA_MCAX_EN_OFF 0x1 + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -58,6 +71,35 @@ static const char * const th_names[] = { "execution_unit", }; +/* Define HWID to IP type mappings for Scalable MCA */ +struct amd_hwid amd_hwids[] = { + [SMCA_F17H_CORE] = { "f17h_core", 0xB0 }, + [SMCA_DF] = { "data_fabric", 0x2E }, + [SMCA_UMC] = { "umc", 0x96 }, + [SMCA_PB] = { "param_block", 0x5 }, + [SMCA_PSP] = { "psp", 0xFF }, + [SMCA_SMU] = { "smu", 0x1 }, +}; +EXPORT_SYMBOL_GPL(amd_hwids); + +const char * const amd_core_mcablock_names[] = { + [SMCA_LS] = "load_store", + [SMCA_IF] = "insn_fetch", + [SMCA_L2_CACHE] = "l2_cache", + [SMCA_DE] = "decode_unit", + [RES] = "", + [SMCA_EX] = "execution_unit", + [SMCA_FP] = "floating_point", + [SMCA_L3_CACHE] = "l3_cache", +}; +EXPORT_SYMBOL_GPL(amd_core_mcablock_names); + +const char * const amd_df_mcablock_names[] = { + [SMCA_CS] = "coherent_slave", + [SMCA_PIE] = "pie", +}; +EXPORT_SYMBOL_GPL(amd_df_mcablock_names); + static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ @@ -84,6 +126,13 @@ struct thresh_restart { static inline bool is_shared_bank(int bank) { + /* + * Scalable MCA provides for only one core to have access to the MSRs of + * a shared bank. + */ + if (mce_flags.smca) + return false; + /* Bank 4 is for northbridge reporting and is thus shared */ return (bank == 4); } @@ -135,6 +184,14 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) } if (apic != msr) { + /* + * On SMCA CPUs, LVT offset is programmed at a different MSR, and + * the BIOS provides the value. The original field where LVT offset + * was set is reserved. Return early here: + */ + if (mce_flags.smca) + return 0; + pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, apic, b->bank, b->block, b->address, hi, lo); @@ -144,10 +201,7 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) return 1; }; -/* - * Called via smp_call_function_single(), must be called with correct - * cpu affinity. - */ +/* Reprogram MCx_MISC MSR behind this threshold bank. */ static void threshold_restart_bank(void *_tr) { struct thresh_restart *tr = _tr; @@ -247,27 +301,116 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } +static u32 get_block_address(u32 current_addr, u32 low, u32 high, + unsigned int bank, unsigned int block) +{ + u32 addr = 0, offset = 0; + + if (mce_flags.smca) { + if (!block) { + addr = MSR_AMD64_SMCA_MCx_MISC(bank); + } else { + /* + * For SMCA enabled processors, BLKPTR field of the + * first MISC register (MCx_MISC0) indicates presence of + * additional MISC register set (MISC1-4). + */ + u32 low, high; + + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) + return addr; + + if (!(low & MCI_CONFIG_MCAX)) + return addr; + + if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && + (low & MASK_BLKPTR_LO)) + addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + } + return addr; + } + + /* Fall back to method we used for older processors: */ + switch (block) { + case 0: + addr = MSR_IA32_MCx_MISC(bank); + break; + case 1: + offset = ((low & MASK_BLKPTR_LO) >> 21); + if (offset) + addr = MCG_XBLK_ADDR + offset; + break; + default: + addr = ++current_addr; + } + return addr; +} + +static int +prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, + int offset, u32 misc_high) +{ + unsigned int cpu = smp_processor_id(); + struct threshold_block b; + int new; + + if (!block) + per_cpu(bank_map, cpu) |= (1 << bank); + + memset(&b, 0, sizeof(b)); + b.cpu = cpu; + b.bank = bank; + b.block = block; + b.address = addr; + b.interrupt_capable = lvt_interrupt_supported(bank, misc_high); + + if (!b.interrupt_capable) + goto done; + + b.interrupt_enable = 1; + + if (mce_flags.smca) { + u32 smca_low, smca_high; + u32 smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank); + + if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) { + smca_high |= SMCA_MCAX_EN_OFF; + wrmsr(smca_addr, smca_low, smca_high); + } + + /* Gather LVT offset for thresholding: */ + if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) + goto out; + + new = (smca_low & SMCA_THR_LVT_OFF) >> 12; + } else { + new = (misc_high & MASK_LVTOFF_HI) >> 20; + } + + offset = setup_APIC_mce_threshold(offset, new); + + if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt)) + mce_threshold_vector = amd_threshold_interrupt; + +done: + mce_threshold_block_init(&b, offset); + +out: + return offset; +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { - struct threshold_block b; - unsigned int cpu = smp_processor_id(); u32 low = 0, high = 0, address = 0; unsigned int bank, block; - int offset = -1, new; + int offset = -1; for (bank = 0; bank < mca_cfg.banks; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { - if (block == 0) - address = MSR_IA32_MCx_MISC(bank); - else if (block == 1) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - break; - - address += MCG_XBLK_ADDR; - } else - ++address; + address = get_block_address(address, low, high, bank, block); + if (!address) + break; if (rdmsr_safe(address, &low, &high)) break; @@ -279,29 +422,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) (high & MASK_LOCKED_HI)) continue; - if (!block) - per_cpu(bank_map, cpu) |= (1 << bank); - - memset(&b, 0, sizeof(b)); - b.cpu = cpu; - b.bank = bank; - b.block = block; - b.address = address; - b.interrupt_capable = lvt_interrupt_supported(bank, high); - - if (!b.interrupt_capable) - goto init; - - b.interrupt_enable = 1; - new = (high & MASK_LVTOFF_HI) >> 20; - offset = setup_APIC_mce_threshold(offset, new); - - if ((offset == new) && - (mce_threshold_vector != amd_threshold_interrupt)) - mce_threshold_vector = amd_threshold_interrupt; - -init: - mce_threshold_block_init(&b, offset); + offset = prepare_threshold_block(bank, block, address, offset, high); } } @@ -394,16 +515,9 @@ static void amd_threshold_interrupt(void) if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; for (block = 0; block < NR_BLOCKS; ++block) { - if (block == 0) { - address = MSR_IA32_MCx_MISC(bank); - } else if (block == 1) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - break; - address += MCG_XBLK_ADDR; - } else { - ++address; - } + address = get_block_address(address, low, high, bank, block); + if (!address) + break; if (rdmsr_safe(address, &low, &high)) break; @@ -623,16 +737,11 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, if (err) goto out_free; recurse: - if (!block) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - return 0; - address += MCG_XBLK_ADDR; - } else { - ++address; - } + address = get_block_address(address, low, high, bank, ++block); + if (!address) + return 0; - err = allocate_threshold_blocks(cpu, bank, ++block, address); + err = allocate_threshold_blocks(cpu, bank, block, address); if (err) goto out_free; diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 12402e10aeff..2a0717bf8033 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -26,14 +26,12 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); - printk(KERN_EMERG - "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", - smp_processor_id(), loaddr, lotype); + pr_emerg("CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", + smp_processor_id(), loaddr, lotype); if (lotype & (1<<5)) { - printk(KERN_EMERG - "CPU#%d: Possible thermal failure (CPU on fire ?).\n", - smp_processor_id()); + pr_emerg("CPU#%d: Possible thermal failure (CPU on fire ?).\n", + smp_processor_id()); } add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); @@ -61,12 +59,10 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) /* Read registers before enabling: */ rdmsr(MSR_IA32_P5_MC_ADDR, l, h); rdmsr(MSR_IA32_P5_MC_TYPE, l, h); - printk(KERN_INFO - "Intel old style machine check architecture supported.\n"); + pr_info("Intel old style machine check architecture supported.\n"); /* Enable MCE: */ cr4_set_bits(X86_CR4_MCE); - printk(KERN_INFO - "Intel old style machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); + pr_info("Intel old style machine check reporting enabled on CPU#%d.\n", + smp_processor_id()); } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 2c5aaf8c2e2f..0b445c2ff735 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -190,7 +190,7 @@ static int therm_throt_process(bool new_event, int event, int level) /* if we just entered the thermal event */ if (new_event) { if (event == THERMAL_THROTTLING_EVENT) - printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", + pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, level == CORE_LEVEL ? "Core" : "Package", state->count); @@ -198,8 +198,7 @@ static int therm_throt_process(bool new_event, int event, int level) } if (old_event) { if (event == THERMAL_THROTTLING_EVENT) - printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", - this_cpu, + pr_info("CPU%d: %s temperature/speed normal\n", this_cpu, level == CORE_LEVEL ? "Core" : "Package"); return 1; } @@ -417,8 +416,8 @@ static void intel_thermal_interrupt(void) static void unexpected_thermal_interrupt(void) { - printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n", - smp_processor_id()); + pr_err("CPU%d: Unexpected LVT thermal interrupt!\n", + smp_processor_id()); } static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; @@ -499,7 +498,7 @@ void intel_init_thermal(struct cpuinfo_x86 *c) if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { if (system_state == SYSTEM_BOOTING) - printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu); + pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu); return; } @@ -557,8 +556,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c) l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n", - tm2 ? "TM2" : "TM1"); + pr_info_once("CPU0: Thermal monitoring enabled (%s)\n", + tm2 ? "TM2" : "TM1"); /* enable thermal throttle processing */ atomic_set(&therm_throt_en, 1); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index 7245980186ee..fcf9ae9384f4 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -12,8 +12,8 @@ static void default_threshold_interrupt(void) { - printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n", - THRESHOLD_APIC_VECTOR); + pr_err("Unexpected threshold interrupt at vector %x\n", + THRESHOLD_APIC_VECTOR); } void (*mce_threshold_vector)(void) = default_threshold_interrupt; diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 01dd8702880b..c6a722e1d011 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -17,7 +17,7 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code) { ist_enter(regs); - printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); + pr_emerg("CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); ist_exit(regs); @@ -39,6 +39,5 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c) cr4_set_bits(X86_CR4_MCE); - printk(KERN_INFO - "Winchip machine check reporting enabled on CPU#0.\n"); + pr_info("Winchip machine check reporting enabled on CPU#0.\n"); } diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 2233f8a76615..8581963894c7 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -431,10 +431,6 @@ int __init save_microcode_in_initrd_amd(void) else container = cont_va; - if (ucode_new_rev) - pr_info("microcode: updated early to new patch_level=0x%08x\n", - ucode_new_rev); - eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); @@ -469,8 +465,7 @@ void reload_ucode_amd(void) if (mc && rev < mc->hdr.patch_id) { if (!__apply_microcode_amd(mc)) { ucode_new_rev = mc->hdr.patch_id; - pr_info("microcode: reload patch_level=0x%08x\n", - ucode_new_rev); + pr_info("reload patch_level=0x%08x\n", ucode_new_rev); } } } @@ -793,15 +788,13 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover) return -EINVAL; } - patch->data = kzalloc(patch_size, GFP_KERNEL); + patch->data = kmemdup(fw + SECTION_HDR_SIZE, patch_size, GFP_KERNEL); if (!patch->data) { pr_err("Patch data allocation failure.\n"); kfree(patch); return -EINVAL; } - /* All looks ok, copy patch... */ - memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size); INIT_LIST_HEAD(&patch->plist); patch->patch_id = mc_hdr->patch_id; patch->equiv_cpu = proc_id; @@ -953,10 +946,14 @@ struct microcode_ops * __init init_amd_microcode(void) struct cpuinfo_x86 *c = &boot_cpu_data; if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { - pr_warning("AMD CPU family 0x%x not supported\n", c->x86); + pr_warn("AMD CPU family 0x%x not supported\n", c->x86); return NULL; } + if (ucode_new_rev) + pr_info_once("microcode updated early to new patch_level=0x%08x\n", + ucode_new_rev); + return µcode_amd_ops; } diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index faec7120c508..ac360bfbbdb6 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -43,16 +43,8 @@ #define MICROCODE_VERSION "2.01" static struct microcode_ops *microcode_ops; - static bool dis_ucode_ldr; -static int __init disable_loader(char *str) -{ - dis_ucode_ldr = true; - return 1; -} -__setup("dis_ucode_ldr", disable_loader); - /* * Synchronization. * @@ -81,15 +73,16 @@ struct cpu_info_ctx { static bool __init check_loader_disabled_bsp(void) { + static const char *__dis_opt_str = "dis_ucode_ldr"; + #ifdef CONFIG_X86_32 const char *cmdline = (const char *)__pa_nodebug(boot_command_line); - const char *opt = "dis_ucode_ldr"; - const char *option = (const char *)__pa_nodebug(opt); + const char *option = (const char *)__pa_nodebug(__dis_opt_str); bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); #else /* CONFIG_X86_64 */ const char *cmdline = boot_command_line; - const char *option = "dis_ucode_ldr"; + const char *option = __dis_opt_str; bool *res = &dis_ucode_ldr; #endif @@ -479,7 +472,7 @@ static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw) enum ucode_state ustate; struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - if (uci && uci->valid) + if (uci->valid) return UCODE_OK; if (collect_cpu_info(cpu)) @@ -630,7 +623,7 @@ int __init microcode_init(void) struct cpuinfo_x86 *c = &boot_cpu_data; int error; - if (paravirt_enabled() || dis_ucode_ldr) + if (dis_ucode_ldr) return -EINVAL; if (c->x86_vendor == X86_VENDOR_INTEL) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index ee81c544ee0d..cbb3cf09b065 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -39,9 +39,15 @@ #include <asm/setup.h> #include <asm/msr.h> -static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; +/* + * Temporary microcode blobs pointers storage. We note here the pointers to + * microcode blobs we've got from whatever storage (detached initrd, builtin). + * Later on, we put those into final storage mc_saved_data.mc_saved. + */ +static unsigned long mc_tmp_ptrs[MAX_UCODE_COUNT]; + static struct mc_saved_data { - unsigned int mc_saved_count; + unsigned int num_saved; struct microcode_intel **mc_saved; } mc_saved_data; @@ -78,53 +84,50 @@ load_microcode_early(struct microcode_intel **saved, } static inline void -copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd, - unsigned long off, int num_saved) +copy_ptrs(struct microcode_intel **mc_saved, unsigned long *mc_ptrs, + unsigned long off, int num_saved) { int i; for (i = 0; i < num_saved; i++) - mc_saved[i] = (struct microcode_intel *)(initrd[i] + off); + mc_saved[i] = (struct microcode_intel *)(mc_ptrs[i] + off); } #ifdef CONFIG_X86_32 static void -microcode_phys(struct microcode_intel **mc_saved_tmp, - struct mc_saved_data *mc_saved_data) +microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs) { int i; struct microcode_intel ***mc_saved; - mc_saved = (struct microcode_intel ***) - __pa_nodebug(&mc_saved_data->mc_saved); - for (i = 0; i < mc_saved_data->mc_saved_count; i++) { + mc_saved = (struct microcode_intel ***)__pa_nodebug(&mcs->mc_saved); + + for (i = 0; i < mcs->num_saved; i++) { struct microcode_intel *p; - p = *(struct microcode_intel **) - __pa_nodebug(mc_saved_data->mc_saved + i); + p = *(struct microcode_intel **)__pa_nodebug(mcs->mc_saved + i); mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); } } #endif static enum ucode_state -load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, - unsigned long initrd_start, struct ucode_cpu_info *uci) +load_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, + unsigned long offset, struct ucode_cpu_info *uci) { struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int count = mc_saved_data->mc_saved_count; + unsigned int count = mcs->num_saved; - if (!mc_saved_data->mc_saved) { - copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count); + if (!mcs->mc_saved) { + copy_ptrs(mc_saved_tmp, mc_ptrs, offset, count); return load_microcode_early(mc_saved_tmp, count, uci); } else { #ifdef CONFIG_X86_32 - microcode_phys(mc_saved_tmp, mc_saved_data); + microcode_phys(mc_saved_tmp, mcs); return load_microcode_early(mc_saved_tmp, count, uci); #else - return load_microcode_early(mc_saved_data->mc_saved, - count, uci); + return load_microcode_early(mcs->mc_saved, count, uci); #endif } } @@ -175,25 +178,25 @@ matching_model_microcode(struct microcode_header_intel *mc_header, } static int -save_microcode(struct mc_saved_data *mc_saved_data, +save_microcode(struct mc_saved_data *mcs, struct microcode_intel **mc_saved_src, - unsigned int mc_saved_count) + unsigned int num_saved) { int i, j; struct microcode_intel **saved_ptr; int ret; - if (!mc_saved_count) + if (!num_saved) return -EINVAL; /* * Copy new microcode data. */ - saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL); + saved_ptr = kcalloc(num_saved, sizeof(struct microcode_intel *), GFP_KERNEL); if (!saved_ptr) return -ENOMEM; - for (i = 0; i < mc_saved_count; i++) { + for (i = 0; i < num_saved; i++) { struct microcode_header_intel *mc_hdr; struct microcode_intel *mc; unsigned long size; @@ -207,20 +210,18 @@ save_microcode(struct mc_saved_data *mc_saved_data, mc_hdr = &mc->hdr; size = get_totalsize(mc_hdr); - saved_ptr[i] = kmalloc(size, GFP_KERNEL); + saved_ptr[i] = kmemdup(mc, size, GFP_KERNEL); if (!saved_ptr[i]) { ret = -ENOMEM; goto err; } - - memcpy(saved_ptr[i], mc, size); } /* * Point to newly saved microcode. */ - mc_saved_data->mc_saved = saved_ptr; - mc_saved_data->mc_saved_count = mc_saved_count; + mcs->mc_saved = saved_ptr; + mcs->num_saved = num_saved; return 0; @@ -284,22 +285,20 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved, * BSP can stay in the platform. */ static enum ucode_state __init -get_matching_model_microcode(int cpu, unsigned long start, - void *data, size_t size, - struct mc_saved_data *mc_saved_data, - unsigned long *mc_saved_in_initrd, +get_matching_model_microcode(unsigned long start, void *data, size_t size, + struct mc_saved_data *mcs, unsigned long *mc_ptrs, struct ucode_cpu_info *uci) { - u8 *ucode_ptr = data; - unsigned int leftover = size; + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + struct microcode_header_intel *mc_header; + unsigned int num_saved = mcs->num_saved; enum ucode_state state = UCODE_OK; + unsigned int leftover = size; + u8 *ucode_ptr = data; unsigned int mc_size; - struct microcode_header_intel *mc_header; - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int mc_saved_count = mc_saved_data->mc_saved_count; int i; - while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { + while (leftover && num_saved < ARRAY_SIZE(mc_saved_tmp)) { if (leftover < sizeof(mc_header)) break; @@ -318,32 +317,31 @@ get_matching_model_microcode(int cpu, unsigned long start, * the platform, we need to find and save microcode patches * with the same family and model as the BSP. */ - if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != - UCODE_OK) { + if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != UCODE_OK) { ucode_ptr += mc_size; continue; } - mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count); + num_saved = _save_mc(mc_saved_tmp, ucode_ptr, num_saved); ucode_ptr += mc_size; } if (leftover) { state = UCODE_ERROR; - goto out; + return state; } - if (mc_saved_count == 0) { + if (!num_saved) { state = UCODE_NFOUND; - goto out; + return state; } - for (i = 0; i < mc_saved_count; i++) - mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; + for (i = 0; i < num_saved; i++) + mc_ptrs[i] = (unsigned long)mc_saved_tmp[i] - start; + + mcs->num_saved = num_saved; - mc_saved_data->mc_saved_count = mc_saved_count; -out: return state; } @@ -373,7 +371,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); csig.pf = 1 << ((val[1] >> 18) & 7); } - native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + native_wrmsrl(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ sync_core(); @@ -396,11 +394,11 @@ static void show_saved_mc(void) unsigned int sig, pf, rev, total_size, data_size, date; struct ucode_cpu_info uci; - if (mc_saved_data.mc_saved_count == 0) { + if (!mc_saved_data.num_saved) { pr_debug("no microcode data saved.\n"); return; } - pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count); + pr_debug("Total microcode saved: %d\n", mc_saved_data.num_saved); collect_cpu_info_early(&uci); @@ -409,7 +407,7 @@ static void show_saved_mc(void) rev = uci.cpu_sig.rev; pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); - for (i = 0; i < mc_saved_data.mc_saved_count; i++) { + for (i = 0; i < mc_saved_data.num_saved; i++) { struct microcode_header_intel *mc_saved_header; struct extended_sigtable *ext_header; int ext_sigcount; @@ -465,7 +463,7 @@ int save_mc_for_early(u8 *mc) { struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; unsigned int mc_saved_count_init; - unsigned int mc_saved_count; + unsigned int num_saved; struct microcode_intel **mc_saved; int ret = 0; int i; @@ -476,23 +474,23 @@ int save_mc_for_early(u8 *mc) */ mutex_lock(&x86_cpu_microcode_mutex); - mc_saved_count_init = mc_saved_data.mc_saved_count; - mc_saved_count = mc_saved_data.mc_saved_count; + mc_saved_count_init = mc_saved_data.num_saved; + num_saved = mc_saved_data.num_saved; mc_saved = mc_saved_data.mc_saved; - if (mc_saved && mc_saved_count) + if (mc_saved && num_saved) memcpy(mc_saved_tmp, mc_saved, - mc_saved_count * sizeof(struct microcode_intel *)); + num_saved * sizeof(struct microcode_intel *)); /* * Save the microcode patch mc in mc_save_tmp structure if it's a newer * version. */ - mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count); + num_saved = _save_mc(mc_saved_tmp, mc, num_saved); /* * Save the mc_save_tmp in global mc_saved_data. */ - ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count); + ret = save_microcode(&mc_saved_data, mc_saved_tmp, num_saved); if (ret) { pr_err("Cannot save microcode patch.\n"); goto out; @@ -536,7 +534,7 @@ static bool __init load_builtin_intel_microcode(struct cpio_data *cp) static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; static __init enum ucode_state -scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, +scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, unsigned long start, unsigned long size, struct ucode_cpu_info *uci) { @@ -551,14 +549,18 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, cd.data = NULL; cd.size = 0; - cd = find_cpio_data(p, (void *)start, size, &offset); - if (!cd.data) { + /* try built-in microcode if no initrd */ + if (!size) { if (!load_builtin_intel_microcode(&cd)) return UCODE_ERROR; + } else { + cd = find_cpio_data(p, (void *)start, size, &offset); + if (!cd.data) + return UCODE_ERROR; } - return get_matching_model_microcode(0, start, cd.data, cd.size, - mc_saved_data, initrd, uci); + return get_matching_model_microcode(start, cd.data, cd.size, + mcs, mc_ptrs, uci); } /* @@ -567,14 +569,11 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, static void print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) { - int cpu = smp_processor_id(); - - pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", - cpu, - uci->cpu_sig.rev, - date & 0xffff, - date >> 24, - (date >> 16) & 0xff); + pr_info_once("microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", + uci->cpu_sig.rev, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); } #ifdef CONFIG_X86_32 @@ -603,19 +602,19 @@ void show_ucode_info_early(void) */ static void print_ucode(struct ucode_cpu_info *uci) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; int *delay_ucode_info_p; int *current_mc_date_p; - mc_intel = uci->mc; - if (mc_intel == NULL) + mc = uci->mc; + if (!mc) return; delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); current_mc_date_p = (int *)__pa_nodebug(¤t_mc_date); *delay_ucode_info_p = 1; - *current_mc_date_p = mc_intel->hdr.date; + *current_mc_date_p = mc->hdr.date; } #else @@ -630,37 +629,35 @@ static inline void flush_tlb_early(void) static inline void print_ucode(struct ucode_cpu_info *uci) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; - mc_intel = uci->mc; - if (mc_intel == NULL) + mc = uci->mc; + if (!mc) return; - print_ucode_info(uci, mc_intel->hdr.date); + print_ucode_info(uci, mc->hdr.date); } #endif static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; unsigned int val[2]; - mc_intel = uci->mc; - if (mc_intel == NULL) + mc = uci->mc; + if (!mc) return 0; /* write microcode via MSR 0x79 */ - native_wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) mc_intel->bits, - (unsigned long) mc_intel->bits >> 16 >> 16); - native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); + native_wrmsrl(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ sync_core(); /* get the current revision from MSR 0x8B */ native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (val[1] != mc_intel->hdr.rev) + if (val[1] != mc->hdr.rev) return -1; #ifdef CONFIG_X86_64 @@ -672,25 +669,26 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) if (early) print_ucode(uci); else - print_ucode_info(uci, mc_intel->hdr.date); + print_ucode_info(uci, mc->hdr.date); return 0; } /* * This function converts microcode patch offsets previously stored in - * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data. + * mc_tmp_ptrs to pointers and stores the pointers in mc_saved_data. */ int __init save_microcode_in_initrd_intel(void) { - unsigned int count = mc_saved_data.mc_saved_count; + unsigned int count = mc_saved_data.num_saved; struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; int ret = 0; - if (count == 0) + if (!count) return ret; - copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count); + copy_ptrs(mc_saved, mc_tmp_ptrs, get_initrd_start(), count); + ret = save_microcode(&mc_saved_data, mc_saved, count); if (ret) pr_err("Cannot save microcode patches from initrd.\n"); @@ -701,8 +699,7 @@ int __init save_microcode_in_initrd_intel(void) } static void __init -_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, - unsigned long *initrd, +_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs, unsigned long start, unsigned long size) { struct ucode_cpu_info uci; @@ -710,11 +707,11 @@ _load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, collect_cpu_info_early(&uci); - ret = scan_microcode(mc_saved_data, initrd, start, size, &uci); + ret = scan_microcode(mcs, mc_ptrs, start, size, &uci); if (ret != UCODE_OK) return; - ret = load_microcode(mc_saved_data, initrd, start, &uci); + ret = load_microcode(mcs, mc_ptrs, start, &uci); if (ret != UCODE_OK) return; @@ -728,53 +725,49 @@ void __init load_ucode_intel_bsp(void) struct boot_params *p; p = (struct boot_params *)__pa_nodebug(&boot_params); - start = p->hdr.ramdisk_image; size = p->hdr.ramdisk_size; - _load_ucode_intel_bsp( - (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), - (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), - start, size); + /* + * Set start only if we have an initrd image. We cannot use initrd_start + * because it is not set that early yet. + */ + start = (size ? p->hdr.ramdisk_image : 0); + + _load_ucode_intel_bsp((struct mc_saved_data *)__pa_nodebug(&mc_saved_data), + (unsigned long *)__pa_nodebug(&mc_tmp_ptrs), + start, size); #else - start = boot_params.hdr.ramdisk_image + PAGE_OFFSET; size = boot_params.hdr.ramdisk_size; + start = (size ? boot_params.hdr.ramdisk_image + PAGE_OFFSET : 0); - _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size); + _load_ucode_intel_bsp(&mc_saved_data, mc_tmp_ptrs, start, size); #endif } void load_ucode_intel_ap(void) { - struct mc_saved_data *mc_saved_data_p; + unsigned long *mcs_tmp_p; + struct mc_saved_data *mcs_p; struct ucode_cpu_info uci; - unsigned long *mc_saved_in_initrd_p; - unsigned long initrd_start_addr; enum ucode_state ret; #ifdef CONFIG_X86_32 - unsigned long *initrd_start_p; - mc_saved_in_initrd_p = - (unsigned long *)__pa_nodebug(mc_saved_in_initrd); - mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); - initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); - initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p); + mcs_tmp_p = (unsigned long *)__pa_nodebug(mc_tmp_ptrs); + mcs_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); #else - mc_saved_data_p = &mc_saved_data; - mc_saved_in_initrd_p = mc_saved_in_initrd; - initrd_start_addr = initrd_start; + mcs_tmp_p = mc_tmp_ptrs; + mcs_p = &mc_saved_data; #endif /* * If there is no valid ucode previously saved in memory, no need to * update ucode on this AP. */ - if (mc_saved_data_p->mc_saved_count == 0) + if (!mcs_p->num_saved) return; collect_cpu_info_early(&uci); - ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, - initrd_start_addr, &uci); - + ret = load_microcode(mcs_p, mcs_tmp_p, get_initrd_start_addr(), &uci); if (ret != UCODE_OK) return; @@ -786,13 +779,13 @@ void reload_ucode_intel(void) struct ucode_cpu_info uci; enum ucode_state ret; - if (!mc_saved_data.mc_saved_count) + if (!mc_saved_data.num_saved) return; collect_cpu_info_early(&uci); ret = load_microcode_early(mc_saved_data.mc_saved, - mc_saved_data.mc_saved_count, &uci); + mc_saved_data.num_saved, &uci); if (ret != UCODE_OK) return; @@ -825,7 +818,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) * return 0 - no update found * return 1 - found update */ -static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) +static int get_matching_mc(struct microcode_intel *mc, int cpu) { struct cpu_signature cpu_sig; unsigned int csig, cpf, crev; @@ -836,39 +829,36 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) cpf = cpu_sig.pf; crev = cpu_sig.rev; - return has_newer_microcode(mc_intel, csig, cpf, crev); + return has_newer_microcode(mc, csig, cpf, crev); } static int apply_microcode_intel(int cpu) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; struct ucode_cpu_info *uci; + struct cpuinfo_x86 *c; unsigned int val[2]; - int cpu_num = raw_smp_processor_id(); - struct cpuinfo_x86 *c = &cpu_data(cpu_num); - - uci = ucode_cpu_info + cpu; - mc_intel = uci->mc; /* We should bind the task to the CPU */ - BUG_ON(cpu_num != cpu); + if (WARN_ON(raw_smp_processor_id() != cpu)) + return -1; - if (mc_intel == NULL) + uci = ucode_cpu_info + cpu; + mc = uci->mc; + if (!mc) return 0; /* * Microcode on this CPU could be updated earlier. Only apply the - * microcode patch in mc_intel when it is newer than the one on this + * microcode patch in mc when it is newer than the one on this * CPU. */ - if (get_matching_mc(mc_intel, cpu) == 0) + if (!get_matching_mc(mc, cpu)) return 0; /* write microcode via MSR 0x79 */ - wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) mc_intel->bits, - (unsigned long) mc_intel->bits >> 16 >> 16); - wrmsr(MSR_IA32_UCODE_REV, 0, 0); + wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); + wrmsrl(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ sync_core(); @@ -876,16 +866,19 @@ static int apply_microcode_intel(int cpu) /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (val[1] != mc_intel->hdr.rev) { + if (val[1] != mc->hdr.rev) { pr_err("CPU%d update to revision 0x%x failed\n", - cpu_num, mc_intel->hdr.rev); + cpu, mc->hdr.rev); return -1; } + pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", - cpu_num, val[1], - mc_intel->hdr.date & 0xffff, - mc_intel->hdr.date >> 24, - (mc_intel->hdr.date >> 16) & 0xff); + cpu, val[1], + mc->hdr.date & 0xffff, + mc->hdr.date >> 24, + (mc->hdr.date >> 16) & 0xff); + + c = &cpu_data(cpu); uci->cpu_sig.rev = val[1]; c->microcode = val[1]; diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index b96896bcbdaf..2ce1a7dc45b7 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -49,7 +49,7 @@ int microcode_sanity_check(void *mc, int print_err) unsigned long total_size, data_size, ext_table_size; struct microcode_header_intel *mc_header = mc; struct extended_sigtable *ext_header = NULL; - int sum, orig_sum, ext_sigcount = 0, i; + u32 sum, orig_sum, ext_sigcount = 0, i; struct extended_signature *ext_sig; total_size = get_totalsize(mc_header); @@ -57,69 +57,85 @@ int microcode_sanity_check(void *mc, int print_err) if (data_size + MC_HEADER_SIZE > total_size) { if (print_err) - pr_err("error! Bad data size in microcode data file\n"); + pr_err("Error: bad microcode data file size.\n"); return -EINVAL; } if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { if (print_err) - pr_err("error! Unknown microcode update format\n"); + pr_err("Error: invalid/unknown microcode update format.\n"); return -EINVAL; } + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); if (ext_table_size) { + u32 ext_table_sum = 0; + u32 *ext_tablep; + if ((ext_table_size < EXT_HEADER_SIZE) || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { if (print_err) - pr_err("error! Small exttable size in microcode data file\n"); + pr_err("Error: truncated extended signature table.\n"); return -EINVAL; } + ext_header = mc + MC_HEADER_SIZE + data_size; if (ext_table_size != exttable_size(ext_header)) { if (print_err) - pr_err("error! Bad exttable size in microcode data file\n"); + pr_err("Error: extended signature table size mismatch.\n"); return -EFAULT; } + ext_sigcount = ext_header->count; - } - /* check extended table checksum */ - if (ext_table_size) { - int ext_table_sum = 0; - int *ext_tablep = (int *)ext_header; + /* + * Check extended table checksum: the sum of all dwords that + * comprise a valid table must be 0. + */ + ext_tablep = (u32 *)ext_header; - i = ext_table_size / DWSIZE; + i = ext_table_size / sizeof(u32); while (i--) ext_table_sum += ext_tablep[i]; + if (ext_table_sum) { if (print_err) - pr_warn("aborting, bad extended signature table checksum\n"); + pr_warn("Bad extended signature table checksum, aborting.\n"); return -EINVAL; } } - /* calculate the checksum */ + /* + * Calculate the checksum of update data and header. The checksum of + * valid update data and header including the extended signature table + * must be 0. + */ orig_sum = 0; - i = (MC_HEADER_SIZE + data_size) / DWSIZE; + i = (MC_HEADER_SIZE + data_size) / sizeof(u32); while (i--) - orig_sum += ((int *)mc)[i]; + orig_sum += ((u32 *)mc)[i]; + if (orig_sum) { if (print_err) - pr_err("aborting, bad checksum\n"); + pr_err("Bad microcode data checksum, aborting.\n"); return -EINVAL; } + if (!ext_table_size) return 0; - /* check extended signature checksum */ + + /* + * Check extended signature checksum: 0 => valid. + */ for (i = 0; i < ext_sigcount; i++) { ext_sig = (void *)ext_header + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i; - sum = orig_sum - - (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + + sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); if (sum) { if (print_err) - pr_err("aborting, bad checksum\n"); + pr_err("Bad extended signature checksum, aborting.\n"); return -EINVAL; } } diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh index 3f20710a5b23..6988c74409a8 100644 --- a/arch/x86/kernel/cpu/mkcapflags.sh +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -1,6 +1,6 @@ #!/bin/sh # -# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeature.h +# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeatures.h # IN=$1 @@ -49,8 +49,8 @@ dump_array() trap 'rm "$OUT"' EXIT ( - echo "#ifndef _ASM_X86_CPUFEATURE_H" - echo "#include <asm/cpufeature.h>" + echo "#ifndef _ASM_X86_CPUFEATURES_H" + echo "#include <asm/cpufeatures.h>" echo "#endif" echo "" diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 20e242ea1bc4..4e7c6933691c 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -161,8 +161,8 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); - printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", - ms_hyperv.features, ms_hyperv.hints); + pr_info("HyperV: features 0x%x, hints 0x%x\n", + ms_hyperv.features, ms_hyperv.hints); #ifdef CONFIG_X86_LOCAL_APIC if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) { @@ -174,8 +174,8 @@ static void __init ms_hyperv_init_platform(void) rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); lapic_timer_frequency = hv_lapic_frequency; - printk(KERN_INFO "HyperV: LAPIC Timer Frequency: %#x\n", - lapic_timer_frequency); + pr_info("HyperV: LAPIC Timer Frequency: %#x\n", + lapic_timer_frequency); } #endif diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c index 316fe3e60a97..3d689937fc1b 100644 --- a/arch/x86/kernel/cpu/mtrr/centaur.c +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -103,7 +103,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t */ if (type != MTRR_TYPE_WRCOMB && (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) { - pr_warning("mtrr: only write-combining%s supported\n", + pr_warn("mtrr: only write-combining%s supported\n", centaur_mcr_type ? " and uncacheable are" : " is"); return -EINVAL; } diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 0d98503c2245..31e951ce6dff 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -57,9 +57,9 @@ static int __initdata nr_range; static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; static int __initdata debug_print; -#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) +#define Dprintk(x...) do { if (debug_print) pr_debug(x); } while (0) -#define BIOS_BUG_MSG KERN_WARNING \ +#define BIOS_BUG_MSG \ "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" static int __init @@ -81,9 +81,9 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, base, base + size); } if (debug_print) { - printk(KERN_DEBUG "After WB checking\n"); + pr_debug("After WB checking\n"); for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", + pr_debug("MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end); } @@ -101,7 +101,7 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { /* Var MTRR contains UC entry below 1M? Skip it: */ - printk(BIOS_BUG_MSG, i); + pr_warn(BIOS_BUG_MSG, i); if (base + size <= (1<<(20-PAGE_SHIFT))) continue; size -= (1<<(20-PAGE_SHIFT)) - base; @@ -114,11 +114,11 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, extra_remove_base + extra_remove_size); if (debug_print) { - printk(KERN_DEBUG "After UC checking\n"); + pr_debug("After UC checking\n"); for (i = 0; i < RANGE_NUM; i++) { if (!range[i].end) continue; - printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", + pr_debug("MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end); } } @@ -126,9 +126,9 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, /* sort the ranges */ nr_range = clean_sort_range(range, RANGE_NUM); if (debug_print) { - printk(KERN_DEBUG "After sorting\n"); + pr_debug("After sorting\n"); for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", + pr_debug("MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end); } @@ -544,7 +544,7 @@ static void __init print_out_mtrr_range_state(void) start_base = to_size_factor(start_base, &start_factor), type = range_state[i].type; - printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", + pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n", i, start_base, start_factor, size_base, size_factor, (type == MTRR_TYPE_UNCACHABLE) ? "UC" : @@ -713,7 +713,7 @@ int __init mtrr_cleanup(unsigned address_bits) return 0; /* Print original var MTRRs at first, for debugging: */ - printk(KERN_DEBUG "original variable MTRRs\n"); + pr_debug("original variable MTRRs\n"); print_out_mtrr_range_state(); memset(range, 0, sizeof(range)); @@ -733,7 +733,7 @@ int __init mtrr_cleanup(unsigned address_bits) x_remove_base, x_remove_size); range_sums = sum_ranges(range, nr_range); - printk(KERN_INFO "total RAM covered: %ldM\n", + pr_info("total RAM covered: %ldM\n", range_sums >> (20 - PAGE_SHIFT)); if (mtrr_chunk_size && mtrr_gran_size) { @@ -745,12 +745,11 @@ int __init mtrr_cleanup(unsigned address_bits) if (!result[i].bad) { set_var_mtrr_all(address_bits); - printk(KERN_DEBUG "New variable MTRRs\n"); + pr_debug("New variable MTRRs\n"); print_out_mtrr_range_state(); return 1; } - printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " - "will find optimal one\n"); + pr_info("invalid mtrr_gran_size or mtrr_chunk_size, will find optimal one\n"); } i = 0; @@ -768,7 +767,7 @@ int __init mtrr_cleanup(unsigned address_bits) x_remove_base, x_remove_size, i); if (debug_print) { mtrr_print_out_one_result(i); - printk(KERN_INFO "\n"); + pr_info("\n"); } i++; @@ -779,7 +778,7 @@ int __init mtrr_cleanup(unsigned address_bits) index_good = mtrr_search_optimal_index(); if (index_good != -1) { - printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); + pr_info("Found optimal setting for mtrr clean up\n"); i = index_good; mtrr_print_out_one_result(i); @@ -790,7 +789,7 @@ int __init mtrr_cleanup(unsigned address_bits) gran_size <<= 10; x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); set_var_mtrr_all(address_bits); - printk(KERN_DEBUG "New variable MTRRs\n"); + pr_debug("New variable MTRRs\n"); print_out_mtrr_range_state(); return 1; } else { @@ -799,8 +798,8 @@ int __init mtrr_cleanup(unsigned address_bits) mtrr_print_out_one_result(i); } - printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n"); - printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n"); + pr_info("mtrr_cleanup: can not find optimal value\n"); + pr_info("please specify mtrr_gran_size/mtrr_chunk_size\n"); return 0; } @@ -918,7 +917,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* kvm/qemu doesn't have mtrr set right, don't trim them all: */ if (!highest_pfn) { - printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n"); + pr_info("CPU MTRRs all blank - virtualized system.\n"); return 0; } @@ -973,7 +972,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) end_pfn); if (total_trim_size) { - pr_warning("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", total_trim_size >> 20); + pr_warn("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", + total_trim_size >> 20); if (!changed_by_mtrr_cleanup) WARN_ON(1); diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index c870af161008..19f57360dfd2 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -42,7 +42,7 @@ EXPORT_SYMBOL_GPL(mtrr_state); * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD * Opteron Processors" (26094 Rev. 3.30 February 2006), section * "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set - * to 1 during BIOS initalization of the fixed MTRRs, then cleared to + * to 1 during BIOS initialization of the fixed MTRRs, then cleared to * 0 for operation." */ static inline void k8_check_syscfg_dram_mod_en(void) @@ -55,7 +55,7 @@ static inline void k8_check_syscfg_dram_mod_en(void) rdmsr(MSR_K8_SYSCFG, lo, hi); if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) { - printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" + pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" " not cleared by BIOS, clearing this bit\n", smp_processor_id()); lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY; @@ -501,14 +501,14 @@ void __init mtrr_state_warn(void) if (!mask) return; if (mask & MTRR_CHANGE_MASK_FIXED) - pr_warning("mtrr: your CPUs had inconsistent fixed MTRR settings\n"); + pr_warn("mtrr: your CPUs had inconsistent fixed MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_VARIABLE) - pr_warning("mtrr: your CPUs had inconsistent variable MTRR settings\n"); + pr_warn("mtrr: your CPUs had inconsistent variable MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_DEFTYPE) - pr_warning("mtrr: your CPUs had inconsistent MTRRdefType settings\n"); + pr_warn("mtrr: your CPUs had inconsistent MTRRdefType settings\n"); - printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n"); - printk(KERN_INFO "mtrr: corrected configuration.\n"); + pr_info("mtrr: probably your BIOS does not setup all CPUs.\n"); + pr_info("mtrr: corrected configuration.\n"); } /* @@ -519,8 +519,7 @@ void __init mtrr_state_warn(void) void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) { if (wrmsr_safe(msr, a, b) < 0) { - printk(KERN_ERR - "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", + pr_err("MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", smp_processor_id(), msr, a, b); } } @@ -607,7 +606,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, tmp |= ~((1ULL<<(hi - 1)) - 1); if (tmp != mask) { - printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); + pr_warn("mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); mask = tmp; } @@ -858,13 +857,13 @@ int generic_validate_add_page(unsigned long base, unsigned long size, boot_cpu_data.x86_model == 1 && boot_cpu_data.x86_mask <= 7) { if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { - pr_warning("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); + pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); return -EINVAL; } if (!(base + size < 0x70000 || base > 0x7003F) && (type == MTRR_TYPE_WRCOMB || type == MTRR_TYPE_WRBACK)) { - pr_warning("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); + pr_warn("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); return -EINVAL; } } @@ -878,7 +877,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, lbase = lbase >> 1, last = last >> 1) ; if (lbase != last) { - pr_warning("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size); + pr_warn("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size); return -EINVAL; } return 0; diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 5c3d149ee91c..10f8d4796240 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -47,7 +47,7 @@ #include <linux/smp.h> #include <linux/syscore_ops.h> -#include <asm/processor.h> +#include <asm/cpufeature.h> #include <asm/e820.h> #include <asm/mtrr.h> #include <asm/msr.h> @@ -300,24 +300,24 @@ int mtrr_add_page(unsigned long base, unsigned long size, return error; if (type >= MTRR_NUM_TYPES) { - pr_warning("mtrr: type: %u invalid\n", type); + pr_warn("mtrr: type: %u invalid\n", type); return -EINVAL; } /* If the type is WC, check that this processor supports it */ if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { - pr_warning("mtrr: your processor doesn't support write-combining\n"); + pr_warn("mtrr: your processor doesn't support write-combining\n"); return -ENOSYS; } if (!size) { - pr_warning("mtrr: zero sized request\n"); + pr_warn("mtrr: zero sized request\n"); return -EINVAL; } if ((base | (base + size - 1)) >> (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) { - pr_warning("mtrr: base or size exceeds the MTRR width\n"); + pr_warn("mtrr: base or size exceeds the MTRR width\n"); return -EINVAL; } @@ -348,7 +348,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, } else if (types_compatible(type, ltype)) continue; } - pr_warning("mtrr: 0x%lx000,0x%lx000 overlaps existing" + pr_warn("mtrr: 0x%lx000,0x%lx000 overlaps existing" " 0x%lx000,0x%lx000\n", base, size, lbase, lsize); goto out; @@ -357,7 +357,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, if (ltype != type) { if (types_compatible(type, ltype)) continue; - pr_warning("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", + pr_warn("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", base, size, mtrr_attrib_to_str(ltype), mtrr_attrib_to_str(type)); goto out; @@ -395,7 +395,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, static int mtrr_check(unsigned long base, unsigned long size) { if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { - pr_warning("mtrr: size and base must be multiples of 4 kiB\n"); + pr_warn("mtrr: size and base must be multiples of 4 kiB\n"); pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base); dump_stack(); return -1; @@ -493,16 +493,16 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) } } if (reg >= max) { - pr_warning("mtrr: register: %d too big\n", reg); + pr_warn("mtrr: register: %d too big\n", reg); goto out; } mtrr_if->get(reg, &lbase, &lsize, <ype); if (lsize < 1) { - pr_warning("mtrr: MTRR %d not used\n", reg); + pr_warn("mtrr: MTRR %d not used\n", reg); goto out; } if (mtrr_usage_table[reg] < 1) { - pr_warning("mtrr: reg: %d has count=0\n", reg); + pr_warn("mtrr: reg: %d has count=0\n", reg); goto out; } if (--mtrr_usage_table[reg] < 1) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c deleted file mode 100644 index 1b443db2db50..000000000000 --- a/arch/x86/kernel/cpu/perf_event.c +++ /dev/null @@ -1,2428 +0,0 @@ -/* - * Performance events x86 architecture code - * - * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2009 Jaswinder Singh Rajput - * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra - * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> - * Copyright (C) 2009 Google, Inc., Stephane Eranian - * - * For licencing details see kernel-base/COPYING - */ - -#include <linux/perf_event.h> -#include <linux/capability.h> -#include <linux/notifier.h> -#include <linux/hardirq.h> -#include <linux/kprobes.h> -#include <linux/module.h> -#include <linux/kdebug.h> -#include <linux/sched.h> -#include <linux/uaccess.h> -#include <linux/slab.h> -#include <linux/cpu.h> -#include <linux/bitops.h> -#include <linux/device.h> - -#include <asm/apic.h> -#include <asm/stacktrace.h> -#include <asm/nmi.h> -#include <asm/smp.h> -#include <asm/alternative.h> -#include <asm/mmu_context.h> -#include <asm/tlbflush.h> -#include <asm/timer.h> -#include <asm/desc.h> -#include <asm/ldt.h> - -#include "perf_event.h" - -struct x86_pmu x86_pmu __read_mostly; - -DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { - .enabled = 1, -}; - -struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE; - -u64 __read_mostly hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX]; -u64 __read_mostly hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX]; - -/* - * Propagate event elapsed time into the generic event. - * Can only be executed on the CPU where the event is active. - * Returns the delta events processed. - */ -u64 x86_perf_event_update(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - int shift = 64 - x86_pmu.cntval_bits; - u64 prev_raw_count, new_raw_count; - int idx = hwc->idx; - s64 delta; - - if (idx == INTEL_PMC_IDX_FIXED_BTS) - return 0; - - /* - * Careful: an NMI might modify the previous event value. - * - * Our tactic to handle this is to first atomically read and - * exchange a new raw count - then add that new-prev delta - * count to the generic event atomically: - */ -again: - prev_raw_count = local64_read(&hwc->prev_count); - rdpmcl(hwc->event_base_rdpmc, new_raw_count); - - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) - goto again; - - /* - * Now we have the new raw value and have updated the prev - * timestamp already. We can now calculate the elapsed delta - * (event-)time and add that to the generic event. - * - * Careful, not all hw sign-extends above the physical width - * of the count. - */ - delta = (new_raw_count << shift) - (prev_raw_count << shift); - delta >>= shift; - - local64_add(delta, &event->count); - local64_sub(delta, &hwc->period_left); - - return new_raw_count; -} - -/* - * Find and validate any extra registers to set up. - */ -static int x86_pmu_extra_regs(u64 config, struct perf_event *event) -{ - struct hw_perf_event_extra *reg; - struct extra_reg *er; - - reg = &event->hw.extra_reg; - - if (!x86_pmu.extra_regs) - return 0; - - for (er = x86_pmu.extra_regs; er->msr; er++) { - if (er->event != (config & er->config_mask)) - continue; - if (event->attr.config1 & ~er->valid_mask) - return -EINVAL; - /* Check if the extra msrs can be safely accessed*/ - if (!er->extra_msr_access) - return -ENXIO; - - reg->idx = er->idx; - reg->config = event->attr.config1; - reg->reg = er->msr; - break; - } - return 0; -} - -static atomic_t active_events; -static atomic_t pmc_refcount; -static DEFINE_MUTEX(pmc_reserve_mutex); - -#ifdef CONFIG_X86_LOCAL_APIC - -static bool reserve_pmc_hardware(void) -{ - int i; - - for (i = 0; i < x86_pmu.num_counters; i++) { - if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) - goto perfctr_fail; - } - - for (i = 0; i < x86_pmu.num_counters; i++) { - if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) - goto eventsel_fail; - } - - return true; - -eventsel_fail: - for (i--; i >= 0; i--) - release_evntsel_nmi(x86_pmu_config_addr(i)); - - i = x86_pmu.num_counters; - -perfctr_fail: - for (i--; i >= 0; i--) - release_perfctr_nmi(x86_pmu_event_addr(i)); - - return false; -} - -static void release_pmc_hardware(void) -{ - int i; - - for (i = 0; i < x86_pmu.num_counters; i++) { - release_perfctr_nmi(x86_pmu_event_addr(i)); - release_evntsel_nmi(x86_pmu_config_addr(i)); - } -} - -#else - -static bool reserve_pmc_hardware(void) { return true; } -static void release_pmc_hardware(void) {} - -#endif - -static bool check_hw_exists(void) -{ - u64 val, val_fail, val_new= ~0; - int i, reg, reg_fail, ret = 0; - int bios_fail = 0; - int reg_safe = -1; - - /* - * Check to see if the BIOS enabled any of the counters, if so - * complain and bail. - */ - for (i = 0; i < x86_pmu.num_counters; i++) { - reg = x86_pmu_config_addr(i); - ret = rdmsrl_safe(reg, &val); - if (ret) - goto msr_fail; - if (val & ARCH_PERFMON_EVENTSEL_ENABLE) { - bios_fail = 1; - val_fail = val; - reg_fail = reg; - } else { - reg_safe = i; - } - } - - if (x86_pmu.num_counters_fixed) { - reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - ret = rdmsrl_safe(reg, &val); - if (ret) - goto msr_fail; - for (i = 0; i < x86_pmu.num_counters_fixed; i++) { - if (val & (0x03 << i*4)) { - bios_fail = 1; - val_fail = val; - reg_fail = reg; - } - } - } - - /* - * If all the counters are enabled, the below test will always - * fail. The tools will also become useless in this scenario. - * Just fail and disable the hardware counters. - */ - - if (reg_safe == -1) { - reg = reg_safe; - goto msr_fail; - } - - /* - * Read the current value, change it and read it back to see if it - * matches, this is needed to detect certain hardware emulators - * (qemu/kvm) that don't trap on the MSR access and always return 0s. - */ - reg = x86_pmu_event_addr(reg_safe); - if (rdmsrl_safe(reg, &val)) - goto msr_fail; - val ^= 0xffffUL; - ret = wrmsrl_safe(reg, val); - ret |= rdmsrl_safe(reg, &val_new); - if (ret || val != val_new) - goto msr_fail; - - /* - * We still allow the PMU driver to operate: - */ - if (bios_fail) { - printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n"); - printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg_fail, val_fail); - } - - return true; - -msr_fail: - printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); - printk("%sFailed to access perfctr msr (MSR %x is %Lx)\n", - boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR, - reg, val_new); - - return false; -} - -static void hw_perf_event_destroy(struct perf_event *event) -{ - x86_release_hardware(); - atomic_dec(&active_events); -} - -void hw_perf_lbr_event_destroy(struct perf_event *event) -{ - hw_perf_event_destroy(event); - - /* undo the lbr/bts event accounting */ - x86_del_exclusive(x86_lbr_exclusive_lbr); -} - -static inline int x86_pmu_initialized(void) -{ - return x86_pmu.handle_irq != NULL; -} - -static inline int -set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) -{ - struct perf_event_attr *attr = &event->attr; - unsigned int cache_type, cache_op, cache_result; - u64 config, val; - - config = attr->config; - - cache_type = (config >> 0) & 0xff; - if (cache_type >= PERF_COUNT_HW_CACHE_MAX) - return -EINVAL; - - cache_op = (config >> 8) & 0xff; - if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) - return -EINVAL; - - cache_result = (config >> 16) & 0xff; - if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) - return -EINVAL; - - val = hw_cache_event_ids[cache_type][cache_op][cache_result]; - - if (val == 0) - return -ENOENT; - - if (val == -1) - return -EINVAL; - - hwc->config |= val; - attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result]; - return x86_pmu_extra_regs(val, event); -} - -int x86_reserve_hardware(void) -{ - int err = 0; - - if (!atomic_inc_not_zero(&pmc_refcount)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&pmc_refcount) == 0) { - if (!reserve_pmc_hardware()) - err = -EBUSY; - else - reserve_ds_buffers(); - } - if (!err) - atomic_inc(&pmc_refcount); - mutex_unlock(&pmc_reserve_mutex); - } - - return err; -} - -void x86_release_hardware(void) -{ - if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) { - release_pmc_hardware(); - release_ds_buffers(); - mutex_unlock(&pmc_reserve_mutex); - } -} - -/* - * Check if we can create event of a certain type (that no conflicting events - * are present). - */ -int x86_add_exclusive(unsigned int what) -{ - int i; - - if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) { - mutex_lock(&pmc_reserve_mutex); - for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) { - if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i])) - goto fail_unlock; - } - atomic_inc(&x86_pmu.lbr_exclusive[what]); - mutex_unlock(&pmc_reserve_mutex); - } - - atomic_inc(&active_events); - return 0; - -fail_unlock: - mutex_unlock(&pmc_reserve_mutex); - return -EBUSY; -} - -void x86_del_exclusive(unsigned int what) -{ - atomic_dec(&x86_pmu.lbr_exclusive[what]); - atomic_dec(&active_events); -} - -int x86_setup_perfctr(struct perf_event *event) -{ - struct perf_event_attr *attr = &event->attr; - struct hw_perf_event *hwc = &event->hw; - u64 config; - - if (!is_sampling_event(event)) { - hwc->sample_period = x86_pmu.max_period; - hwc->last_period = hwc->sample_period; - local64_set(&hwc->period_left, hwc->sample_period); - } - - if (attr->type == PERF_TYPE_RAW) - return x86_pmu_extra_regs(event->attr.config, event); - - if (attr->type == PERF_TYPE_HW_CACHE) - return set_ext_hw_attr(hwc, event); - - if (attr->config >= x86_pmu.max_events) - return -EINVAL; - - /* - * The generic map: - */ - config = x86_pmu.event_map(attr->config); - - if (config == 0) - return -ENOENT; - - if (config == -1LL) - return -EINVAL; - - /* - * Branch tracing: - */ - if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && - !attr->freq && hwc->sample_period == 1) { - /* BTS is not supported by this architecture. */ - if (!x86_pmu.bts_active) - return -EOPNOTSUPP; - - /* BTS is currently only allowed for user-mode. */ - if (!attr->exclude_kernel) - return -EOPNOTSUPP; - - /* disallow bts if conflicting events are present */ - if (x86_add_exclusive(x86_lbr_exclusive_lbr)) - return -EBUSY; - - event->destroy = hw_perf_lbr_event_destroy; - } - - hwc->config |= config; - - return 0; -} - -/* - * check that branch_sample_type is compatible with - * settings needed for precise_ip > 1 which implies - * using the LBR to capture ALL taken branches at the - * priv levels of the measurement - */ -static inline int precise_br_compat(struct perf_event *event) -{ - u64 m = event->attr.branch_sample_type; - u64 b = 0; - - /* must capture all branches */ - if (!(m & PERF_SAMPLE_BRANCH_ANY)) - return 0; - - m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER; - - if (!event->attr.exclude_user) - b |= PERF_SAMPLE_BRANCH_USER; - - if (!event->attr.exclude_kernel) - b |= PERF_SAMPLE_BRANCH_KERNEL; - - /* - * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86 - */ - - return m == b; -} - -int x86_pmu_hw_config(struct perf_event *event) -{ - if (event->attr.precise_ip) { - int precise = 0; - - /* Support for constant skid */ - if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { - precise++; - - /* Support for IP fixup */ - if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) - precise++; - - if (x86_pmu.pebs_prec_dist) - precise++; - } - - if (event->attr.precise_ip > precise) - return -EOPNOTSUPP; - } - /* - * check that PEBS LBR correction does not conflict with - * whatever the user is asking with attr->branch_sample_type - */ - if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) { - u64 *br_type = &event->attr.branch_sample_type; - - if (has_branch_stack(event)) { - if (!precise_br_compat(event)) - return -EOPNOTSUPP; - - /* branch_sample_type is compatible */ - - } else { - /* - * user did not specify branch_sample_type - * - * For PEBS fixups, we capture all - * the branches at the priv level of the - * event. - */ - *br_type = PERF_SAMPLE_BRANCH_ANY; - - if (!event->attr.exclude_user) - *br_type |= PERF_SAMPLE_BRANCH_USER; - - if (!event->attr.exclude_kernel) - *br_type |= PERF_SAMPLE_BRANCH_KERNEL; - } - } - - if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK) - event->attach_state |= PERF_ATTACH_TASK_DATA; - - /* - * Generate PMC IRQs: - * (keep 'enabled' bit clear for now) - */ - event->hw.config = ARCH_PERFMON_EVENTSEL_INT; - - /* - * Count user and OS events unless requested not to - */ - if (!event->attr.exclude_user) - event->hw.config |= ARCH_PERFMON_EVENTSEL_USR; - if (!event->attr.exclude_kernel) - event->hw.config |= ARCH_PERFMON_EVENTSEL_OS; - - if (event->attr.type == PERF_TYPE_RAW) - event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; - - if (event->attr.sample_period && x86_pmu.limit_period) { - if (x86_pmu.limit_period(event, event->attr.sample_period) > - event->attr.sample_period) - return -EINVAL; - } - - return x86_setup_perfctr(event); -} - -/* - * Setup the hardware configuration for a given attr_type - */ -static int __x86_pmu_event_init(struct perf_event *event) -{ - int err; - - if (!x86_pmu_initialized()) - return -ENODEV; - - err = x86_reserve_hardware(); - if (err) - return err; - - atomic_inc(&active_events); - event->destroy = hw_perf_event_destroy; - - event->hw.idx = -1; - event->hw.last_cpu = -1; - event->hw.last_tag = ~0ULL; - - /* mark unused */ - event->hw.extra_reg.idx = EXTRA_REG_NONE; - event->hw.branch_reg.idx = EXTRA_REG_NONE; - - return x86_pmu.hw_config(event); -} - -void x86_pmu_disable_all(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int idx; - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - u64 val; - - if (!test_bit(idx, cpuc->active_mask)) - continue; - rdmsrl(x86_pmu_config_addr(idx), val); - if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) - continue; - val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(x86_pmu_config_addr(idx), val); - } -} - -static void x86_pmu_disable(struct pmu *pmu) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (!x86_pmu_initialized()) - return; - - if (!cpuc->enabled) - return; - - cpuc->n_added = 0; - cpuc->enabled = 0; - barrier(); - - x86_pmu.disable_all(); -} - -void x86_pmu_enable_all(int added) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int idx; - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct hw_perf_event *hwc = &cpuc->events[idx]->hw; - - if (!test_bit(idx, cpuc->active_mask)) - continue; - - __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); - } -} - -static struct pmu pmu; - -static inline int is_x86_event(struct perf_event *event) -{ - return event->pmu == &pmu; -} - -/* - * Event scheduler state: - * - * Assign events iterating over all events and counters, beginning - * with events with least weights first. Keep the current iterator - * state in struct sched_state. - */ -struct sched_state { - int weight; - int event; /* event index */ - int counter; /* counter index */ - int unassigned; /* number of events to be assigned left */ - int nr_gp; /* number of GP counters used */ - unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; -}; - -/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */ -#define SCHED_STATES_MAX 2 - -struct perf_sched { - int max_weight; - int max_events; - int max_gp; - int saved_states; - struct event_constraint **constraints; - struct sched_state state; - struct sched_state saved[SCHED_STATES_MAX]; -}; - -/* - * Initialize interator that runs through all events and counters. - */ -static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, - int num, int wmin, int wmax, int gpmax) -{ - int idx; - - memset(sched, 0, sizeof(*sched)); - sched->max_events = num; - sched->max_weight = wmax; - sched->max_gp = gpmax; - sched->constraints = constraints; - - for (idx = 0; idx < num; idx++) { - if (constraints[idx]->weight == wmin) - break; - } - - sched->state.event = idx; /* start with min weight */ - sched->state.weight = wmin; - sched->state.unassigned = num; -} - -static void perf_sched_save_state(struct perf_sched *sched) -{ - if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) - return; - - sched->saved[sched->saved_states] = sched->state; - sched->saved_states++; -} - -static bool perf_sched_restore_state(struct perf_sched *sched) -{ - if (!sched->saved_states) - return false; - - sched->saved_states--; - sched->state = sched->saved[sched->saved_states]; - - /* continue with next counter: */ - clear_bit(sched->state.counter++, sched->state.used); - - return true; -} - -/* - * Select a counter for the current event to schedule. Return true on - * success. - */ -static bool __perf_sched_find_counter(struct perf_sched *sched) -{ - struct event_constraint *c; - int idx; - - if (!sched->state.unassigned) - return false; - - if (sched->state.event >= sched->max_events) - return false; - - c = sched->constraints[sched->state.event]; - /* Prefer fixed purpose counters */ - if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { - idx = INTEL_PMC_IDX_FIXED; - for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { - if (!__test_and_set_bit(idx, sched->state.used)) - goto done; - } - } - - /* Grab the first unused counter starting with idx */ - idx = sched->state.counter; - for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { - if (!__test_and_set_bit(idx, sched->state.used)) { - if (sched->state.nr_gp++ >= sched->max_gp) - return false; - - goto done; - } - } - - return false; - -done: - sched->state.counter = idx; - - if (c->overlap) - perf_sched_save_state(sched); - - return true; -} - -static bool perf_sched_find_counter(struct perf_sched *sched) -{ - while (!__perf_sched_find_counter(sched)) { - if (!perf_sched_restore_state(sched)) - return false; - } - - return true; -} - -/* - * Go through all unassigned events and find the next one to schedule. - * Take events with the least weight first. Return true on success. - */ -static bool perf_sched_next_event(struct perf_sched *sched) -{ - struct event_constraint *c; - - if (!sched->state.unassigned || !--sched->state.unassigned) - return false; - - do { - /* next event */ - sched->state.event++; - if (sched->state.event >= sched->max_events) { - /* next weight */ - sched->state.event = 0; - sched->state.weight++; - if (sched->state.weight > sched->max_weight) - return false; - } - c = sched->constraints[sched->state.event]; - } while (c->weight != sched->state.weight); - - sched->state.counter = 0; /* start with first counter */ - - return true; -} - -/* - * Assign a counter for each event. - */ -int perf_assign_events(struct event_constraint **constraints, int n, - int wmin, int wmax, int gpmax, int *assign) -{ - struct perf_sched sched; - - perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax); - - do { - if (!perf_sched_find_counter(&sched)) - break; /* failed */ - if (assign) - assign[sched.state.event] = sched.state.counter; - } while (perf_sched_next_event(&sched)); - - return sched.state.unassigned; -} -EXPORT_SYMBOL_GPL(perf_assign_events); - -int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) -{ - struct event_constraint *c; - unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - struct perf_event *e; - int i, wmin, wmax, unsched = 0; - struct hw_perf_event *hwc; - - bitmap_zero(used_mask, X86_PMC_IDX_MAX); - - if (x86_pmu.start_scheduling) - x86_pmu.start_scheduling(cpuc); - - for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { - cpuc->event_constraint[i] = NULL; - c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); - cpuc->event_constraint[i] = c; - - wmin = min(wmin, c->weight); - wmax = max(wmax, c->weight); - } - - /* - * fastpath, try to reuse previous register - */ - for (i = 0; i < n; i++) { - hwc = &cpuc->event_list[i]->hw; - c = cpuc->event_constraint[i]; - - /* never assigned */ - if (hwc->idx == -1) - break; - - /* constraint still honored */ - if (!test_bit(hwc->idx, c->idxmsk)) - break; - - /* not already used */ - if (test_bit(hwc->idx, used_mask)) - break; - - __set_bit(hwc->idx, used_mask); - if (assign) - assign[i] = hwc->idx; - } - - /* slow path */ - if (i != n) { - int gpmax = x86_pmu.num_counters; - - /* - * Do not allow scheduling of more than half the available - * generic counters. - * - * This helps avoid counter starvation of sibling thread by - * ensuring at most half the counters cannot be in exclusive - * mode. There is no designated counters for the limits. Any - * N/2 counters can be used. This helps with events with - * specific counter constraints. - */ - if (is_ht_workaround_enabled() && !cpuc->is_fake && - READ_ONCE(cpuc->excl_cntrs->exclusive_present)) - gpmax /= 2; - - unsched = perf_assign_events(cpuc->event_constraint, n, wmin, - wmax, gpmax, assign); - } - - /* - * In case of success (unsched = 0), mark events as committed, - * so we do not put_constraint() in case new events are added - * and fail to be scheduled - * - * We invoke the lower level commit callback to lock the resource - * - * We do not need to do all of this in case we are called to - * validate an event group (assign == NULL) - */ - if (!unsched && assign) { - for (i = 0; i < n; i++) { - e = cpuc->event_list[i]; - e->hw.flags |= PERF_X86_EVENT_COMMITTED; - if (x86_pmu.commit_scheduling) - x86_pmu.commit_scheduling(cpuc, i, assign[i]); - } - } else { - for (i = 0; i < n; i++) { - e = cpuc->event_list[i]; - /* - * do not put_constraint() on comitted events, - * because they are good to go - */ - if ((e->hw.flags & PERF_X86_EVENT_COMMITTED)) - continue; - - /* - * release events that failed scheduling - */ - if (x86_pmu.put_event_constraints) - x86_pmu.put_event_constraints(cpuc, e); - } - } - - if (x86_pmu.stop_scheduling) - x86_pmu.stop_scheduling(cpuc); - - return unsched ? -EINVAL : 0; -} - -/* - * dogrp: true if must collect siblings events (group) - * returns total number of events and error code - */ -static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) -{ - struct perf_event *event; - int n, max_count; - - max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed; - - /* current number of events already accepted */ - n = cpuc->n_events; - - if (is_x86_event(leader)) { - if (n >= max_count) - return -EINVAL; - cpuc->event_list[n] = leader; - n++; - } - if (!dogrp) - return n; - - list_for_each_entry(event, &leader->sibling_list, group_entry) { - if (!is_x86_event(event) || - event->state <= PERF_EVENT_STATE_OFF) - continue; - - if (n >= max_count) - return -EINVAL; - - cpuc->event_list[n] = event; - n++; - } - return n; -} - -static inline void x86_assign_hw_event(struct perf_event *event, - struct cpu_hw_events *cpuc, int i) -{ - struct hw_perf_event *hwc = &event->hw; - - hwc->idx = cpuc->assign[i]; - hwc->last_cpu = smp_processor_id(); - hwc->last_tag = ++cpuc->tags[i]; - - if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) { - hwc->config_base = 0; - hwc->event_base = 0; - } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) { - hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED); - hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30; - } else { - hwc->config_base = x86_pmu_config_addr(hwc->idx); - hwc->event_base = x86_pmu_event_addr(hwc->idx); - hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx); - } -} - -static inline int match_prev_assignment(struct hw_perf_event *hwc, - struct cpu_hw_events *cpuc, - int i) -{ - return hwc->idx == cpuc->assign[i] && - hwc->last_cpu == smp_processor_id() && - hwc->last_tag == cpuc->tags[i]; -} - -static void x86_pmu_start(struct perf_event *event, int flags); - -static void x86_pmu_enable(struct pmu *pmu) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct perf_event *event; - struct hw_perf_event *hwc; - int i, added = cpuc->n_added; - - if (!x86_pmu_initialized()) - return; - - if (cpuc->enabled) - return; - - if (cpuc->n_added) { - int n_running = cpuc->n_events - cpuc->n_added; - /* - * apply assignment obtained either from - * hw_perf_group_sched_in() or x86_pmu_enable() - * - * step1: save events moving to new counters - */ - for (i = 0; i < n_running; i++) { - event = cpuc->event_list[i]; - hwc = &event->hw; - - /* - * we can avoid reprogramming counter if: - * - assigned same counter as last time - * - running on same CPU as last time - * - no other event has used the counter since - */ - if (hwc->idx == -1 || - match_prev_assignment(hwc, cpuc, i)) - continue; - - /* - * Ensure we don't accidentally enable a stopped - * counter simply because we rescheduled. - */ - if (hwc->state & PERF_HES_STOPPED) - hwc->state |= PERF_HES_ARCH; - - x86_pmu_stop(event, PERF_EF_UPDATE); - } - - /* - * step2: reprogram moved events into new counters - */ - for (i = 0; i < cpuc->n_events; i++) { - event = cpuc->event_list[i]; - hwc = &event->hw; - - if (!match_prev_assignment(hwc, cpuc, i)) - x86_assign_hw_event(event, cpuc, i); - else if (i < n_running) - continue; - - if (hwc->state & PERF_HES_ARCH) - continue; - - x86_pmu_start(event, PERF_EF_RELOAD); - } - cpuc->n_added = 0; - perf_events_lapic_init(); - } - - cpuc->enabled = 1; - barrier(); - - x86_pmu.enable_all(added); -} - -static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); - -/* - * Set the next IRQ period, based on the hwc->period_left value. - * To be called with the event disabled in hw: - */ -int x86_perf_event_set_period(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - s64 left = local64_read(&hwc->period_left); - s64 period = hwc->sample_period; - int ret = 0, idx = hwc->idx; - - if (idx == INTEL_PMC_IDX_FIXED_BTS) - return 0; - - /* - * If we are way outside a reasonable range then just skip forward: - */ - if (unlikely(left <= -period)) { - left = period; - local64_set(&hwc->period_left, left); - hwc->last_period = period; - ret = 1; - } - - if (unlikely(left <= 0)) { - left += period; - local64_set(&hwc->period_left, left); - hwc->last_period = period; - ret = 1; - } - /* - * Quirk: certain CPUs dont like it if just 1 hw_event is left: - */ - if (unlikely(left < 2)) - left = 2; - - if (left > x86_pmu.max_period) - left = x86_pmu.max_period; - - if (x86_pmu.limit_period) - left = x86_pmu.limit_period(event, left); - - per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; - - if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) || - local64_read(&hwc->prev_count) != (u64)-left) { - /* - * The hw event starts counting from this event offset, - * mark it to be able to extra future deltas: - */ - local64_set(&hwc->prev_count, (u64)-left); - - wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); - } - - /* - * Due to erratum on certan cpu we need - * a second write to be sure the register - * is updated properly - */ - if (x86_pmu.perfctr_second_write) { - wrmsrl(hwc->event_base, - (u64)(-left) & x86_pmu.cntval_mask); - } - - perf_event_update_userpage(event); - - return ret; -} - -void x86_pmu_enable_event(struct perf_event *event) -{ - if (__this_cpu_read(cpu_hw_events.enabled)) - __x86_pmu_enable_event(&event->hw, - ARCH_PERFMON_EVENTSEL_ENABLE); -} - -/* - * Add a single event to the PMU. - * - * The event is added to the group of enabled events - * but only if it can be scehduled with existing events. - */ -static int x86_pmu_add(struct perf_event *event, int flags) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct hw_perf_event *hwc; - int assign[X86_PMC_IDX_MAX]; - int n, n0, ret; - - hwc = &event->hw; - - n0 = cpuc->n_events; - ret = n = collect_events(cpuc, event, false); - if (ret < 0) - goto out; - - hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - if (!(flags & PERF_EF_START)) - hwc->state |= PERF_HES_ARCH; - - /* - * If group events scheduling transaction was started, - * skip the schedulability test here, it will be performed - * at commit time (->commit_txn) as a whole. - */ - if (cpuc->txn_flags & PERF_PMU_TXN_ADD) - goto done_collect; - - ret = x86_pmu.schedule_events(cpuc, n, assign); - if (ret) - goto out; - /* - * copy new assignment, now we know it is possible - * will be used by hw_perf_enable() - */ - memcpy(cpuc->assign, assign, n*sizeof(int)); - -done_collect: - /* - * Commit the collect_events() state. See x86_pmu_del() and - * x86_pmu_*_txn(). - */ - cpuc->n_events = n; - cpuc->n_added += n - n0; - cpuc->n_txn += n - n0; - - ret = 0; -out: - return ret; -} - -static void x86_pmu_start(struct perf_event *event, int flags) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int idx = event->hw.idx; - - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) - return; - - if (WARN_ON_ONCE(idx == -1)) - return; - - if (flags & PERF_EF_RELOAD) { - WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); - x86_perf_event_set_period(event); - } - - event->hw.state = 0; - - cpuc->events[idx] = event; - __set_bit(idx, cpuc->active_mask); - __set_bit(idx, cpuc->running); - x86_pmu.enable(event); - perf_event_update_userpage(event); -} - -void perf_event_print_debug(void) -{ - u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; - u64 pebs, debugctl; - struct cpu_hw_events *cpuc; - unsigned long flags; - int cpu, idx; - - if (!x86_pmu.num_counters) - return; - - local_irq_save(flags); - - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_events, cpu); - - if (x86_pmu.version >= 2) { - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); - rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); - - pr_info("\n"); - pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); - pr_info("CPU#%d: status: %016llx\n", cpu, status); - pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); - pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); - if (x86_pmu.pebs_constraints) { - rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); - pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); - } - if (x86_pmu.lbr_nr) { - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); - pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl); - } - } - pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl); - rdmsrl(x86_pmu_event_addr(idx), pmc_count); - - prev_left = per_cpu(pmc_prev_left[idx], cpu); - - pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", - cpu, idx, pmc_ctrl); - pr_info("CPU#%d: gen-PMC%d count: %016llx\n", - cpu, idx, pmc_count); - pr_info("CPU#%d: gen-PMC%d left: %016llx\n", - cpu, idx, prev_left); - } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { - rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); - - pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", - cpu, idx, pmc_count); - } - local_irq_restore(flags); -} - -void x86_pmu_stop(struct perf_event *event, int flags) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - - if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) { - x86_pmu.disable(event); - cpuc->events[hwc->idx] = NULL; - WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); - hwc->state |= PERF_HES_STOPPED; - } - - if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - /* - * Drain the remaining delta count out of a event - * that we are disabling: - */ - x86_perf_event_update(event); - hwc->state |= PERF_HES_UPTODATE; - } -} - -static void x86_pmu_del(struct perf_event *event, int flags) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int i; - - /* - * event is descheduled - */ - event->hw.flags &= ~PERF_X86_EVENT_COMMITTED; - - /* - * If we're called during a txn, we don't need to do anything. - * The events never got scheduled and ->cancel_txn will truncate - * the event_list. - * - * XXX assumes any ->del() called during a TXN will only be on - * an event added during that same TXN. - */ - if (cpuc->txn_flags & PERF_PMU_TXN_ADD) - return; - - /* - * Not a TXN, therefore cleanup properly. - */ - x86_pmu_stop(event, PERF_EF_UPDATE); - - for (i = 0; i < cpuc->n_events; i++) { - if (event == cpuc->event_list[i]) - break; - } - - if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */ - return; - - /* If we have a newly added event; make sure to decrease n_added. */ - if (i >= cpuc->n_events - cpuc->n_added) - --cpuc->n_added; - - if (x86_pmu.put_event_constraints) - x86_pmu.put_event_constraints(cpuc, event); - - /* Delete the array entry. */ - while (++i < cpuc->n_events) { - cpuc->event_list[i-1] = cpuc->event_list[i]; - cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; - } - --cpuc->n_events; - - perf_event_update_userpage(event); -} - -int x86_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - struct perf_event *event; - int idx, handled = 0; - u64 val; - - cpuc = this_cpu_ptr(&cpu_hw_events); - - /* - * Some chipsets need to unmask the LVTPC in a particular spot - * inside the nmi handler. As a result, the unmasking was pushed - * into all the nmi handlers. - * - * This generic handler doesn't seem to have any issues where the - * unmasking occurs so it was left at the top. - */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - if (!test_bit(idx, cpuc->active_mask)) { - /* - * Though we deactivated the counter some cpus - * might still deliver spurious interrupts still - * in flight. Catch them: - */ - if (__test_and_clear_bit(idx, cpuc->running)) - handled++; - continue; - } - - event = cpuc->events[idx]; - - val = x86_perf_event_update(event); - if (val & (1ULL << (x86_pmu.cntval_bits - 1))) - continue; - - /* - * event overflow - */ - handled++; - perf_sample_data_init(&data, 0, event->hw.last_period); - - if (!x86_perf_event_set_period(event)) - continue; - - if (perf_event_overflow(event, &data, regs)) - x86_pmu_stop(event, 0); - } - - if (handled) - inc_irq_stat(apic_perf_irqs); - - return handled; -} - -void perf_events_lapic_init(void) -{ - if (!x86_pmu.apic || !x86_pmu_initialized()) - return; - - /* - * Always use NMI for PMU - */ - apic_write(APIC_LVTPC, APIC_DM_NMI); -} - -static int -perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) -{ - u64 start_clock; - u64 finish_clock; - int ret; - - /* - * All PMUs/events that share this PMI handler should make sure to - * increment active_events for their events. - */ - if (!atomic_read(&active_events)) - return NMI_DONE; - - start_clock = sched_clock(); - ret = x86_pmu.handle_irq(regs); - finish_clock = sched_clock(); - - perf_sample_event_took(finish_clock - start_clock); - - return ret; -} -NOKPROBE_SYMBOL(perf_event_nmi_handler); - -struct event_constraint emptyconstraint; -struct event_constraint unconstrained; - -static int -x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - int i, ret = NOTIFY_OK; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) - cpuc->kfree_on_online[i] = NULL; - if (x86_pmu.cpu_prepare) - ret = x86_pmu.cpu_prepare(cpu); - break; - - case CPU_STARTING: - if (x86_pmu.cpu_starting) - x86_pmu.cpu_starting(cpu); - break; - - case CPU_ONLINE: - for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) { - kfree(cpuc->kfree_on_online[i]); - cpuc->kfree_on_online[i] = NULL; - } - break; - - case CPU_DYING: - if (x86_pmu.cpu_dying) - x86_pmu.cpu_dying(cpu); - break; - - case CPU_UP_CANCELED: - case CPU_DEAD: - if (x86_pmu.cpu_dead) - x86_pmu.cpu_dead(cpu); - break; - - default: - break; - } - - return ret; -} - -static void __init pmu_check_apic(void) -{ - if (cpu_has_apic) - return; - - x86_pmu.apic = 0; - pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); - pr_info("no hardware sampling interrupt available.\n"); - - /* - * If we have a PMU initialized but no APIC - * interrupts, we cannot sample hardware - * events (user-space has to fall back and - * sample via a hrtimer based software event): - */ - pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; - -} - -static struct attribute_group x86_pmu_format_group = { - .name = "format", - .attrs = NULL, -}; - -/* - * Remove all undefined events (x86_pmu.event_map(id) == 0) - * out of events_attr attributes. - */ -static void __init filter_events(struct attribute **attrs) -{ - struct device_attribute *d; - struct perf_pmu_events_attr *pmu_attr; - int offset = 0; - int i, j; - - for (i = 0; attrs[i]; i++) { - d = (struct device_attribute *)attrs[i]; - pmu_attr = container_of(d, struct perf_pmu_events_attr, attr); - /* str trumps id */ - if (pmu_attr->event_str) - continue; - if (x86_pmu.event_map(i + offset)) - continue; - - for (j = i; attrs[j]; j++) - attrs[j] = attrs[j + 1]; - - /* Check the shifted attr. */ - i--; - - /* - * event_map() is index based, the attrs array is organized - * by increasing event index. If we shift the events, then - * we need to compensate for the event_map(), otherwise - * we are looking up the wrong event in the map - */ - offset++; - } -} - -/* Merge two pointer arrays */ -__init struct attribute **merge_attr(struct attribute **a, struct attribute **b) -{ - struct attribute **new; - int j, i; - - for (j = 0; a[j]; j++) - ; - for (i = 0; b[i]; i++) - j++; - j++; - - new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL); - if (!new) - return NULL; - - j = 0; - for (i = 0; a[i]; i++) - new[j++] = a[i]; - for (i = 0; b[i]; i++) - new[j++] = b[i]; - new[j] = NULL; - - return new; -} - -ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, - char *page) -{ - struct perf_pmu_events_attr *pmu_attr = \ - container_of(attr, struct perf_pmu_events_attr, attr); - u64 config = x86_pmu.event_map(pmu_attr->id); - - /* string trumps id */ - if (pmu_attr->event_str) - return sprintf(page, "%s", pmu_attr->event_str); - - return x86_pmu.events_sysfs_show(page, config); -} - -EVENT_ATTR(cpu-cycles, CPU_CYCLES ); -EVENT_ATTR(instructions, INSTRUCTIONS ); -EVENT_ATTR(cache-references, CACHE_REFERENCES ); -EVENT_ATTR(cache-misses, CACHE_MISSES ); -EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS ); -EVENT_ATTR(branch-misses, BRANCH_MISSES ); -EVENT_ATTR(bus-cycles, BUS_CYCLES ); -EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND ); -EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND ); -EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); - -static struct attribute *empty_attrs; - -static struct attribute *events_attr[] = { - EVENT_PTR(CPU_CYCLES), - EVENT_PTR(INSTRUCTIONS), - EVENT_PTR(CACHE_REFERENCES), - EVENT_PTR(CACHE_MISSES), - EVENT_PTR(BRANCH_INSTRUCTIONS), - EVENT_PTR(BRANCH_MISSES), - EVENT_PTR(BUS_CYCLES), - EVENT_PTR(STALLED_CYCLES_FRONTEND), - EVENT_PTR(STALLED_CYCLES_BACKEND), - EVENT_PTR(REF_CPU_CYCLES), - NULL, -}; - -static struct attribute_group x86_pmu_events_group = { - .name = "events", - .attrs = events_attr, -}; - -ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) -{ - u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; - u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; - bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE); - bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL); - bool any = (config & ARCH_PERFMON_EVENTSEL_ANY); - bool inv = (config & ARCH_PERFMON_EVENTSEL_INV); - ssize_t ret; - - /* - * We have whole page size to spend and just little data - * to write, so we can safely use sprintf. - */ - ret = sprintf(page, "event=0x%02llx", event); - - if (umask) - ret += sprintf(page + ret, ",umask=0x%02llx", umask); - - if (edge) - ret += sprintf(page + ret, ",edge"); - - if (pc) - ret += sprintf(page + ret, ",pc"); - - if (any) - ret += sprintf(page + ret, ",any"); - - if (inv) - ret += sprintf(page + ret, ",inv"); - - if (cmask) - ret += sprintf(page + ret, ",cmask=0x%02llx", cmask); - - ret += sprintf(page + ret, "\n"); - - return ret; -} - -static int __init init_hw_perf_events(void) -{ - struct x86_pmu_quirk *quirk; - int err; - - pr_info("Performance Events: "); - - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - err = intel_pmu_init(); - break; - case X86_VENDOR_AMD: - err = amd_pmu_init(); - break; - default: - err = -ENOTSUPP; - } - if (err != 0) { - pr_cont("no PMU driver, software events only.\n"); - return 0; - } - - pmu_check_apic(); - - /* sanity check that the hardware exists or is emulated */ - if (!check_hw_exists()) - return 0; - - pr_cont("%s PMU driver.\n", x86_pmu.name); - - x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ - - for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) - quirk->func(); - - if (!x86_pmu.intel_ctrl) - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; - - perf_events_lapic_init(); - register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); - - unconstrained = (struct event_constraint) - __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, - 0, x86_pmu.num_counters, 0, 0); - - x86_pmu_format_group.attrs = x86_pmu.format_attrs; - - if (x86_pmu.event_attrs) - x86_pmu_events_group.attrs = x86_pmu.event_attrs; - - if (!x86_pmu.events_sysfs_show) - x86_pmu_events_group.attrs = &empty_attrs; - else - filter_events(x86_pmu_events_group.attrs); - - if (x86_pmu.cpu_events) { - struct attribute **tmp; - - tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events); - if (!WARN_ON(!tmp)) - x86_pmu_events_group.attrs = tmp; - } - - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.cntval_bits); - pr_info("... generic registers: %d\n", x86_pmu.num_counters); - pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); - pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); - - perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); - perf_cpu_notifier(x86_pmu_notifier); - - return 0; -} -early_initcall(init_hw_perf_events); - -static inline void x86_pmu_read(struct perf_event *event) -{ - x86_perf_event_update(event); -} - -/* - * Start group events scheduling transaction - * Set the flag to make pmu::enable() not perform the - * schedulability test, it will be performed at commit time - * - * We only support PERF_PMU_TXN_ADD transactions. Save the - * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD - * transactions. - */ -static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */ - - cpuc->txn_flags = txn_flags; - if (txn_flags & ~PERF_PMU_TXN_ADD) - return; - - perf_pmu_disable(pmu); - __this_cpu_write(cpu_hw_events.n_txn, 0); -} - -/* - * Stop group events scheduling transaction - * Clear the flag and pmu::enable() will perform the - * schedulability test. - */ -static void x86_pmu_cancel_txn(struct pmu *pmu) -{ - unsigned int txn_flags; - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ - - txn_flags = cpuc->txn_flags; - cpuc->txn_flags = 0; - if (txn_flags & ~PERF_PMU_TXN_ADD) - return; - - /* - * Truncate collected array by the number of events added in this - * transaction. See x86_pmu_add() and x86_pmu_*_txn(). - */ - __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); - __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); - perf_pmu_enable(pmu); -} - -/* - * Commit group events scheduling transaction - * Perform the group schedulability test as a whole - * Return 0 if success - * - * Does not cancel the transaction on failure; expects the caller to do this. - */ -static int x86_pmu_commit_txn(struct pmu *pmu) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int assign[X86_PMC_IDX_MAX]; - int n, ret; - - WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ - - if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) { - cpuc->txn_flags = 0; - return 0; - } - - n = cpuc->n_events; - - if (!x86_pmu_initialized()) - return -EAGAIN; - - ret = x86_pmu.schedule_events(cpuc, n, assign); - if (ret) - return ret; - - /* - * copy new assignment, now we know it is possible - * will be used by hw_perf_enable() - */ - memcpy(cpuc->assign, assign, n*sizeof(int)); - - cpuc->txn_flags = 0; - perf_pmu_enable(pmu); - return 0; -} -/* - * a fake_cpuc is used to validate event groups. Due to - * the extra reg logic, we need to also allocate a fake - * per_core and per_cpu structure. Otherwise, group events - * using extra reg may conflict without the kernel being - * able to catch this when the last event gets added to - * the group. - */ -static void free_fake_cpuc(struct cpu_hw_events *cpuc) -{ - kfree(cpuc->shared_regs); - kfree(cpuc); -} - -static struct cpu_hw_events *allocate_fake_cpuc(void) -{ - struct cpu_hw_events *cpuc; - int cpu = raw_smp_processor_id(); - - cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL); - if (!cpuc) - return ERR_PTR(-ENOMEM); - - /* only needed, if we have extra_regs */ - if (x86_pmu.extra_regs) { - cpuc->shared_regs = allocate_shared_regs(cpu); - if (!cpuc->shared_regs) - goto error; - } - cpuc->is_fake = 1; - return cpuc; -error: - free_fake_cpuc(cpuc); - return ERR_PTR(-ENOMEM); -} - -/* - * validate that we can schedule this event - */ -static int validate_event(struct perf_event *event) -{ - struct cpu_hw_events *fake_cpuc; - struct event_constraint *c; - int ret = 0; - - fake_cpuc = allocate_fake_cpuc(); - if (IS_ERR(fake_cpuc)) - return PTR_ERR(fake_cpuc); - - c = x86_pmu.get_event_constraints(fake_cpuc, -1, event); - - if (!c || !c->weight) - ret = -EINVAL; - - if (x86_pmu.put_event_constraints) - x86_pmu.put_event_constraints(fake_cpuc, event); - - free_fake_cpuc(fake_cpuc); - - return ret; -} - -/* - * validate a single event group - * - * validation include: - * - check events are compatible which each other - * - events do not compete for the same counter - * - number of events <= number of counters - * - * validation ensures the group can be loaded onto the - * PMU if it was the only group available. - */ -static int validate_group(struct perf_event *event) -{ - struct perf_event *leader = event->group_leader; - struct cpu_hw_events *fake_cpuc; - int ret = -EINVAL, n; - - fake_cpuc = allocate_fake_cpuc(); - if (IS_ERR(fake_cpuc)) - return PTR_ERR(fake_cpuc); - /* - * the event is not yet connected with its - * siblings therefore we must first collect - * existing siblings, then add the new event - * before we can simulate the scheduling - */ - n = collect_events(fake_cpuc, leader, true); - if (n < 0) - goto out; - - fake_cpuc->n_events = n; - n = collect_events(fake_cpuc, event, false); - if (n < 0) - goto out; - - fake_cpuc->n_events = n; - - ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); - -out: - free_fake_cpuc(fake_cpuc); - return ret; -} - -static int x86_pmu_event_init(struct perf_event *event) -{ - struct pmu *tmp; - int err; - - switch (event->attr.type) { - case PERF_TYPE_RAW: - case PERF_TYPE_HARDWARE: - case PERF_TYPE_HW_CACHE: - break; - - default: - return -ENOENT; - } - - err = __x86_pmu_event_init(event); - if (!err) { - /* - * we temporarily connect event to its pmu - * such that validate_group() can classify - * it as an x86 event using is_x86_event() - */ - tmp = event->pmu; - event->pmu = &pmu; - - if (event->group_leader != event) - err = validate_group(event); - else - err = validate_event(event); - - event->pmu = tmp; - } - if (err) { - if (event->destroy) - event->destroy(event); - } - - if (ACCESS_ONCE(x86_pmu.attr_rdpmc)) - event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED; - - return err; -} - -static void refresh_pce(void *ignored) -{ - if (current->mm) - load_mm_cr4(current->mm); -} - -static void x86_pmu_event_mapped(struct perf_event *event) -{ - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) - return; - - if (atomic_inc_return(¤t->mm->context.perf_rdpmc_allowed) == 1) - on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); -} - -static void x86_pmu_event_unmapped(struct perf_event *event) -{ - if (!current->mm) - return; - - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) - return; - - if (atomic_dec_and_test(¤t->mm->context.perf_rdpmc_allowed)) - on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); -} - -static int x86_pmu_event_idx(struct perf_event *event) -{ - int idx = event->hw.idx; - - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) - return 0; - - if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { - idx -= INTEL_PMC_IDX_FIXED; - idx |= 1 << 30; - } - - return idx + 1; -} - -static ssize_t get_attr_rdpmc(struct device *cdev, - struct device_attribute *attr, - char *buf) -{ - return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); -} - -static ssize_t set_attr_rdpmc(struct device *cdev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - unsigned long val; - ssize_t ret; - - ret = kstrtoul(buf, 0, &val); - if (ret) - return ret; - - if (val > 2) - return -EINVAL; - - if (x86_pmu.attr_rdpmc_broken) - return -ENOTSUPP; - - if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) { - /* - * Changing into or out of always available, aka - * perf-event-bypassing mode. This path is extremely slow, - * but only root can trigger it, so it's okay. - */ - if (val == 2) - static_key_slow_inc(&rdpmc_always_available); - else - static_key_slow_dec(&rdpmc_always_available); - on_each_cpu(refresh_pce, NULL, 1); - } - - x86_pmu.attr_rdpmc = val; - - return count; -} - -static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); - -static struct attribute *x86_pmu_attrs[] = { - &dev_attr_rdpmc.attr, - NULL, -}; - -static struct attribute_group x86_pmu_attr_group = { - .attrs = x86_pmu_attrs, -}; - -static const struct attribute_group *x86_pmu_attr_groups[] = { - &x86_pmu_attr_group, - &x86_pmu_format_group, - &x86_pmu_events_group, - NULL, -}; - -static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) -{ - if (x86_pmu.sched_task) - x86_pmu.sched_task(ctx, sched_in); -} - -void perf_check_microcode(void) -{ - if (x86_pmu.check_microcode) - x86_pmu.check_microcode(); -} -EXPORT_SYMBOL_GPL(perf_check_microcode); - -static struct pmu pmu = { - .pmu_enable = x86_pmu_enable, - .pmu_disable = x86_pmu_disable, - - .attr_groups = x86_pmu_attr_groups, - - .event_init = x86_pmu_event_init, - - .event_mapped = x86_pmu_event_mapped, - .event_unmapped = x86_pmu_event_unmapped, - - .add = x86_pmu_add, - .del = x86_pmu_del, - .start = x86_pmu_start, - .stop = x86_pmu_stop, - .read = x86_pmu_read, - - .start_txn = x86_pmu_start_txn, - .cancel_txn = x86_pmu_cancel_txn, - .commit_txn = x86_pmu_commit_txn, - - .event_idx = x86_pmu_event_idx, - .sched_task = x86_pmu_sched_task, - .task_ctx_size = sizeof(struct x86_perf_task_context), -}; - -void arch_perf_update_userpage(struct perf_event *event, - struct perf_event_mmap_page *userpg, u64 now) -{ - struct cyc2ns_data *data; - - userpg->cap_user_time = 0; - userpg->cap_user_time_zero = 0; - userpg->cap_user_rdpmc = - !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED); - userpg->pmc_width = x86_pmu.cntval_bits; - - if (!sched_clock_stable()) - return; - - data = cyc2ns_read_begin(); - - /* - * Internal timekeeping for enabled/running/stopped times - * is always in the local_clock domain. - */ - userpg->cap_user_time = 1; - userpg->time_mult = data->cyc2ns_mul; - userpg->time_shift = data->cyc2ns_shift; - userpg->time_offset = data->cyc2ns_offset - now; - - /* - * cap_user_time_zero doesn't make sense when we're using a different - * time base for the records. - */ - if (event->clock == &local_clock) { - userpg->cap_user_time_zero = 1; - userpg->time_zero = data->cyc2ns_offset; - } - - cyc2ns_read_end(data); -} - -/* - * callchain support - */ - -static int backtrace_stack(void *data, char *name) -{ - return 0; -} - -static void backtrace_address(void *data, unsigned long addr, int reliable) -{ - struct perf_callchain_entry *entry = data; - - perf_callchain_store(entry, addr); -} - -static const struct stacktrace_ops backtrace_ops = { - .stack = backtrace_stack, - .address = backtrace_address, - .walk_stack = print_context_stack_bp, -}; - -void -perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) -{ - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - /* TODO: We don't support guest os callchain now */ - return; - } - - perf_callchain_store(entry, regs->ip); - - dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); -} - -static inline int -valid_user_frame(const void __user *fp, unsigned long size) -{ - return (__range_not_ok(fp, size, TASK_SIZE) == 0); -} - -static unsigned long get_segment_base(unsigned int segment) -{ - struct desc_struct *desc; - int idx = segment >> 3; - - if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { -#ifdef CONFIG_MODIFY_LDT_SYSCALL - struct ldt_struct *ldt; - - if (idx > LDT_ENTRIES) - return 0; - - /* IRQs are off, so this synchronizes with smp_store_release */ - ldt = lockless_dereference(current->active_mm->context.ldt); - if (!ldt || idx > ldt->size) - return 0; - - desc = &ldt->entries[idx]; -#else - return 0; -#endif - } else { - if (idx > GDT_ENTRIES) - return 0; - - desc = raw_cpu_ptr(gdt_page.gdt) + idx; - } - - return get_desc_base(desc); -} - -#ifdef CONFIG_IA32_EMULATION - -#include <asm/compat.h> - -static inline int -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) -{ - /* 32-bit process in 64-bit kernel. */ - unsigned long ss_base, cs_base; - struct stack_frame_ia32 frame; - const void __user *fp; - - if (!test_thread_flag(TIF_IA32)) - return 0; - - cs_base = get_segment_base(regs->cs); - ss_base = get_segment_base(regs->ss); - - fp = compat_ptr(ss_base + regs->bp); - pagefault_disable(); - while (entry->nr < PERF_MAX_STACK_DEPTH) { - unsigned long bytes; - frame.next_frame = 0; - frame.return_address = 0; - - if (!access_ok(VERIFY_READ, fp, 8)) - break; - - bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4); - if (bytes != 0) - break; - bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4); - if (bytes != 0) - break; - - if (!valid_user_frame(fp, sizeof(frame))) - break; - - perf_callchain_store(entry, cs_base + frame.return_address); - fp = compat_ptr(ss_base + frame.next_frame); - } - pagefault_enable(); - return 1; -} -#else -static inline int -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) -{ - return 0; -} -#endif - -void -perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) -{ - struct stack_frame frame; - const void __user *fp; - - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - /* TODO: We don't support guest os callchain now */ - return; - } - - /* - * We don't know what to do with VM86 stacks.. ignore them for now. - */ - if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) - return; - - fp = (void __user *)regs->bp; - - perf_callchain_store(entry, regs->ip); - - if (!current->mm) - return; - - if (perf_callchain_user32(regs, entry)) - return; - - pagefault_disable(); - while (entry->nr < PERF_MAX_STACK_DEPTH) { - unsigned long bytes; - frame.next_frame = NULL; - frame.return_address = 0; - - if (!access_ok(VERIFY_READ, fp, 16)) - break; - - bytes = __copy_from_user_nmi(&frame.next_frame, fp, 8); - if (bytes != 0) - break; - bytes = __copy_from_user_nmi(&frame.return_address, fp+8, 8); - if (bytes != 0) - break; - - if (!valid_user_frame(fp, sizeof(frame))) - break; - - perf_callchain_store(entry, frame.return_address); - fp = (void __user *)frame.next_frame; - } - pagefault_enable(); -} - -/* - * Deal with code segment offsets for the various execution modes: - * - * VM86 - the good olde 16 bit days, where the linear address is - * 20 bits and we use regs->ip + 0x10 * regs->cs. - * - * IA32 - Where we need to look at GDT/LDT segment descriptor tables - * to figure out what the 32bit base address is. - * - * X32 - has TIF_X32 set, but is running in x86_64 - * - * X86_64 - CS,DS,SS,ES are all zero based. - */ -static unsigned long code_segment_base(struct pt_regs *regs) -{ - /* - * For IA32 we look at the GDT/LDT segment base to convert the - * effective IP to a linear address. - */ - -#ifdef CONFIG_X86_32 - /* - * If we are in VM86 mode, add the segment offset to convert to a - * linear address. - */ - if (regs->flags & X86_VM_MASK) - return 0x10 * regs->cs; - - if (user_mode(regs) && regs->cs != __USER_CS) - return get_segment_base(regs->cs); -#else - if (user_mode(regs) && !user_64bit_mode(regs) && - regs->cs != __USER32_CS) - return get_segment_base(regs->cs); -#endif - return 0; -} - -unsigned long perf_instruction_pointer(struct pt_regs *regs) -{ - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) - return perf_guest_cbs->get_guest_ip(); - - return regs->ip + code_segment_base(regs); -} - -unsigned long perf_misc_flags(struct pt_regs *regs) -{ - int misc = 0; - - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - if (perf_guest_cbs->is_user_mode()) - misc |= PERF_RECORD_MISC_GUEST_USER; - else - misc |= PERF_RECORD_MISC_GUEST_KERNEL; - } else { - if (user_mode(regs)) - misc |= PERF_RECORD_MISC_USER; - else - misc |= PERF_RECORD_MISC_KERNEL; - } - - if (regs->flags & PERF_EFLAGS_EXACT) - misc |= PERF_RECORD_MISC_EXACT_IP; - - return misc; -} - -void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) -{ - cap->version = x86_pmu.version; - cap->num_counters_gp = x86_pmu.num_counters; - cap->num_counters_fixed = x86_pmu.num_counters_fixed; - cap->bit_width_gp = x86_pmu.cntval_bits; - cap->bit_width_fixed = x86_pmu.cntval_bits; - cap->events_mask = (unsigned int)x86_pmu.events_maskl; - cap->events_mask_len = x86_pmu.events_mask_len; -} -EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h deleted file mode 100644 index 7bb61e32fb29..000000000000 --- a/arch/x86/kernel/cpu/perf_event.h +++ /dev/null @@ -1,955 +0,0 @@ -/* - * Performance events x86 architecture header - * - * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2009 Jaswinder Singh Rajput - * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra - * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> - * Copyright (C) 2009 Google, Inc., Stephane Eranian - * - * For licencing details see kernel-base/COPYING - */ - -#include <linux/perf_event.h> - -/* To enable MSR tracing please use the generic trace points. */ - -/* - * | NHM/WSM | SNB | - * register ------------------------------- - * | HT | no HT | HT | no HT | - *----------------------------------------- - * offcore | core | core | cpu | core | - * lbr_sel | core | core | cpu | core | - * ld_lat | cpu | core | cpu | core | - *----------------------------------------- - * - * Given that there is a small number of shared regs, - * we can pre-allocate their slot in the per-cpu - * per-core reg tables. - */ -enum extra_reg_type { - EXTRA_REG_NONE = -1, /* not used */ - - EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ - EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ - EXTRA_REG_LBR = 2, /* lbr_select */ - EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */ - EXTRA_REG_FE = 4, /* fe_* */ - - EXTRA_REG_MAX /* number of entries needed */ -}; - -struct event_constraint { - union { - unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - u64 idxmsk64; - }; - u64 code; - u64 cmask; - int weight; - int overlap; - int flags; -}; -/* - * struct hw_perf_event.flags flags - */ -#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */ -#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */ -#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */ -#define PERF_X86_EVENT_COMMITTED 0x0008 /* event passed commit_txn */ -#define PERF_X86_EVENT_PEBS_LD_HSW 0x0010 /* haswell style datala, load */ -#define PERF_X86_EVENT_PEBS_NA_HSW 0x0020 /* haswell style datala, unknown */ -#define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */ -#define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */ -#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */ -#define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */ -#define PERF_X86_EVENT_AUTO_RELOAD 0x0400 /* use PEBS auto-reload */ -#define PERF_X86_EVENT_FREERUNNING 0x0800 /* use freerunning PEBS */ - - -struct amd_nb { - int nb_id; /* NorthBridge id */ - int refcnt; /* reference count */ - struct perf_event *owners[X86_PMC_IDX_MAX]; - struct event_constraint event_constraints[X86_PMC_IDX_MAX]; -}; - -/* The maximal number of PEBS events: */ -#define MAX_PEBS_EVENTS 8 - -/* - * Flags PEBS can handle without an PMI. - * - * TID can only be handled by flushing at context switch. - * - */ -#define PEBS_FREERUNNING_FLAGS \ - (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ - PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ - PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ - PERF_SAMPLE_TRANSACTION) - -/* - * A debug store configuration. - * - * We only support architectures that use 64bit fields. - */ -struct debug_store { - u64 bts_buffer_base; - u64 bts_index; - u64 bts_absolute_maximum; - u64 bts_interrupt_threshold; - u64 pebs_buffer_base; - u64 pebs_index; - u64 pebs_absolute_maximum; - u64 pebs_interrupt_threshold; - u64 pebs_event_reset[MAX_PEBS_EVENTS]; -}; - -/* - * Per register state. - */ -struct er_account { - raw_spinlock_t lock; /* per-core: protect structure */ - u64 config; /* extra MSR config */ - u64 reg; /* extra MSR number */ - atomic_t ref; /* reference count */ -}; - -/* - * Per core/cpu state - * - * Used to coordinate shared registers between HT threads or - * among events on a single PMU. - */ -struct intel_shared_regs { - struct er_account regs[EXTRA_REG_MAX]; - int refcnt; /* per-core: #HT threads */ - unsigned core_id; /* per-core: core id */ -}; - -enum intel_excl_state_type { - INTEL_EXCL_UNUSED = 0, /* counter is unused */ - INTEL_EXCL_SHARED = 1, /* counter can be used by both threads */ - INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */ -}; - -struct intel_excl_states { - enum intel_excl_state_type state[X86_PMC_IDX_MAX]; - bool sched_started; /* true if scheduling has started */ -}; - -struct intel_excl_cntrs { - raw_spinlock_t lock; - - struct intel_excl_states states[2]; - - union { - u16 has_exclusive[2]; - u32 exclusive_present; - }; - - int refcnt; /* per-core: #HT threads */ - unsigned core_id; /* per-core: core id */ -}; - -#define MAX_LBR_ENTRIES 32 - -enum { - X86_PERF_KFREE_SHARED = 0, - X86_PERF_KFREE_EXCL = 1, - X86_PERF_KFREE_MAX -}; - -struct cpu_hw_events { - /* - * Generic x86 PMC bits - */ - struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ - unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - int enabled; - - int n_events; /* the # of events in the below arrays */ - int n_added; /* the # last events in the below arrays; - they've never been enabled yet */ - int n_txn; /* the # last events in the below arrays; - added in the current transaction */ - int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ - u64 tags[X86_PMC_IDX_MAX]; - - struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ - struct event_constraint *event_constraint[X86_PMC_IDX_MAX]; - - int n_excl; /* the number of exclusive events */ - - unsigned int txn_flags; - int is_fake; - - /* - * Intel DebugStore bits - */ - struct debug_store *ds; - u64 pebs_enabled; - - /* - * Intel LBR bits - */ - int lbr_users; - void *lbr_context; - struct perf_branch_stack lbr_stack; - struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; - struct er_account *lbr_sel; - u64 br_sel; - - /* - * Intel host/guest exclude bits - */ - u64 intel_ctrl_guest_mask; - u64 intel_ctrl_host_mask; - struct perf_guest_switch_msr guest_switch_msrs[X86_PMC_IDX_MAX]; - - /* - * Intel checkpoint mask - */ - u64 intel_cp_status; - - /* - * manage shared (per-core, per-cpu) registers - * used on Intel NHM/WSM/SNB - */ - struct intel_shared_regs *shared_regs; - /* - * manage exclusive counter access between hyperthread - */ - struct event_constraint *constraint_list; /* in enable order */ - struct intel_excl_cntrs *excl_cntrs; - int excl_thread_id; /* 0 or 1 */ - - /* - * AMD specific bits - */ - struct amd_nb *amd_nb; - /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ - u64 perf_ctr_virt_mask; - - void *kfree_on_online[X86_PERF_KFREE_MAX]; -}; - -#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\ - { .idxmsk64 = (n) }, \ - .code = (c), \ - .cmask = (m), \ - .weight = (w), \ - .overlap = (o), \ - .flags = f, \ -} - -#define EVENT_CONSTRAINT(c, n, m) \ - __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0) - -#define INTEL_EXCLEVT_CONSTRAINT(c, n) \ - __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\ - 0, PERF_X86_EVENT_EXCL) - -/* - * The overlap flag marks event constraints with overlapping counter - * masks. This is the case if the counter mask of such an event is not - * a subset of any other counter mask of a constraint with an equal or - * higher weight, e.g.: - * - * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); - * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0); - * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0); - * - * The event scheduler may not select the correct counter in the first - * cycle because it needs to know which subsequent events will be - * scheduled. It may fail to schedule the events then. So we set the - * overlap flag for such constraints to give the scheduler a hint which - * events to select for counter rescheduling. - * - * Care must be taken as the rescheduling algorithm is O(n!) which - * will increase scheduling cycles for an over-commited system - * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros - * and its counter masks must be kept at a minimum. - */ -#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \ - __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0) - -/* - * Constraint on the Event code. - */ -#define INTEL_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT) - -/* - * Constraint on the Event code + UMask + fixed-mask - * - * filter mask to validate fixed counter events. - * the following filters disqualify for fixed counters: - * - inv - * - edge - * - cnt-mask - * - in_tx - * - in_tx_checkpointed - * The other filters are supported by fixed counters. - * The any-thread option is supported starting with v3. - */ -#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED) -#define FIXED_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS) - -/* - * Constraint on the Event code + UMask - */ -#define INTEL_UEVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) - -/* Constraint on specific umask bit only + event */ -#define INTEL_UBIT_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|(c)) - -/* Like UEVENT_CONSTRAINT, but match flags too */ -#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) - -#define INTEL_EXCLUEVT_CONSTRAINT(c, n) \ - __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ - HWEIGHT(n), 0, PERF_X86_EVENT_EXCL) - -#define INTEL_PLD_CONSTRAINT(c, n) \ - __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) - -#define INTEL_PST_CONSTRAINT(c, n) \ - __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST) - -/* Event constraint, but match on all event flags too. */ -#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) - -/* Check only flags, but allow all event/umask */ -#define INTEL_ALL_EVENT_CONSTRAINT(code, n) \ - EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS) - -/* Check flags and event code, and set the HSW store flag */ -#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_ST(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) - -/* Check flags and event code, and set the HSW load flag */ -#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) - -#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, \ - PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL) - -/* Check flags and event code/umask, and set the HSW store flag */ -#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) - -#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, \ - PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL) - -/* Check flags and event code/umask, and set the HSW load flag */ -#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) - -#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, \ - PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL) - -/* Check flags and event code/umask, and set the HSW N/A flag */ -#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW) - - -/* - * We define the end marker as having a weight of -1 - * to enable blacklisting of events using a counter bitmask - * of zero and thus a weight of zero. - * The end marker has a weight that cannot possibly be - * obtained from counting the bits in the bitmask. - */ -#define EVENT_CONSTRAINT_END { .weight = -1 } - -/* - * Check for end marker with weight == -1 - */ -#define for_each_event_constraint(e, c) \ - for ((e) = (c); (e)->weight != -1; (e)++) - -/* - * Extra registers for specific events. - * - * Some events need large masks and require external MSRs. - * Those extra MSRs end up being shared for all events on - * a PMU and sometimes between PMU of sibling HT threads. - * In either case, the kernel needs to handle conflicting - * accesses to those extra, shared, regs. The data structure - * to manage those registers is stored in cpu_hw_event. - */ -struct extra_reg { - unsigned int event; - unsigned int msr; - u64 config_mask; - u64 valid_mask; - int idx; /* per_xxx->regs[] reg index */ - bool extra_msr_access; -}; - -#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \ - .event = (e), \ - .msr = (ms), \ - .config_mask = (m), \ - .valid_mask = (vm), \ - .idx = EXTRA_REG_##i, \ - .extra_msr_access = true, \ - } - -#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \ - EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx) - -#define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \ - EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \ - ARCH_PERFMON_EVENTSEL_UMASK, vm, idx) - -#define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \ - INTEL_UEVENT_EXTRA_REG(c, \ - MSR_PEBS_LD_LAT_THRESHOLD, \ - 0xffff, \ - LDLAT) - -#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0) - -union perf_capabilities { - struct { - u64 lbr_format:6; - u64 pebs_trap:1; - u64 pebs_arch_reg:1; - u64 pebs_format:4; - u64 smm_freeze:1; - /* - * PMU supports separate counter range for writing - * values > 32bit. - */ - u64 full_width_write:1; - }; - u64 capabilities; -}; - -struct x86_pmu_quirk { - struct x86_pmu_quirk *next; - void (*func)(void); -}; - -union x86_pmu_config { - struct { - u64 event:8, - umask:8, - usr:1, - os:1, - edge:1, - pc:1, - interrupt:1, - __reserved1:1, - en:1, - inv:1, - cmask:8, - event2:4, - __reserved2:4, - go:1, - ho:1; - } bits; - u64 value; -}; - -#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value - -enum { - x86_lbr_exclusive_lbr, - x86_lbr_exclusive_bts, - x86_lbr_exclusive_pt, - x86_lbr_exclusive_max, -}; - -/* - * struct x86_pmu - generic x86 pmu - */ -struct x86_pmu { - /* - * Generic x86 PMC bits - */ - const char *name; - int version; - int (*handle_irq)(struct pt_regs *); - void (*disable_all)(void); - void (*enable_all)(int added); - void (*enable)(struct perf_event *); - void (*disable)(struct perf_event *); - int (*hw_config)(struct perf_event *event); - int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); - unsigned eventsel; - unsigned perfctr; - int (*addr_offset)(int index, bool eventsel); - int (*rdpmc_index)(int index); - u64 (*event_map)(int); - int max_events; - int num_counters; - int num_counters_fixed; - int cntval_bits; - u64 cntval_mask; - union { - unsigned long events_maskl; - unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)]; - }; - int events_mask_len; - int apic; - u64 max_period; - struct event_constraint * - (*get_event_constraints)(struct cpu_hw_events *cpuc, - int idx, - struct perf_event *event); - - void (*put_event_constraints)(struct cpu_hw_events *cpuc, - struct perf_event *event); - - void (*start_scheduling)(struct cpu_hw_events *cpuc); - - void (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr); - - void (*stop_scheduling)(struct cpu_hw_events *cpuc); - - struct event_constraint *event_constraints; - struct x86_pmu_quirk *quirks; - int perfctr_second_write; - bool late_ack; - unsigned (*limit_period)(struct perf_event *event, unsigned l); - - /* - * sysfs attrs - */ - int attr_rdpmc_broken; - int attr_rdpmc; - struct attribute **format_attrs; - struct attribute **event_attrs; - - ssize_t (*events_sysfs_show)(char *page, u64 config); - struct attribute **cpu_events; - - /* - * CPU Hotplug hooks - */ - int (*cpu_prepare)(int cpu); - void (*cpu_starting)(int cpu); - void (*cpu_dying)(int cpu); - void (*cpu_dead)(int cpu); - - void (*check_microcode)(void); - void (*sched_task)(struct perf_event_context *ctx, - bool sched_in); - - /* - * Intel Arch Perfmon v2+ - */ - u64 intel_ctrl; - union perf_capabilities intel_cap; - - /* - * Intel DebugStore bits - */ - unsigned int bts :1, - bts_active :1, - pebs :1, - pebs_active :1, - pebs_broken :1, - pebs_prec_dist :1; - int pebs_record_size; - void (*drain_pebs)(struct pt_regs *regs); - struct event_constraint *pebs_constraints; - void (*pebs_aliases)(struct perf_event *event); - int max_pebs_events; - unsigned long free_running_flags; - - /* - * Intel LBR - */ - unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ - int lbr_nr; /* hardware stack size */ - u64 lbr_sel_mask; /* LBR_SELECT valid bits */ - const int *lbr_sel_map; /* lbr_select mappings */ - bool lbr_double_abort; /* duplicated lbr aborts */ - - /* - * Intel PT/LBR/BTS are exclusive - */ - atomic_t lbr_exclusive[x86_lbr_exclusive_max]; - - /* - * Extra registers for events - */ - struct extra_reg *extra_regs; - unsigned int flags; - - /* - * Intel host/guest support (KVM) - */ - struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); -}; - -struct x86_perf_task_context { - u64 lbr_from[MAX_LBR_ENTRIES]; - u64 lbr_to[MAX_LBR_ENTRIES]; - u64 lbr_info[MAX_LBR_ENTRIES]; - int tos; - int lbr_callstack_users; - int lbr_stack_state; -}; - -#define x86_add_quirk(func_) \ -do { \ - static struct x86_pmu_quirk __quirk __initdata = { \ - .func = func_, \ - }; \ - __quirk.next = x86_pmu.quirks; \ - x86_pmu.quirks = &__quirk; \ -} while (0) - -/* - * x86_pmu flags - */ -#define PMU_FL_NO_HT_SHARING 0x1 /* no hyper-threading resource sharing */ -#define PMU_FL_HAS_RSP_1 0x2 /* has 2 equivalent offcore_rsp regs */ -#define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */ -#define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */ - -#define EVENT_VAR(_id) event_attr_##_id -#define EVENT_PTR(_id) &event_attr_##_id.attr.attr - -#define EVENT_ATTR(_name, _id) \ -static struct perf_pmu_events_attr EVENT_VAR(_id) = { \ - .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ - .id = PERF_COUNT_HW_##_id, \ - .event_str = NULL, \ -}; - -#define EVENT_ATTR_STR(_name, v, str) \ -static struct perf_pmu_events_attr event_attr_##v = { \ - .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ - .id = 0, \ - .event_str = str, \ -}; - -extern struct x86_pmu x86_pmu __read_mostly; - -static inline bool x86_pmu_has_lbr_callstack(void) -{ - return x86_pmu.lbr_sel_map && - x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0; -} - -DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); - -int x86_perf_event_set_period(struct perf_event *event); - -/* - * Generalized hw caching related hw_event table, filled - * in on a per model basis. A value of 0 means - * 'not supported', -1 means 'hw_event makes no sense on - * this CPU', any other value means the raw hw_event - * ID. - */ - -#define C(x) PERF_COUNT_HW_CACHE_##x - -extern u64 __read_mostly hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX]; -extern u64 __read_mostly hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX]; - -u64 x86_perf_event_update(struct perf_event *event); - -static inline unsigned int x86_pmu_config_addr(int index) -{ - return x86_pmu.eventsel + (x86_pmu.addr_offset ? - x86_pmu.addr_offset(index, true) : index); -} - -static inline unsigned int x86_pmu_event_addr(int index) -{ - return x86_pmu.perfctr + (x86_pmu.addr_offset ? - x86_pmu.addr_offset(index, false) : index); -} - -static inline int x86_pmu_rdpmc_index(int index) -{ - return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index; -} - -int x86_add_exclusive(unsigned int what); - -void x86_del_exclusive(unsigned int what); - -int x86_reserve_hardware(void); - -void x86_release_hardware(void); - -void hw_perf_lbr_event_destroy(struct perf_event *event); - -int x86_setup_perfctr(struct perf_event *event); - -int x86_pmu_hw_config(struct perf_event *event); - -void x86_pmu_disable_all(void); - -static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, - u64 enable_mask) -{ - u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask); - - if (hwc->extra_reg.reg) - wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); - wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask); -} - -void x86_pmu_enable_all(int added); - -int perf_assign_events(struct event_constraint **constraints, int n, - int wmin, int wmax, int gpmax, int *assign); -int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); - -void x86_pmu_stop(struct perf_event *event, int flags); - -static inline void x86_pmu_disable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - wrmsrl(hwc->config_base, hwc->config); -} - -void x86_pmu_enable_event(struct perf_event *event); - -int x86_pmu_handle_irq(struct pt_regs *regs); - -extern struct event_constraint emptyconstraint; - -extern struct event_constraint unconstrained; - -static inline bool kernel_ip(unsigned long ip) -{ -#ifdef CONFIG_X86_32 - return ip > PAGE_OFFSET; -#else - return (long)ip < 0; -#endif -} - -/* - * Not all PMUs provide the right context information to place the reported IP - * into full context. Specifically segment registers are typically not - * supplied. - * - * Assuming the address is a linear address (it is for IBS), we fake the CS and - * vm86 mode using the known zero-based code segment and 'fix up' the registers - * to reflect this. - * - * Intel PEBS/LBR appear to typically provide the effective address, nothing - * much we can do about that but pray and treat it like a linear address. - */ -static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) -{ - regs->cs = kernel_ip(ip) ? __KERNEL_CS : __USER_CS; - if (regs->flags & X86_VM_MASK) - regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK); - regs->ip = ip; -} - -ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); -ssize_t intel_event_sysfs_show(char *page, u64 config); - -struct attribute **merge_attr(struct attribute **a, struct attribute **b); - -#ifdef CONFIG_CPU_SUP_AMD - -int amd_pmu_init(void); - -#else /* CONFIG_CPU_SUP_AMD */ - -static inline int amd_pmu_init(void) -{ - return 0; -} - -#endif /* CONFIG_CPU_SUP_AMD */ - -#ifdef CONFIG_CPU_SUP_INTEL - -static inline bool intel_pmu_has_bts(struct perf_event *event) -{ - if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && - !event->attr.freq && event->hw.sample_period == 1) - return true; - - return false; -} - -int intel_pmu_save_and_restart(struct perf_event *event); - -struct event_constraint * -x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, - struct perf_event *event); - -struct intel_shared_regs *allocate_shared_regs(int cpu); - -int intel_pmu_init(void); - -void init_debug_store_on_cpu(int cpu); - -void fini_debug_store_on_cpu(int cpu); - -void release_ds_buffers(void); - -void reserve_ds_buffers(void); - -extern struct event_constraint bts_constraint; - -void intel_pmu_enable_bts(u64 config); - -void intel_pmu_disable_bts(void); - -int intel_pmu_drain_bts_buffer(void); - -extern struct event_constraint intel_core2_pebs_event_constraints[]; - -extern struct event_constraint intel_atom_pebs_event_constraints[]; - -extern struct event_constraint intel_slm_pebs_event_constraints[]; - -extern struct event_constraint intel_nehalem_pebs_event_constraints[]; - -extern struct event_constraint intel_westmere_pebs_event_constraints[]; - -extern struct event_constraint intel_snb_pebs_event_constraints[]; - -extern struct event_constraint intel_ivb_pebs_event_constraints[]; - -extern struct event_constraint intel_hsw_pebs_event_constraints[]; - -extern struct event_constraint intel_skl_pebs_event_constraints[]; - -struct event_constraint *intel_pebs_constraints(struct perf_event *event); - -void intel_pmu_pebs_enable(struct perf_event *event); - -void intel_pmu_pebs_disable(struct perf_event *event); - -void intel_pmu_pebs_enable_all(void); - -void intel_pmu_pebs_disable_all(void); - -void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); - -void intel_ds_init(void); - -void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); - -void intel_pmu_lbr_reset(void); - -void intel_pmu_lbr_enable(struct perf_event *event); - -void intel_pmu_lbr_disable(struct perf_event *event); - -void intel_pmu_lbr_enable_all(bool pmi); - -void intel_pmu_lbr_disable_all(void); - -void intel_pmu_lbr_read(void); - -void intel_pmu_lbr_init_core(void); - -void intel_pmu_lbr_init_nhm(void); - -void intel_pmu_lbr_init_atom(void); - -void intel_pmu_lbr_init_snb(void); - -void intel_pmu_lbr_init_hsw(void); - -void intel_pmu_lbr_init_skl(void); - -void intel_pmu_lbr_init_knl(void); - -int intel_pmu_setup_lbr_filter(struct perf_event *event); - -void intel_pt_interrupt(void); - -int intel_bts_interrupt(void); - -void intel_bts_enable_local(void); - -void intel_bts_disable_local(void); - -int p4_pmu_init(void); - -int p6_pmu_init(void); - -int knc_pmu_init(void); - -ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, - char *page); - -static inline int is_ht_workaround_enabled(void) -{ - return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED); -} - -#else /* CONFIG_CPU_SUP_INTEL */ - -static inline void reserve_ds_buffers(void) -{ -} - -static inline void release_ds_buffers(void) -{ -} - -static inline int intel_pmu_init(void) -{ - return 0; -} - -static inline struct intel_shared_regs *allocate_shared_regs(int cpu) -{ - return NULL; -} - -static inline int is_ht_workaround_enabled(void) -{ - return 0; -} -#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c deleted file mode 100644 index 58610539b048..000000000000 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ /dev/null @@ -1,731 +0,0 @@ -#include <linux/perf_event.h> -#include <linux/export.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <asm/apicdef.h> - -#include "perf_event.h" - -static __initconst const u64 amd_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ - [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ - [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ - [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ - [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ - [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */ - [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -/* - * AMD Performance Monitor K7 and later. - */ -static const u64 amd_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ - [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ -}; - -static u64 amd_pmu_event_map(int hw_event) -{ - return amd_perfmon_event_map[hw_event]; -} - -/* - * Previously calculated offsets - */ -static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly; -static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly; - -/* - * Legacy CPUs: - * 4 counters starting at 0xc0010000 each offset by 1 - * - * CPUs with core performance counter extensions: - * 6 counters starting at 0xc0010200 each offset by 2 - */ -static inline int amd_pmu_addr_offset(int index, bool eventsel) -{ - int offset; - - if (!index) - return index; - - if (eventsel) - offset = event_offsets[index]; - else - offset = count_offsets[index]; - - if (offset) - return offset; - - if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) - offset = index; - else - offset = index << 1; - - if (eventsel) - event_offsets[index] = offset; - else - count_offsets[index] = offset; - - return offset; -} - -static int amd_core_hw_config(struct perf_event *event) -{ - if (event->attr.exclude_host && event->attr.exclude_guest) - /* - * When HO == GO == 1 the hardware treats that as GO == HO == 0 - * and will count in both modes. We don't want to count in that - * case so we emulate no-counting by setting US = OS = 0. - */ - event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR | - ARCH_PERFMON_EVENTSEL_OS); - else if (event->attr.exclude_host) - event->hw.config |= AMD64_EVENTSEL_GUESTONLY; - else if (event->attr.exclude_guest) - event->hw.config |= AMD64_EVENTSEL_HOSTONLY; - - return 0; -} - -/* - * AMD64 events are detected based on their event codes. - */ -static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc) -{ - return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff); -} - -static inline int amd_is_nb_event(struct hw_perf_event *hwc) -{ - return (hwc->config & 0xe0) == 0xe0; -} - -static inline int amd_has_nb(struct cpu_hw_events *cpuc) -{ - struct amd_nb *nb = cpuc->amd_nb; - - return nb && nb->nb_id != -1; -} - -static int amd_pmu_hw_config(struct perf_event *event) -{ - int ret; - - /* pass precise event sampling to ibs: */ - if (event->attr.precise_ip && get_ibs_caps()) - return -ENOENT; - - if (has_branch_stack(event)) - return -EOPNOTSUPP; - - ret = x86_pmu_hw_config(event); - if (ret) - return ret; - - if (event->attr.type == PERF_TYPE_RAW) - event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK; - - return amd_core_hw_config(event); -} - -static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) -{ - struct amd_nb *nb = cpuc->amd_nb; - int i; - - /* - * need to scan whole list because event may not have - * been assigned during scheduling - * - * no race condition possible because event can only - * be removed on one CPU at a time AND PMU is disabled - * when we come here - */ - for (i = 0; i < x86_pmu.num_counters; i++) { - if (cmpxchg(nb->owners + i, event, NULL) == event) - break; - } -} - - /* - * AMD64 NorthBridge events need special treatment because - * counter access needs to be synchronized across all cores - * of a package. Refer to BKDG section 3.12 - * - * NB events are events measuring L3 cache, Hypertransport - * traffic. They are identified by an event code >= 0xe00. - * They measure events on the NorthBride which is shared - * by all cores on a package. NB events are counted on a - * shared set of counters. When a NB event is programmed - * in a counter, the data actually comes from a shared - * counter. Thus, access to those counters needs to be - * synchronized. - * - * We implement the synchronization such that no two cores - * can be measuring NB events using the same counters. Thus, - * we maintain a per-NB allocation table. The available slot - * is propagated using the event_constraint structure. - * - * We provide only one choice for each NB event based on - * the fact that only NB events have restrictions. Consequently, - * if a counter is available, there is a guarantee the NB event - * will be assigned to it. If no slot is available, an empty - * constraint is returned and scheduling will eventually fail - * for this event. - * - * Note that all cores attached the same NB compete for the same - * counters to host NB events, this is why we use atomic ops. Some - * multi-chip CPUs may have more than one NB. - * - * Given that resources are allocated (cmpxchg), they must be - * eventually freed for others to use. This is accomplished by - * calling __amd_put_nb_event_constraints() - * - * Non NB events are not impacted by this restriction. - */ -static struct event_constraint * -__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, - struct event_constraint *c) -{ - struct hw_perf_event *hwc = &event->hw; - struct amd_nb *nb = cpuc->amd_nb; - struct perf_event *old; - int idx, new = -1; - - if (!c) - c = &unconstrained; - - if (cpuc->is_fake) - return c; - - /* - * detect if already present, if so reuse - * - * cannot merge with actual allocation - * because of possible holes - * - * event can already be present yet not assigned (in hwc->idx) - * because of successive calls to x86_schedule_events() from - * hw_perf_group_sched_in() without hw_perf_enable() - */ - for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) { - if (new == -1 || hwc->idx == idx) - /* assign free slot, prefer hwc->idx */ - old = cmpxchg(nb->owners + idx, NULL, event); - else if (nb->owners[idx] == event) - /* event already present */ - old = event; - else - continue; - - if (old && old != event) - continue; - - /* reassign to this slot */ - if (new != -1) - cmpxchg(nb->owners + new, event, NULL); - new = idx; - - /* already present, reuse */ - if (old == event) - break; - } - - if (new == -1) - return &emptyconstraint; - - return &nb->event_constraints[new]; -} - -static struct amd_nb *amd_alloc_nb(int cpu) -{ - struct amd_nb *nb; - int i; - - nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu)); - if (!nb) - return NULL; - - nb->nb_id = -1; - - /* - * initialize all possible NB constraints - */ - for (i = 0; i < x86_pmu.num_counters; i++) { - __set_bit(i, nb->event_constraints[i].idxmsk); - nb->event_constraints[i].weight = 1; - } - return nb; -} - -static int amd_pmu_cpu_prepare(int cpu) -{ - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - - WARN_ON_ONCE(cpuc->amd_nb); - - if (boot_cpu_data.x86_max_cores < 2) - return NOTIFY_OK; - - cpuc->amd_nb = amd_alloc_nb(cpu); - if (!cpuc->amd_nb) - return NOTIFY_BAD; - - return NOTIFY_OK; -} - -static void amd_pmu_cpu_starting(int cpu) -{ - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED]; - struct amd_nb *nb; - int i, nb_id; - - cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; - - if (boot_cpu_data.x86_max_cores < 2) - return; - - nb_id = amd_get_nb_id(cpu); - WARN_ON_ONCE(nb_id == BAD_APICID); - - for_each_online_cpu(i) { - nb = per_cpu(cpu_hw_events, i).amd_nb; - if (WARN_ON_ONCE(!nb)) - continue; - - if (nb->nb_id == nb_id) { - *onln = cpuc->amd_nb; - cpuc->amd_nb = nb; - break; - } - } - - cpuc->amd_nb->nb_id = nb_id; - cpuc->amd_nb->refcnt++; -} - -static void amd_pmu_cpu_dead(int cpu) -{ - struct cpu_hw_events *cpuhw; - - if (boot_cpu_data.x86_max_cores < 2) - return; - - cpuhw = &per_cpu(cpu_hw_events, cpu); - - if (cpuhw->amd_nb) { - struct amd_nb *nb = cpuhw->amd_nb; - - if (nb->nb_id == -1 || --nb->refcnt == 0) - kfree(nb); - - cpuhw->amd_nb = NULL; - } -} - -static struct event_constraint * -amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx, - struct perf_event *event) -{ - /* - * if not NB event or no NB, then no constraints - */ - if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))) - return &unconstrained; - - return __amd_get_nb_event_constraints(cpuc, event, NULL); -} - -static void amd_put_event_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) -{ - if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)) - __amd_put_nb_event_constraints(cpuc, event); -} - -PMU_FORMAT_ATTR(event, "config:0-7,32-35"); -PMU_FORMAT_ATTR(umask, "config:8-15" ); -PMU_FORMAT_ATTR(edge, "config:18" ); -PMU_FORMAT_ATTR(inv, "config:23" ); -PMU_FORMAT_ATTR(cmask, "config:24-31" ); - -static struct attribute *amd_format_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_cmask.attr, - NULL, -}; - -/* AMD Family 15h */ - -#define AMD_EVENT_TYPE_MASK 0x000000F0ULL - -#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL -#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL -#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL -#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL -#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL -#define AMD_EVENT_EX_LS 0x000000C0ULL -#define AMD_EVENT_DE 0x000000D0ULL -#define AMD_EVENT_NB 0x000000E0ULL ... 0x000000F0ULL - -/* - * AMD family 15h event code/PMC mappings: - * - * type = event_code & 0x0F0: - * - * 0x000 FP PERF_CTL[5:3] - * 0x010 FP PERF_CTL[5:3] - * 0x020 LS PERF_CTL[5:0] - * 0x030 LS PERF_CTL[5:0] - * 0x040 DC PERF_CTL[5:0] - * 0x050 DC PERF_CTL[5:0] - * 0x060 CU PERF_CTL[2:0] - * 0x070 CU PERF_CTL[2:0] - * 0x080 IC/DE PERF_CTL[2:0] - * 0x090 IC/DE PERF_CTL[2:0] - * 0x0A0 --- - * 0x0B0 --- - * 0x0C0 EX/LS PERF_CTL[5:0] - * 0x0D0 DE PERF_CTL[2:0] - * 0x0E0 NB NB_PERF_CTL[3:0] - * 0x0F0 NB NB_PERF_CTL[3:0] - * - * Exceptions: - * - * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*) - * 0x003 FP PERF_CTL[3] - * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*) - * 0x00B FP PERF_CTL[3] - * 0x00D FP PERF_CTL[3] - * 0x023 DE PERF_CTL[2:0] - * 0x02D LS PERF_CTL[3] - * 0x02E LS PERF_CTL[3,0] - * 0x031 LS PERF_CTL[2:0] (**) - * 0x043 CU PERF_CTL[2:0] - * 0x045 CU PERF_CTL[2:0] - * 0x046 CU PERF_CTL[2:0] - * 0x054 CU PERF_CTL[2:0] - * 0x055 CU PERF_CTL[2:0] - * 0x08F IC PERF_CTL[0] - * 0x187 DE PERF_CTL[0] - * 0x188 DE PERF_CTL[0] - * 0x0DB EX PERF_CTL[5:0] - * 0x0DC LS PERF_CTL[5:0] - * 0x0DD LS PERF_CTL[5:0] - * 0x0DE LS PERF_CTL[5:0] - * 0x0DF LS PERF_CTL[5:0] - * 0x1C0 EX PERF_CTL[5:3] - * 0x1D6 EX PERF_CTL[5:0] - * 0x1D8 EX PERF_CTL[5:0] - * - * (*) depending on the umask all FPU counters may be used - * (**) only one unitmask enabled at a time - */ - -static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); -static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0); -static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0); -static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); -static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); -static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); - -static struct event_constraint * -amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx, - struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - unsigned int event_code = amd_get_event_code(hwc); - - switch (event_code & AMD_EVENT_TYPE_MASK) { - case AMD_EVENT_FP: - switch (event_code) { - case 0x000: - if (!(hwc->config & 0x0000F000ULL)) - break; - if (!(hwc->config & 0x00000F00ULL)) - break; - return &amd_f15_PMC3; - case 0x004: - if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) - break; - return &amd_f15_PMC3; - case 0x003: - case 0x00B: - case 0x00D: - return &amd_f15_PMC3; - } - return &amd_f15_PMC53; - case AMD_EVENT_LS: - case AMD_EVENT_DC: - case AMD_EVENT_EX_LS: - switch (event_code) { - case 0x023: - case 0x043: - case 0x045: - case 0x046: - case 0x054: - case 0x055: - return &amd_f15_PMC20; - case 0x02D: - return &amd_f15_PMC3; - case 0x02E: - return &amd_f15_PMC30; - case 0x031: - if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) - return &amd_f15_PMC20; - return &emptyconstraint; - case 0x1C0: - return &amd_f15_PMC53; - default: - return &amd_f15_PMC50; - } - case AMD_EVENT_CU: - case AMD_EVENT_IC_DE: - case AMD_EVENT_DE: - switch (event_code) { - case 0x08F: - case 0x187: - case 0x188: - return &amd_f15_PMC0; - case 0x0DB ... 0x0DF: - case 0x1D6: - case 0x1D8: - return &amd_f15_PMC50; - default: - return &amd_f15_PMC20; - } - case AMD_EVENT_NB: - /* moved to perf_event_amd_uncore.c */ - return &emptyconstraint; - default: - return &emptyconstraint; - } -} - -static ssize_t amd_event_sysfs_show(char *page, u64 config) -{ - u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | - (config & AMD64_EVENTSEL_EVENT) >> 24; - - return x86_event_sysfs_show(page, config, event); -} - -static __initconst const struct x86_pmu amd_pmu = { - .name = "AMD", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, - .disable = x86_pmu_disable_event, - .hw_config = amd_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_K7_EVNTSEL0, - .perfctr = MSR_K7_PERFCTR0, - .addr_offset = amd_pmu_addr_offset, - .event_map = amd_pmu_event_map, - .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = AMD64_NUM_COUNTERS, - .cntval_bits = 48, - .cntval_mask = (1ULL << 48) - 1, - .apic = 1, - /* use highest bit to detect overflow */ - .max_period = (1ULL << 47) - 1, - .get_event_constraints = amd_get_event_constraints, - .put_event_constraints = amd_put_event_constraints, - - .format_attrs = amd_format_attr, - .events_sysfs_show = amd_event_sysfs_show, - - .cpu_prepare = amd_pmu_cpu_prepare, - .cpu_starting = amd_pmu_cpu_starting, - .cpu_dead = amd_pmu_cpu_dead, -}; - -static int __init amd_core_pmu_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) - return 0; - - switch (boot_cpu_data.x86) { - case 0x15: - pr_cont("Fam15h "); - x86_pmu.get_event_constraints = amd_get_event_constraints_f15h; - break; - - default: - pr_err("core perfctr but no constraints; unknown hardware!\n"); - return -ENODEV; - } - - /* - * If core performance counter extensions exists, we must use - * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also - * amd_pmu_addr_offset(). - */ - x86_pmu.eventsel = MSR_F15H_PERF_CTL; - x86_pmu.perfctr = MSR_F15H_PERF_CTR; - x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; - - pr_cont("core perfctr, "); - return 0; -} - -__init int amd_pmu_init(void) -{ - int ret; - - /* Performance-monitoring supported from K7 and later: */ - if (boot_cpu_data.x86 < 6) - return -ENODEV; - - x86_pmu = amd_pmu; - - ret = amd_core_pmu_init(); - if (ret) - return ret; - - /* Events are common for all AMDs */ - memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - return 0; -} - -void amd_pmu_enable_virt(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - cpuc->perf_ctr_virt_mask = 0; - - /* Reload all events */ - x86_pmu_disable_all(); - x86_pmu_enable_all(0); -} -EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); - -void amd_pmu_disable_virt(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - /* - * We only mask out the Host-only bit so that host-only counting works - * when SVM is disabled. If someone sets up a guest-only counter when - * SVM is disabled the Guest-only bits still gets set and the counter - * will not count anything. - */ - cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; - - /* Reload all events */ - x86_pmu_disable_all(); - x86_pmu_enable_all(0); -} -EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c deleted file mode 100644 index 989d3c215d2b..000000000000 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ /dev/null @@ -1,959 +0,0 @@ -/* - * Performance events - AMD IBS - * - * Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter - * - * For licencing details see kernel-base/COPYING - */ - -#include <linux/perf_event.h> -#include <linux/module.h> -#include <linux/pci.h> -#include <linux/ptrace.h> -#include <linux/syscore_ops.h> - -#include <asm/apic.h> - -#include "perf_event.h" - -static u32 ibs_caps; - -#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) - -#include <linux/kprobes.h> -#include <linux/hardirq.h> - -#include <asm/nmi.h> - -#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) -#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT - -enum ibs_states { - IBS_ENABLED = 0, - IBS_STARTED = 1, - IBS_STOPPING = 2, - - IBS_MAX_STATES, -}; - -struct cpu_perf_ibs { - struct perf_event *event; - unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)]; -}; - -struct perf_ibs { - struct pmu pmu; - unsigned int msr; - u64 config_mask; - u64 cnt_mask; - u64 enable_mask; - u64 valid_mask; - u64 max_period; - unsigned long offset_mask[1]; - int offset_max; - struct cpu_perf_ibs __percpu *pcpu; - - struct attribute **format_attrs; - struct attribute_group format_group; - const struct attribute_group *attr_groups[2]; - - u64 (*get_count)(u64 config); -}; - -struct perf_ibs_data { - u32 size; - union { - u32 data[0]; /* data buffer starts here */ - u32 caps; - }; - u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; -}; - -static int -perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period) -{ - s64 left = local64_read(&hwc->period_left); - s64 period = hwc->sample_period; - int overflow = 0; - - /* - * If we are way outside a reasonable range then just skip forward: - */ - if (unlikely(left <= -period)) { - left = period; - local64_set(&hwc->period_left, left); - hwc->last_period = period; - overflow = 1; - } - - if (unlikely(left < (s64)min)) { - left += period; - local64_set(&hwc->period_left, left); - hwc->last_period = period; - overflow = 1; - } - - /* - * If the hw period that triggers the sw overflow is too short - * we might hit the irq handler. This biases the results. - * Thus we shorten the next-to-last period and set the last - * period to the max period. - */ - if (left > max) { - left -= max; - if (left > max) - left = max; - else if (left < min) - left = min; - } - - *hw_period = (u64)left; - - return overflow; -} - -static int -perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width) -{ - struct hw_perf_event *hwc = &event->hw; - int shift = 64 - width; - u64 prev_raw_count; - u64 delta; - - /* - * Careful: an NMI might modify the previous event value. - * - * Our tactic to handle this is to first atomically read and - * exchange a new raw count - then add that new-prev delta - * count to the generic event atomically: - */ - prev_raw_count = local64_read(&hwc->prev_count); - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) - return 0; - - /* - * Now we have the new raw value and have updated the prev - * timestamp already. We can now calculate the elapsed delta - * (event-)time and add that to the generic event. - * - * Careful, not all hw sign-extends above the physical width - * of the count. - */ - delta = (new_raw_count << shift) - (prev_raw_count << shift); - delta >>= shift; - - local64_add(delta, &event->count); - local64_sub(delta, &hwc->period_left); - - return 1; -} - -static struct perf_ibs perf_ibs_fetch; -static struct perf_ibs perf_ibs_op; - -static struct perf_ibs *get_ibs_pmu(int type) -{ - if (perf_ibs_fetch.pmu.type == type) - return &perf_ibs_fetch; - if (perf_ibs_op.pmu.type == type) - return &perf_ibs_op; - return NULL; -} - -/* - * Use IBS for precise event sampling: - * - * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count - * perf record -a -e r076:p ... # same as -e cpu-cycles:p - * perf record -a -e r0C1:p ... # use ibs op counting micro-ops - * - * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl, - * MSRC001_1033) is used to select either cycle or micro-ops counting - * mode. - * - * The rip of IBS samples has skid 0. Thus, IBS supports precise - * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the - * rip is invalid when IBS was not able to record the rip correctly. - * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then. - * - */ -static int perf_ibs_precise_event(struct perf_event *event, u64 *config) -{ - switch (event->attr.precise_ip) { - case 0: - return -ENOENT; - case 1: - case 2: - break; - default: - return -EOPNOTSUPP; - } - - switch (event->attr.type) { - case PERF_TYPE_HARDWARE: - switch (event->attr.config) { - case PERF_COUNT_HW_CPU_CYCLES: - *config = 0; - return 0; - } - break; - case PERF_TYPE_RAW: - switch (event->attr.config) { - case 0x0076: - *config = 0; - return 0; - case 0x00C1: - *config = IBS_OP_CNT_CTL; - return 0; - } - break; - default: - return -ENOENT; - } - - return -EOPNOTSUPP; -} - -static const struct perf_event_attr ibs_notsupp = { - .exclude_user = 1, - .exclude_kernel = 1, - .exclude_hv = 1, - .exclude_idle = 1, - .exclude_host = 1, - .exclude_guest = 1, -}; - -static int perf_ibs_init(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct perf_ibs *perf_ibs; - u64 max_cnt, config; - int ret; - - perf_ibs = get_ibs_pmu(event->attr.type); - if (perf_ibs) { - config = event->attr.config; - } else { - perf_ibs = &perf_ibs_op; - ret = perf_ibs_precise_event(event, &config); - if (ret) - return ret; - } - - if (event->pmu != &perf_ibs->pmu) - return -ENOENT; - - if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp)) - return -EINVAL; - - if (config & ~perf_ibs->config_mask) - return -EINVAL; - - if (hwc->sample_period) { - if (config & perf_ibs->cnt_mask) - /* raw max_cnt may not be set */ - return -EINVAL; - if (!event->attr.sample_freq && hwc->sample_period & 0x0f) - /* - * lower 4 bits can not be set in ibs max cnt, - * but allowing it in case we adjust the - * sample period to set a frequency. - */ - return -EINVAL; - hwc->sample_period &= ~0x0FULL; - if (!hwc->sample_period) - hwc->sample_period = 0x10; - } else { - max_cnt = config & perf_ibs->cnt_mask; - config &= ~perf_ibs->cnt_mask; - event->attr.sample_period = max_cnt << 4; - hwc->sample_period = event->attr.sample_period; - } - - if (!hwc->sample_period) - return -EINVAL; - - /* - * If we modify hwc->sample_period, we also need to update - * hwc->last_period and hwc->period_left. - */ - hwc->last_period = hwc->sample_period; - local64_set(&hwc->period_left, hwc->sample_period); - - hwc->config_base = perf_ibs->msr; - hwc->config = config; - - return 0; -} - -static int perf_ibs_set_period(struct perf_ibs *perf_ibs, - struct hw_perf_event *hwc, u64 *period) -{ - int overflow; - - /* ignore lower 4 bits in min count: */ - overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); - local64_set(&hwc->prev_count, 0); - - return overflow; -} - -static u64 get_ibs_fetch_count(u64 config) -{ - return (config & IBS_FETCH_CNT) >> 12; -} - -static u64 get_ibs_op_count(u64 config) -{ - u64 count = 0; - - if (config & IBS_OP_VAL) - count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */ - - if (ibs_caps & IBS_CAPS_RDWROPCNT) - count += (config & IBS_OP_CUR_CNT) >> 32; - - return count; -} - -static void -perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, - u64 *config) -{ - u64 count = perf_ibs->get_count(*config); - - /* - * Set width to 64 since we do not overflow on max width but - * instead on max count. In perf_ibs_set_period() we clear - * prev count manually on overflow. - */ - while (!perf_event_try_update(event, count, 64)) { - rdmsrl(event->hw.config_base, *config); - count = perf_ibs->get_count(*config); - } -} - -static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs, - struct hw_perf_event *hwc, u64 config) -{ - wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask); -} - -/* - * Erratum #420 Instruction-Based Sampling Engine May Generate - * Interrupt that Cannot Be Cleared: - * - * Must clear counter mask first, then clear the enable bit. See - * Revision Guide for AMD Family 10h Processors, Publication #41322. - */ -static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs, - struct hw_perf_event *hwc, u64 config) -{ - config &= ~perf_ibs->cnt_mask; - wrmsrl(hwc->config_base, config); - config &= ~perf_ibs->enable_mask; - wrmsrl(hwc->config_base, config); -} - -/* - * We cannot restore the ibs pmu state, so we always needs to update - * the event while stopping it and then reset the state when starting - * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in - * perf_ibs_start()/perf_ibs_stop() and instead always do it. - */ -static void perf_ibs_start(struct perf_event *event, int flags) -{ - struct hw_perf_event *hwc = &event->hw; - struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); - struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - u64 period; - - if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) - return; - - WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); - hwc->state = 0; - - perf_ibs_set_period(perf_ibs, hwc, &period); - set_bit(IBS_STARTED, pcpu->state); - perf_ibs_enable_event(perf_ibs, hwc, period >> 4); - - perf_event_update_userpage(event); -} - -static void perf_ibs_stop(struct perf_event *event, int flags) -{ - struct hw_perf_event *hwc = &event->hw; - struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); - struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - u64 config; - int stopping; - - stopping = test_and_clear_bit(IBS_STARTED, pcpu->state); - - if (!stopping && (hwc->state & PERF_HES_UPTODATE)) - return; - - rdmsrl(hwc->config_base, config); - - if (stopping) { - set_bit(IBS_STOPPING, pcpu->state); - perf_ibs_disable_event(perf_ibs, hwc, config); - WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); - hwc->state |= PERF_HES_STOPPED; - } - - if (hwc->state & PERF_HES_UPTODATE) - return; - - /* - * Clear valid bit to not count rollovers on update, rollovers - * are only updated in the irq handler. - */ - config &= ~perf_ibs->valid_mask; - - perf_ibs_event_update(perf_ibs, event, &config); - hwc->state |= PERF_HES_UPTODATE; -} - -static int perf_ibs_add(struct perf_event *event, int flags) -{ - struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); - struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - - if (test_and_set_bit(IBS_ENABLED, pcpu->state)) - return -ENOSPC; - - event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - - pcpu->event = event; - - if (flags & PERF_EF_START) - perf_ibs_start(event, PERF_EF_RELOAD); - - return 0; -} - -static void perf_ibs_del(struct perf_event *event, int flags) -{ - struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); - struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - - if (!test_and_clear_bit(IBS_ENABLED, pcpu->state)) - return; - - perf_ibs_stop(event, PERF_EF_UPDATE); - - pcpu->event = NULL; - - perf_event_update_userpage(event); -} - -static void perf_ibs_read(struct perf_event *event) { } - -PMU_FORMAT_ATTR(rand_en, "config:57"); -PMU_FORMAT_ATTR(cnt_ctl, "config:19"); - -static struct attribute *ibs_fetch_format_attrs[] = { - &format_attr_rand_en.attr, - NULL, -}; - -static struct attribute *ibs_op_format_attrs[] = { - NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */ - NULL, -}; - -static struct perf_ibs perf_ibs_fetch = { - .pmu = { - .task_ctx_nr = perf_invalid_context, - - .event_init = perf_ibs_init, - .add = perf_ibs_add, - .del = perf_ibs_del, - .start = perf_ibs_start, - .stop = perf_ibs_stop, - .read = perf_ibs_read, - }, - .msr = MSR_AMD64_IBSFETCHCTL, - .config_mask = IBS_FETCH_CONFIG_MASK, - .cnt_mask = IBS_FETCH_MAX_CNT, - .enable_mask = IBS_FETCH_ENABLE, - .valid_mask = IBS_FETCH_VAL, - .max_period = IBS_FETCH_MAX_CNT << 4, - .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, - .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, - .format_attrs = ibs_fetch_format_attrs, - - .get_count = get_ibs_fetch_count, -}; - -static struct perf_ibs perf_ibs_op = { - .pmu = { - .task_ctx_nr = perf_invalid_context, - - .event_init = perf_ibs_init, - .add = perf_ibs_add, - .del = perf_ibs_del, - .start = perf_ibs_start, - .stop = perf_ibs_stop, - .read = perf_ibs_read, - }, - .msr = MSR_AMD64_IBSOPCTL, - .config_mask = IBS_OP_CONFIG_MASK, - .cnt_mask = IBS_OP_MAX_CNT, - .enable_mask = IBS_OP_ENABLE, - .valid_mask = IBS_OP_VAL, - .max_period = IBS_OP_MAX_CNT << 4, - .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, - .offset_max = MSR_AMD64_IBSOP_REG_COUNT, - .format_attrs = ibs_op_format_attrs, - - .get_count = get_ibs_op_count, -}; - -static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) -{ - struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - struct perf_event *event = pcpu->event; - struct hw_perf_event *hwc = &event->hw; - struct perf_sample_data data; - struct perf_raw_record raw; - struct pt_regs regs; - struct perf_ibs_data ibs_data; - int offset, size, check_rip, offset_max, throttle = 0; - unsigned int msr; - u64 *buf, *config, period; - - if (!test_bit(IBS_STARTED, pcpu->state)) { - /* - * Catch spurious interrupts after stopping IBS: After - * disabling IBS there could be still incoming NMIs - * with samples that even have the valid bit cleared. - * Mark all this NMIs as handled. - */ - return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0; - } - - msr = hwc->config_base; - buf = ibs_data.regs; - rdmsrl(msr, *buf); - if (!(*buf++ & perf_ibs->valid_mask)) - return 0; - - config = &ibs_data.regs[0]; - perf_ibs_event_update(perf_ibs, event, config); - perf_sample_data_init(&data, 0, hwc->last_period); - if (!perf_ibs_set_period(perf_ibs, hwc, &period)) - goto out; /* no sw counter overflow */ - - ibs_data.caps = ibs_caps; - size = 1; - offset = 1; - check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); - if (event->attr.sample_type & PERF_SAMPLE_RAW) - offset_max = perf_ibs->offset_max; - else if (check_rip) - offset_max = 2; - else - offset_max = 1; - do { - rdmsrl(msr + offset, *buf++); - size++; - offset = find_next_bit(perf_ibs->offset_mask, - perf_ibs->offset_max, - offset + 1); - } while (offset < offset_max); - if (event->attr.sample_type & PERF_SAMPLE_RAW) { - /* - * Read IbsBrTarget and IbsOpData4 separately - * depending on their availability. - * Can't add to offset_max as they are staggered - */ - if (ibs_caps & IBS_CAPS_BRNTRGT) { - rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++); - size++; - } - if (ibs_caps & IBS_CAPS_OPDATA4) { - rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++); - size++; - } - } - ibs_data.size = sizeof(u64) * size; - - regs = *iregs; - if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) { - regs.flags &= ~PERF_EFLAGS_EXACT; - } else { - set_linear_ip(®s, ibs_data.regs[1]); - regs.flags |= PERF_EFLAGS_EXACT; - } - - if (event->attr.sample_type & PERF_SAMPLE_RAW) { - raw.size = sizeof(u32) + ibs_data.size; - raw.data = ibs_data.data; - data.raw = &raw; - } - - throttle = perf_event_overflow(event, &data, ®s); -out: - if (throttle) - perf_ibs_disable_event(perf_ibs, hwc, *config); - else - perf_ibs_enable_event(perf_ibs, hwc, period >> 4); - - perf_event_update_userpage(event); - - return 1; -} - -static int -perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) -{ - int handled = 0; - - handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs); - handled += perf_ibs_handle_irq(&perf_ibs_op, regs); - - if (handled) - inc_irq_stat(apic_perf_irqs); - - return handled; -} -NOKPROBE_SYMBOL(perf_ibs_nmi_handler); - -static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) -{ - struct cpu_perf_ibs __percpu *pcpu; - int ret; - - pcpu = alloc_percpu(struct cpu_perf_ibs); - if (!pcpu) - return -ENOMEM; - - perf_ibs->pcpu = pcpu; - - /* register attributes */ - if (perf_ibs->format_attrs[0]) { - memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group)); - perf_ibs->format_group.name = "format"; - perf_ibs->format_group.attrs = perf_ibs->format_attrs; - - memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups)); - perf_ibs->attr_groups[0] = &perf_ibs->format_group; - perf_ibs->pmu.attr_groups = perf_ibs->attr_groups; - } - - ret = perf_pmu_register(&perf_ibs->pmu, name, -1); - if (ret) { - perf_ibs->pcpu = NULL; - free_percpu(pcpu); - } - - return ret; -} - -static __init int perf_event_ibs_init(void) -{ - struct attribute **attr = ibs_op_format_attrs; - - if (!ibs_caps) - return -ENODEV; /* ibs not supported by the cpu */ - - perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); - - if (ibs_caps & IBS_CAPS_OPCNT) { - perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; - *attr++ = &format_attr_cnt_ctl.attr; - } - perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); - - register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); - printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); - - return 0; -} - -#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */ - -static __init int perf_event_ibs_init(void) { return 0; } - -#endif - -/* IBS - apic initialization, for perf and oprofile */ - -static __init u32 __get_ibs_caps(void) -{ - u32 caps; - unsigned int max_level; - - if (!boot_cpu_has(X86_FEATURE_IBS)) - return 0; - - /* check IBS cpuid feature flags */ - max_level = cpuid_eax(0x80000000); - if (max_level < IBS_CPUID_FEATURES) - return IBS_CAPS_DEFAULT; - - caps = cpuid_eax(IBS_CPUID_FEATURES); - if (!(caps & IBS_CAPS_AVAIL)) - /* cpuid flags not valid */ - return IBS_CAPS_DEFAULT; - - return caps; -} - -u32 get_ibs_caps(void) -{ - return ibs_caps; -} - -EXPORT_SYMBOL(get_ibs_caps); - -static inline int get_eilvt(int offset) -{ - return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1); -} - -static inline int put_eilvt(int offset) -{ - return !setup_APIC_eilvt(offset, 0, 0, 1); -} - -/* - * Check and reserve APIC extended interrupt LVT offset for IBS if available. - */ -static inline int ibs_eilvt_valid(void) -{ - int offset; - u64 val; - int valid = 0; - - preempt_disable(); - - rdmsrl(MSR_AMD64_IBSCTL, val); - offset = val & IBSCTL_LVT_OFFSET_MASK; - - if (!(val & IBSCTL_LVT_OFFSET_VALID)) { - pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n", - smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); - goto out; - } - - if (!get_eilvt(offset)) { - pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n", - smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); - goto out; - } - - valid = 1; -out: - preempt_enable(); - - return valid; -} - -static int setup_ibs_ctl(int ibs_eilvt_off) -{ - struct pci_dev *cpu_cfg; - int nodes; - u32 value = 0; - - nodes = 0; - cpu_cfg = NULL; - do { - cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, - PCI_DEVICE_ID_AMD_10H_NB_MISC, - cpu_cfg); - if (!cpu_cfg) - break; - ++nodes; - pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off - | IBSCTL_LVT_OFFSET_VALID); - pci_read_config_dword(cpu_cfg, IBSCTL, &value); - if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) { - pci_dev_put(cpu_cfg); - printk(KERN_DEBUG "Failed to setup IBS LVT offset, " - "IBSCTL = 0x%08x\n", value); - return -EINVAL; - } - } while (1); - - if (!nodes) { - printk(KERN_DEBUG "No CPU node configured for IBS\n"); - return -ENODEV; - } - - return 0; -} - -/* - * This runs only on the current cpu. We try to find an LVT offset and - * setup the local APIC. For this we must disable preemption. On - * success we initialize all nodes with this offset. This updates then - * the offset in the IBS_CTL per-node msr. The per-core APIC setup of - * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that - * is using the new offset. - */ -static void force_ibs_eilvt_setup(void) -{ - int offset; - int ret; - - preempt_disable(); - /* find the next free available EILVT entry, skip offset 0 */ - for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) { - if (get_eilvt(offset)) - break; - } - preempt_enable(); - - if (offset == APIC_EILVT_NR_MAX) { - printk(KERN_DEBUG "No EILVT entry available\n"); - return; - } - - ret = setup_ibs_ctl(offset); - if (ret) - goto out; - - if (!ibs_eilvt_valid()) - goto out; - - pr_info("IBS: LVT offset %d assigned\n", offset); - - return; -out: - preempt_disable(); - put_eilvt(offset); - preempt_enable(); - return; -} - -static void ibs_eilvt_setup(void) -{ - /* - * Force LVT offset assignment for family 10h: The offsets are - * not assigned by the BIOS for this family, so the OS is - * responsible for doing it. If the OS assignment fails, fall - * back to BIOS settings and try to setup this. - */ - if (boot_cpu_data.x86 == 0x10) - force_ibs_eilvt_setup(); -} - -static inline int get_ibs_lvt_offset(void) -{ - u64 val; - - rdmsrl(MSR_AMD64_IBSCTL, val); - if (!(val & IBSCTL_LVT_OFFSET_VALID)) - return -EINVAL; - - return val & IBSCTL_LVT_OFFSET_MASK; -} - -static void setup_APIC_ibs(void *dummy) -{ - int offset; - - offset = get_ibs_lvt_offset(); - if (offset < 0) - goto failed; - - if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0)) - return; -failed: - pr_warn("perf: IBS APIC setup failed on cpu #%d\n", - smp_processor_id()); -} - -static void clear_APIC_ibs(void *dummy) -{ - int offset; - - offset = get_ibs_lvt_offset(); - if (offset >= 0) - setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); -} - -#ifdef CONFIG_PM - -static int perf_ibs_suspend(void) -{ - clear_APIC_ibs(NULL); - return 0; -} - -static void perf_ibs_resume(void) -{ - ibs_eilvt_setup(); - setup_APIC_ibs(NULL); -} - -static struct syscore_ops perf_ibs_syscore_ops = { - .resume = perf_ibs_resume, - .suspend = perf_ibs_suspend, -}; - -static void perf_ibs_pm_init(void) -{ - register_syscore_ops(&perf_ibs_syscore_ops); -} - -#else - -static inline void perf_ibs_pm_init(void) { } - -#endif - -static int -perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_STARTING: - setup_APIC_ibs(NULL); - break; - case CPU_DYING: - clear_APIC_ibs(NULL); - break; - default: - break; - } - - return NOTIFY_OK; -} - -static __init int amd_ibs_init(void) -{ - u32 caps; - int ret = -EINVAL; - - caps = __get_ibs_caps(); - if (!caps) - return -ENODEV; /* ibs not supported by the cpu */ - - ibs_eilvt_setup(); - - if (!ibs_eilvt_valid()) - goto out; - - perf_ibs_pm_init(); - cpu_notifier_register_begin(); - ibs_caps = caps; - /* make ibs_caps visible to other cpus: */ - smp_mb(); - smp_call_function(setup_APIC_ibs, NULL, 1); - __perf_cpu_notifier(perf_ibs_cpu_notifier); - cpu_notifier_register_done(); - - ret = perf_event_ibs_init(); -out: - if (ret) - pr_err("Failed to setup IBS, %d\n", ret); - return ret; -} - -/* Since we need the pci subsystem to init ibs we can't do this earlier: */ -device_initcall(amd_ibs_init); diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/kernel/cpu/perf_event_amd_iommu.c deleted file mode 100644 index 97242a9242bd..000000000000 --- a/arch/x86/kernel/cpu/perf_event_amd_iommu.c +++ /dev/null @@ -1,499 +0,0 @@ -/* - * Copyright (C) 2013 Advanced Micro Devices, Inc. - * - * Author: Steven Kinney <Steven.Kinney@amd.com> - * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com> - * - * Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/perf_event.h> -#include <linux/module.h> -#include <linux/cpumask.h> -#include <linux/slab.h> - -#include "perf_event.h" -#include "perf_event_amd_iommu.h" - -#define COUNTER_SHIFT 16 - -#define _GET_BANK(ev) ((u8)(ev->hw.extra_reg.reg >> 8)) -#define _GET_CNTR(ev) ((u8)(ev->hw.extra_reg.reg)) - -/* iommu pmu config masks */ -#define _GET_CSOURCE(ev) ((ev->hw.config & 0xFFULL)) -#define _GET_DEVID(ev) ((ev->hw.config >> 8) & 0xFFFFULL) -#define _GET_PASID(ev) ((ev->hw.config >> 24) & 0xFFFFULL) -#define _GET_DOMID(ev) ((ev->hw.config >> 40) & 0xFFFFULL) -#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config) & 0xFFFFULL) -#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL) -#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL) - -static struct perf_amd_iommu __perf_iommu; - -struct perf_amd_iommu { - struct pmu pmu; - u8 max_banks; - u8 max_counters; - u64 cntr_assign_mask; - raw_spinlock_t lock; - const struct attribute_group *attr_groups[4]; -}; - -#define format_group attr_groups[0] -#define cpumask_group attr_groups[1] -#define events_group attr_groups[2] -#define null_group attr_groups[3] - -/*--------------------------------------------- - * sysfs format attributes - *---------------------------------------------*/ -PMU_FORMAT_ATTR(csource, "config:0-7"); -PMU_FORMAT_ATTR(devid, "config:8-23"); -PMU_FORMAT_ATTR(pasid, "config:24-39"); -PMU_FORMAT_ATTR(domid, "config:40-55"); -PMU_FORMAT_ATTR(devid_mask, "config1:0-15"); -PMU_FORMAT_ATTR(pasid_mask, "config1:16-31"); -PMU_FORMAT_ATTR(domid_mask, "config1:32-47"); - -static struct attribute *iommu_format_attrs[] = { - &format_attr_csource.attr, - &format_attr_devid.attr, - &format_attr_pasid.attr, - &format_attr_domid.attr, - &format_attr_devid_mask.attr, - &format_attr_pasid_mask.attr, - &format_attr_domid_mask.attr, - NULL, -}; - -static struct attribute_group amd_iommu_format_group = { - .name = "format", - .attrs = iommu_format_attrs, -}; - -/*--------------------------------------------- - * sysfs events attributes - *---------------------------------------------*/ -struct amd_iommu_event_desc { - struct kobj_attribute attr; - const char *event; -}; - -static ssize_t _iommu_event_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct amd_iommu_event_desc *event = - container_of(attr, struct amd_iommu_event_desc, attr); - return sprintf(buf, "%s\n", event->event); -} - -#define AMD_IOMMU_EVENT_DESC(_name, _event) \ -{ \ - .attr = __ATTR(_name, 0444, _iommu_event_show, NULL), \ - .event = _event, \ -} - -static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = { - AMD_IOMMU_EVENT_DESC(mem_pass_untrans, "csource=0x01"), - AMD_IOMMU_EVENT_DESC(mem_pass_pretrans, "csource=0x02"), - AMD_IOMMU_EVENT_DESC(mem_pass_excl, "csource=0x03"), - AMD_IOMMU_EVENT_DESC(mem_target_abort, "csource=0x04"), - AMD_IOMMU_EVENT_DESC(mem_trans_total, "csource=0x05"), - AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_hit, "csource=0x06"), - AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_mis, "csource=0x07"), - AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_hit, "csource=0x08"), - AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_mis, "csource=0x09"), - AMD_IOMMU_EVENT_DESC(mem_dte_hit, "csource=0x0a"), - AMD_IOMMU_EVENT_DESC(mem_dte_mis, "csource=0x0b"), - AMD_IOMMU_EVENT_DESC(page_tbl_read_tot, "csource=0x0c"), - AMD_IOMMU_EVENT_DESC(page_tbl_read_nst, "csource=0x0d"), - AMD_IOMMU_EVENT_DESC(page_tbl_read_gst, "csource=0x0e"), - AMD_IOMMU_EVENT_DESC(int_dte_hit, "csource=0x0f"), - AMD_IOMMU_EVENT_DESC(int_dte_mis, "csource=0x10"), - AMD_IOMMU_EVENT_DESC(cmd_processed, "csource=0x11"), - AMD_IOMMU_EVENT_DESC(cmd_processed_inv, "csource=0x12"), - AMD_IOMMU_EVENT_DESC(tlb_inv, "csource=0x13"), - { /* end: all zeroes */ }, -}; - -/*--------------------------------------------- - * sysfs cpumask attributes - *---------------------------------------------*/ -static cpumask_t iommu_cpumask; - -static ssize_t _iommu_cpumask_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - return cpumap_print_to_pagebuf(true, buf, &iommu_cpumask); -} -static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL); - -static struct attribute *iommu_cpumask_attrs[] = { - &dev_attr_cpumask.attr, - NULL, -}; - -static struct attribute_group amd_iommu_cpumask_group = { - .attrs = iommu_cpumask_attrs, -}; - -/*---------------------------------------------*/ - -static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu) -{ - unsigned long flags; - int shift, bank, cntr, retval; - int max_banks = perf_iommu->max_banks; - int max_cntrs = perf_iommu->max_counters; - - raw_spin_lock_irqsave(&perf_iommu->lock, flags); - - for (bank = 0, shift = 0; bank < max_banks; bank++) { - for (cntr = 0; cntr < max_cntrs; cntr++) { - shift = bank + (bank*3) + cntr; - if (perf_iommu->cntr_assign_mask & (1ULL<<shift)) { - continue; - } else { - perf_iommu->cntr_assign_mask |= (1ULL<<shift); - retval = ((u16)((u16)bank<<8) | (u8)(cntr)); - goto out; - } - } - } - retval = -ENOSPC; -out: - raw_spin_unlock_irqrestore(&perf_iommu->lock, flags); - return retval; -} - -static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu, - u8 bank, u8 cntr) -{ - unsigned long flags; - int max_banks, max_cntrs; - int shift = 0; - - max_banks = perf_iommu->max_banks; - max_cntrs = perf_iommu->max_counters; - - if ((bank > max_banks) || (cntr > max_cntrs)) - return -EINVAL; - - shift = bank + cntr + (bank*3); - - raw_spin_lock_irqsave(&perf_iommu->lock, flags); - perf_iommu->cntr_assign_mask &= ~(1ULL<<shift); - raw_spin_unlock_irqrestore(&perf_iommu->lock, flags); - - return 0; -} - -static int perf_iommu_event_init(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct perf_amd_iommu *perf_iommu; - u64 config, config1; - - /* test the event attr type check for PMU enumeration */ - if (event->attr.type != event->pmu->type) - return -ENOENT; - - /* - * IOMMU counters are shared across all cores. - * Therefore, it does not support per-process mode. - * Also, it does not support event sampling mode. - */ - if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) - return -EINVAL; - - /* IOMMU counters do not have usr/os/guest/host bits */ - if (event->attr.exclude_user || event->attr.exclude_kernel || - event->attr.exclude_host || event->attr.exclude_guest) - return -EINVAL; - - if (event->cpu < 0) - return -EINVAL; - - perf_iommu = &__perf_iommu; - - if (event->pmu != &perf_iommu->pmu) - return -ENOENT; - - if (perf_iommu) { - config = event->attr.config; - config1 = event->attr.config1; - } else { - return -EINVAL; - } - - /* integrate with iommu base devid (0000), assume one iommu */ - perf_iommu->max_banks = - amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID); - perf_iommu->max_counters = - amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID); - if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0)) - return -EINVAL; - - /* update the hw_perf_event struct with the iommu config data */ - hwc->config = config; - hwc->extra_reg.config = config1; - - return 0; -} - -static void perf_iommu_enable_event(struct perf_event *ev) -{ - u8 csource = _GET_CSOURCE(ev); - u16 devid = _GET_DEVID(ev); - u64 reg = 0ULL; - - reg = csource; - amd_iommu_pc_get_set_reg_val(devid, - _GET_BANK(ev), _GET_CNTR(ev) , - IOMMU_PC_COUNTER_SRC_REG, ®, true); - - reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32); - if (reg) - reg |= (1UL << 31); - amd_iommu_pc_get_set_reg_val(devid, - _GET_BANK(ev), _GET_CNTR(ev) , - IOMMU_PC_DEVID_MATCH_REG, ®, true); - - reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32); - if (reg) - reg |= (1UL << 31); - amd_iommu_pc_get_set_reg_val(devid, - _GET_BANK(ev), _GET_CNTR(ev) , - IOMMU_PC_PASID_MATCH_REG, ®, true); - - reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32); - if (reg) - reg |= (1UL << 31); - amd_iommu_pc_get_set_reg_val(devid, - _GET_BANK(ev), _GET_CNTR(ev) , - IOMMU_PC_DOMID_MATCH_REG, ®, true); -} - -static void perf_iommu_disable_event(struct perf_event *event) -{ - u64 reg = 0ULL; - - amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), - _GET_BANK(event), _GET_CNTR(event), - IOMMU_PC_COUNTER_SRC_REG, ®, true); -} - -static void perf_iommu_start(struct perf_event *event, int flags) -{ - struct hw_perf_event *hwc = &event->hw; - - pr_debug("perf: amd_iommu:perf_iommu_start\n"); - if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) - return; - - WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); - hwc->state = 0; - - if (flags & PERF_EF_RELOAD) { - u64 prev_raw_count = local64_read(&hwc->prev_count); - amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), - _GET_BANK(event), _GET_CNTR(event), - IOMMU_PC_COUNTER_REG, &prev_raw_count, true); - } - - perf_iommu_enable_event(event); - perf_event_update_userpage(event); - -} - -static void perf_iommu_read(struct perf_event *event) -{ - u64 count = 0ULL; - u64 prev_raw_count = 0ULL; - u64 delta = 0ULL; - struct hw_perf_event *hwc = &event->hw; - pr_debug("perf: amd_iommu:perf_iommu_read\n"); - - amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), - _GET_BANK(event), _GET_CNTR(event), - IOMMU_PC_COUNTER_REG, &count, false); - - /* IOMMU pc counter register is only 48 bits */ - count &= 0xFFFFFFFFFFFFULL; - - prev_raw_count = local64_read(&hwc->prev_count); - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - count) != prev_raw_count) - return; - - /* Handling 48-bit counter overflowing */ - delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT); - delta >>= COUNTER_SHIFT; - local64_add(delta, &event->count); - -} - -static void perf_iommu_stop(struct perf_event *event, int flags) -{ - struct hw_perf_event *hwc = &event->hw; - u64 config; - - pr_debug("perf: amd_iommu:perf_iommu_stop\n"); - - if (hwc->state & PERF_HES_UPTODATE) - return; - - perf_iommu_disable_event(event); - WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); - hwc->state |= PERF_HES_STOPPED; - - if (hwc->state & PERF_HES_UPTODATE) - return; - - config = hwc->config; - perf_iommu_read(event); - hwc->state |= PERF_HES_UPTODATE; -} - -static int perf_iommu_add(struct perf_event *event, int flags) -{ - int retval; - struct perf_amd_iommu *perf_iommu = - container_of(event->pmu, struct perf_amd_iommu, pmu); - - pr_debug("perf: amd_iommu:perf_iommu_add\n"); - event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - - /* request an iommu bank/counter */ - retval = get_next_avail_iommu_bnk_cntr(perf_iommu); - if (retval != -ENOSPC) - event->hw.extra_reg.reg = (u16)retval; - else - return retval; - - if (flags & PERF_EF_START) - perf_iommu_start(event, PERF_EF_RELOAD); - - return 0; -} - -static void perf_iommu_del(struct perf_event *event, int flags) -{ - struct perf_amd_iommu *perf_iommu = - container_of(event->pmu, struct perf_amd_iommu, pmu); - - pr_debug("perf: amd_iommu:perf_iommu_del\n"); - perf_iommu_stop(event, PERF_EF_UPDATE); - - /* clear the assigned iommu bank/counter */ - clear_avail_iommu_bnk_cntr(perf_iommu, - _GET_BANK(event), - _GET_CNTR(event)); - - perf_event_update_userpage(event); -} - -static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu) -{ - struct attribute **attrs; - struct attribute_group *attr_group; - int i = 0, j; - - while (amd_iommu_v2_event_descs[i].attr.attr.name) - i++; - - attr_group = kzalloc(sizeof(struct attribute *) - * (i + 1) + sizeof(*attr_group), GFP_KERNEL); - if (!attr_group) - return -ENOMEM; - - attrs = (struct attribute **)(attr_group + 1); - for (j = 0; j < i; j++) - attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr; - - attr_group->name = "events"; - attr_group->attrs = attrs; - perf_iommu->events_group = attr_group; - - return 0; -} - -static __init void amd_iommu_pc_exit(void) -{ - if (__perf_iommu.events_group != NULL) { - kfree(__perf_iommu.events_group); - __perf_iommu.events_group = NULL; - } -} - -static __init int _init_perf_amd_iommu( - struct perf_amd_iommu *perf_iommu, char *name) -{ - int ret; - - raw_spin_lock_init(&perf_iommu->lock); - - /* Init format attributes */ - perf_iommu->format_group = &amd_iommu_format_group; - - /* Init cpumask attributes to only core 0 */ - cpumask_set_cpu(0, &iommu_cpumask); - perf_iommu->cpumask_group = &amd_iommu_cpumask_group; - - /* Init events attributes */ - if (_init_events_attrs(perf_iommu) != 0) - pr_err("perf: amd_iommu: Only support raw events.\n"); - - /* Init null attributes */ - perf_iommu->null_group = NULL; - perf_iommu->pmu.attr_groups = perf_iommu->attr_groups; - - ret = perf_pmu_register(&perf_iommu->pmu, name, -1); - if (ret) { - pr_err("perf: amd_iommu: Failed to initialized.\n"); - amd_iommu_pc_exit(); - } else { - pr_info("perf: amd_iommu: Detected. (%d banks, %d counters/bank)\n", - amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID), - amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID)); - } - - return ret; -} - -static struct perf_amd_iommu __perf_iommu = { - .pmu = { - .event_init = perf_iommu_event_init, - .add = perf_iommu_add, - .del = perf_iommu_del, - .start = perf_iommu_start, - .stop = perf_iommu_stop, - .read = perf_iommu_read, - }, - .max_banks = 0x00, - .max_counters = 0x00, - .cntr_assign_mask = 0ULL, - .format_group = NULL, - .cpumask_group = NULL, - .events_group = NULL, - .null_group = NULL, -}; - -static __init int amd_iommu_pc_init(void) -{ - /* Make sure the IOMMU PC resource is available */ - if (!amd_iommu_pc_supported()) - return -ENODEV; - - _init_perf_amd_iommu(&__perf_iommu, "amd_iommu"); - - return 0; -} - -device_initcall(amd_iommu_pc_init); diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.h b/arch/x86/kernel/cpu/perf_event_amd_iommu.h deleted file mode 100644 index 845d173278e3..000000000000 --- a/arch/x86/kernel/cpu/perf_event_amd_iommu.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (C) 2013 Advanced Micro Devices, Inc. - * - * Author: Steven Kinney <Steven.Kinney@amd.com> - * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef _PERF_EVENT_AMD_IOMMU_H_ -#define _PERF_EVENT_AMD_IOMMU_H_ - -/* iommu pc mmio region register indexes */ -#define IOMMU_PC_COUNTER_REG 0x00 -#define IOMMU_PC_COUNTER_SRC_REG 0x08 -#define IOMMU_PC_PASID_MATCH_REG 0x10 -#define IOMMU_PC_DOMID_MATCH_REG 0x18 -#define IOMMU_PC_DEVID_MATCH_REG 0x20 -#define IOMMU_PC_COUNTER_REPORT_REG 0x28 - -/* maximun specified bank/counters */ -#define PC_MAX_SPEC_BNKS 64 -#define PC_MAX_SPEC_CNTRS 16 - -/* iommu pc reg masks*/ -#define IOMMU_BASE_DEVID 0x0000 - -/* amd_iommu_init.c external support functions */ -extern bool amd_iommu_pc_supported(void); - -extern u8 amd_iommu_pc_get_max_banks(u16 devid); - -extern u8 amd_iommu_pc_get_max_counters(u16 devid); - -extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, - u8 fxn, u64 *value, bool is_write); - -#endif /*_PERF_EVENT_AMD_IOMMU_H_*/ diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c deleted file mode 100644 index 8836fc9fa84b..000000000000 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ /dev/null @@ -1,603 +0,0 @@ -/* - * Copyright (C) 2013 Advanced Micro Devices, Inc. - * - * Author: Jacob Shin <jacob.shin@amd.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/perf_event.h> -#include <linux/percpu.h> -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/cpu.h> -#include <linux/cpumask.h> - -#include <asm/cpufeature.h> -#include <asm/perf_event.h> -#include <asm/msr.h> - -#define NUM_COUNTERS_NB 4 -#define NUM_COUNTERS_L2 4 -#define MAX_COUNTERS NUM_COUNTERS_NB - -#define RDPMC_BASE_NB 6 -#define RDPMC_BASE_L2 10 - -#define COUNTER_SHIFT 16 - -struct amd_uncore { - int id; - int refcnt; - int cpu; - int num_counters; - int rdpmc_base; - u32 msr_base; - cpumask_t *active_mask; - struct pmu *pmu; - struct perf_event *events[MAX_COUNTERS]; - struct amd_uncore *free_when_cpu_online; -}; - -static struct amd_uncore * __percpu *amd_uncore_nb; -static struct amd_uncore * __percpu *amd_uncore_l2; - -static struct pmu amd_nb_pmu; -static struct pmu amd_l2_pmu; - -static cpumask_t amd_nb_active_mask; -static cpumask_t amd_l2_active_mask; - -static bool is_nb_event(struct perf_event *event) -{ - return event->pmu->type == amd_nb_pmu.type; -} - -static bool is_l2_event(struct perf_event *event) -{ - return event->pmu->type == amd_l2_pmu.type; -} - -static struct amd_uncore *event_to_amd_uncore(struct perf_event *event) -{ - if (is_nb_event(event) && amd_uncore_nb) - return *per_cpu_ptr(amd_uncore_nb, event->cpu); - else if (is_l2_event(event) && amd_uncore_l2) - return *per_cpu_ptr(amd_uncore_l2, event->cpu); - - return NULL; -} - -static void amd_uncore_read(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 prev, new; - s64 delta; - - /* - * since we do not enable counter overflow interrupts, - * we do not have to worry about prev_count changing on us - */ - - prev = local64_read(&hwc->prev_count); - rdpmcl(hwc->event_base_rdpmc, new); - local64_set(&hwc->prev_count, new); - delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT); - delta >>= COUNTER_SHIFT; - local64_add(delta, &event->count); -} - -static void amd_uncore_start(struct perf_event *event, int flags) -{ - struct hw_perf_event *hwc = &event->hw; - - if (flags & PERF_EF_RELOAD) - wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count)); - - hwc->state = 0; - wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE)); - perf_event_update_userpage(event); -} - -static void amd_uncore_stop(struct perf_event *event, int flags) -{ - struct hw_perf_event *hwc = &event->hw; - - wrmsrl(hwc->config_base, hwc->config); - hwc->state |= PERF_HES_STOPPED; - - if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - amd_uncore_read(event); - hwc->state |= PERF_HES_UPTODATE; - } -} - -static int amd_uncore_add(struct perf_event *event, int flags) -{ - int i; - struct amd_uncore *uncore = event_to_amd_uncore(event); - struct hw_perf_event *hwc = &event->hw; - - /* are we already assigned? */ - if (hwc->idx != -1 && uncore->events[hwc->idx] == event) - goto out; - - for (i = 0; i < uncore->num_counters; i++) { - if (uncore->events[i] == event) { - hwc->idx = i; - goto out; - } - } - - /* if not, take the first available counter */ - hwc->idx = -1; - for (i = 0; i < uncore->num_counters; i++) { - if (cmpxchg(&uncore->events[i], NULL, event) == NULL) { - hwc->idx = i; - break; - } - } - -out: - if (hwc->idx == -1) - return -EBUSY; - - hwc->config_base = uncore->msr_base + (2 * hwc->idx); - hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx); - hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx; - hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - - if (flags & PERF_EF_START) - amd_uncore_start(event, PERF_EF_RELOAD); - - return 0; -} - -static void amd_uncore_del(struct perf_event *event, int flags) -{ - int i; - struct amd_uncore *uncore = event_to_amd_uncore(event); - struct hw_perf_event *hwc = &event->hw; - - amd_uncore_stop(event, PERF_EF_UPDATE); - - for (i = 0; i < uncore->num_counters; i++) { - if (cmpxchg(&uncore->events[i], event, NULL) == event) - break; - } - - hwc->idx = -1; -} - -static int amd_uncore_event_init(struct perf_event *event) -{ - struct amd_uncore *uncore; - struct hw_perf_event *hwc = &event->hw; - - if (event->attr.type != event->pmu->type) - return -ENOENT; - - /* - * NB and L2 counters (MSRs) are shared across all cores that share the - * same NB / L2 cache. Interrupts can be directed to a single target - * core, however, event counts generated by processes running on other - * cores cannot be masked out. So we do not support sampling and - * per-thread events. - */ - if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) - return -EINVAL; - - /* NB and L2 counters do not have usr/os/guest/host bits */ - if (event->attr.exclude_user || event->attr.exclude_kernel || - event->attr.exclude_host || event->attr.exclude_guest) - return -EINVAL; - - /* and we do not enable counter overflow interrupts */ - hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB; - hwc->idx = -1; - - if (event->cpu < 0) - return -EINVAL; - - uncore = event_to_amd_uncore(event); - if (!uncore) - return -ENODEV; - - /* - * since request can come in to any of the shared cores, we will remap - * to a single common cpu. - */ - event->cpu = uncore->cpu; - - return 0; -} - -static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - cpumask_t *active_mask; - struct pmu *pmu = dev_get_drvdata(dev); - - if (pmu->type == amd_nb_pmu.type) - active_mask = &amd_nb_active_mask; - else if (pmu->type == amd_l2_pmu.type) - active_mask = &amd_l2_active_mask; - else - return 0; - - return cpumap_print_to_pagebuf(true, buf, active_mask); -} -static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL); - -static struct attribute *amd_uncore_attrs[] = { - &dev_attr_cpumask.attr, - NULL, -}; - -static struct attribute_group amd_uncore_attr_group = { - .attrs = amd_uncore_attrs, -}; - -PMU_FORMAT_ATTR(event, "config:0-7,32-35"); -PMU_FORMAT_ATTR(umask, "config:8-15"); - -static struct attribute *amd_uncore_format_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - NULL, -}; - -static struct attribute_group amd_uncore_format_group = { - .name = "format", - .attrs = amd_uncore_format_attr, -}; - -static const struct attribute_group *amd_uncore_attr_groups[] = { - &amd_uncore_attr_group, - &amd_uncore_format_group, - NULL, -}; - -static struct pmu amd_nb_pmu = { - .attr_groups = amd_uncore_attr_groups, - .name = "amd_nb", - .event_init = amd_uncore_event_init, - .add = amd_uncore_add, - .del = amd_uncore_del, - .start = amd_uncore_start, - .stop = amd_uncore_stop, - .read = amd_uncore_read, -}; - -static struct pmu amd_l2_pmu = { - .attr_groups = amd_uncore_attr_groups, - .name = "amd_l2", - .event_init = amd_uncore_event_init, - .add = amd_uncore_add, - .del = amd_uncore_del, - .start = amd_uncore_start, - .stop = amd_uncore_stop, - .read = amd_uncore_read, -}; - -static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) -{ - return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL, - cpu_to_node(cpu)); -} - -static int amd_uncore_cpu_up_prepare(unsigned int cpu) -{ - struct amd_uncore *uncore_nb = NULL, *uncore_l2; - - if (amd_uncore_nb) { - uncore_nb = amd_uncore_alloc(cpu); - if (!uncore_nb) - goto fail; - uncore_nb->cpu = cpu; - uncore_nb->num_counters = NUM_COUNTERS_NB; - uncore_nb->rdpmc_base = RDPMC_BASE_NB; - uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL; - uncore_nb->active_mask = &amd_nb_active_mask; - uncore_nb->pmu = &amd_nb_pmu; - *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb; - } - - if (amd_uncore_l2) { - uncore_l2 = amd_uncore_alloc(cpu); - if (!uncore_l2) - goto fail; - uncore_l2->cpu = cpu; - uncore_l2->num_counters = NUM_COUNTERS_L2; - uncore_l2->rdpmc_base = RDPMC_BASE_L2; - uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL; - uncore_l2->active_mask = &amd_l2_active_mask; - uncore_l2->pmu = &amd_l2_pmu; - *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2; - } - - return 0; - -fail: - if (amd_uncore_nb) - *per_cpu_ptr(amd_uncore_nb, cpu) = NULL; - kfree(uncore_nb); - return -ENOMEM; -} - -static struct amd_uncore * -amd_uncore_find_online_sibling(struct amd_uncore *this, - struct amd_uncore * __percpu *uncores) -{ - unsigned int cpu; - struct amd_uncore *that; - - for_each_online_cpu(cpu) { - that = *per_cpu_ptr(uncores, cpu); - - if (!that) - continue; - - if (this == that) - continue; - - if (this->id == that->id) { - that->free_when_cpu_online = this; - this = that; - break; - } - } - - this->refcnt++; - return this; -} - -static void amd_uncore_cpu_starting(unsigned int cpu) -{ - unsigned int eax, ebx, ecx, edx; - struct amd_uncore *uncore; - - if (amd_uncore_nb) { - uncore = *per_cpu_ptr(amd_uncore_nb, cpu); - cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - uncore->id = ecx & 0xff; - - uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_nb); - *per_cpu_ptr(amd_uncore_nb, cpu) = uncore; - } - - if (amd_uncore_l2) { - unsigned int apicid = cpu_data(cpu).apicid; - unsigned int nshared; - - uncore = *per_cpu_ptr(amd_uncore_l2, cpu); - cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx); - nshared = ((eax >> 14) & 0xfff) + 1; - uncore->id = apicid - (apicid % nshared); - - uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2); - *per_cpu_ptr(amd_uncore_l2, cpu) = uncore; - } -} - -static void uncore_online(unsigned int cpu, - struct amd_uncore * __percpu *uncores) -{ - struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); - - kfree(uncore->free_when_cpu_online); - uncore->free_when_cpu_online = NULL; - - if (cpu == uncore->cpu) - cpumask_set_cpu(cpu, uncore->active_mask); -} - -static void amd_uncore_cpu_online(unsigned int cpu) -{ - if (amd_uncore_nb) - uncore_online(cpu, amd_uncore_nb); - - if (amd_uncore_l2) - uncore_online(cpu, amd_uncore_l2); -} - -static void uncore_down_prepare(unsigned int cpu, - struct amd_uncore * __percpu *uncores) -{ - unsigned int i; - struct amd_uncore *this = *per_cpu_ptr(uncores, cpu); - - if (this->cpu != cpu) - return; - - /* this cpu is going down, migrate to a shared sibling if possible */ - for_each_online_cpu(i) { - struct amd_uncore *that = *per_cpu_ptr(uncores, i); - - if (cpu == i) - continue; - - if (this == that) { - perf_pmu_migrate_context(this->pmu, cpu, i); - cpumask_clear_cpu(cpu, that->active_mask); - cpumask_set_cpu(i, that->active_mask); - that->cpu = i; - break; - } - } -} - -static void amd_uncore_cpu_down_prepare(unsigned int cpu) -{ - if (amd_uncore_nb) - uncore_down_prepare(cpu, amd_uncore_nb); - - if (amd_uncore_l2) - uncore_down_prepare(cpu, amd_uncore_l2); -} - -static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores) -{ - struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); - - if (cpu == uncore->cpu) - cpumask_clear_cpu(cpu, uncore->active_mask); - - if (!--uncore->refcnt) - kfree(uncore); - *per_cpu_ptr(uncores, cpu) = NULL; -} - -static void amd_uncore_cpu_dead(unsigned int cpu) -{ - if (amd_uncore_nb) - uncore_dead(cpu, amd_uncore_nb); - - if (amd_uncore_l2) - uncore_dead(cpu, amd_uncore_l2); -} - -static int -amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action, - void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - if (amd_uncore_cpu_up_prepare(cpu)) - return notifier_from_errno(-ENOMEM); - break; - - case CPU_STARTING: - amd_uncore_cpu_starting(cpu); - break; - - case CPU_ONLINE: - amd_uncore_cpu_online(cpu); - break; - - case CPU_DOWN_PREPARE: - amd_uncore_cpu_down_prepare(cpu); - break; - - case CPU_UP_CANCELED: - case CPU_DEAD: - amd_uncore_cpu_dead(cpu); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block amd_uncore_cpu_notifier_block = { - .notifier_call = amd_uncore_cpu_notifier, - .priority = CPU_PRI_PERF + 1, -}; - -static void __init init_cpu_already_online(void *dummy) -{ - unsigned int cpu = smp_processor_id(); - - amd_uncore_cpu_starting(cpu); - amd_uncore_cpu_online(cpu); -} - -static void cleanup_cpu_online(void *dummy) -{ - unsigned int cpu = smp_processor_id(); - - amd_uncore_cpu_dead(cpu); -} - -static int __init amd_uncore_init(void) -{ - unsigned int cpu, cpu2; - int ret = -ENODEV; - - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) - goto fail_nodev; - - if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) - goto fail_nodev; - - if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { - amd_uncore_nb = alloc_percpu(struct amd_uncore *); - if (!amd_uncore_nb) { - ret = -ENOMEM; - goto fail_nb; - } - ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1); - if (ret) - goto fail_nb; - - printk(KERN_INFO "perf: AMD NB counters detected\n"); - ret = 0; - } - - if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) { - amd_uncore_l2 = alloc_percpu(struct amd_uncore *); - if (!amd_uncore_l2) { - ret = -ENOMEM; - goto fail_l2; - } - ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1); - if (ret) - goto fail_l2; - - printk(KERN_INFO "perf: AMD L2I counters detected\n"); - ret = 0; - } - - if (ret) - goto fail_nodev; - - cpu_notifier_register_begin(); - - /* init cpus already online before registering for hotplug notifier */ - for_each_online_cpu(cpu) { - ret = amd_uncore_cpu_up_prepare(cpu); - if (ret) - goto fail_online; - smp_call_function_single(cpu, init_cpu_already_online, NULL, 1); - } - - __register_cpu_notifier(&amd_uncore_cpu_notifier_block); - cpu_notifier_register_done(); - - return 0; - - -fail_online: - for_each_online_cpu(cpu2) { - if (cpu2 == cpu) - break; - smp_call_function_single(cpu, cleanup_cpu_online, NULL, 1); - } - cpu_notifier_register_done(); - - /* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */ - amd_uncore_nb = amd_uncore_l2 = NULL; - - if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) - perf_pmu_unregister(&amd_l2_pmu); -fail_l2: - if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) - perf_pmu_unregister(&amd_nb_pmu); - if (amd_uncore_l2) - free_percpu(amd_uncore_l2); -fail_nb: - if (amd_uncore_nb) - free_percpu(amd_uncore_nb); - -fail_nodev: - return ret; -} -device_initcall(amd_uncore_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c deleted file mode 100644 index fed2ab1f1065..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ /dev/null @@ -1,3773 +0,0 @@ -/* - * Per core/cpu state - * - * Used to coordinate shared registers between HT threads or - * among events on a single PMU. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/stddef.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/export.h> -#include <linux/nmi.h> - -#include <asm/cpufeature.h> -#include <asm/hardirq.h> -#include <asm/apic.h> - -#include "perf_event.h" - -/* - * Intel PerfMon, used on Core and later. - */ -static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, - [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ -}; - -static struct event_constraint intel_core_event_constraints[] __read_mostly = -{ - INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ - INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ - INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_core2_event_constraints[] __read_mostly = -{ - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ - INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ - INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ - INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ - INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */ - INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = -{ - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ - INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ - INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ - INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ - INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */ - INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ - INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ - INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ - EVENT_CONSTRAINT_END -}; - -static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = -{ - /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), - INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), - EVENT_EXTRA_END -}; - -static struct event_constraint intel_westmere_event_constraints[] __read_mostly = -{ - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ - INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ - INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ - INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_snb_event_constraints[] __read_mostly = -{ - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */ - INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ - INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ - INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ - INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */ - INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ - - INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ - - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_ivb_event_constraints[] __read_mostly = -{ - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */ - INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */ - INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ - INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ - - INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ - - EVENT_CONSTRAINT_END -}; - -static struct extra_reg intel_westmere_extra_regs[] __read_mostly = -{ - /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1), - INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), - EVENT_EXTRA_END -}; - -static struct event_constraint intel_v1_event_constraints[] __read_mostly = -{ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_gen_event_constraints[] __read_mostly = -{ - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_slm_event_constraints[] __read_mostly = -{ - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */ - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_skl_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ - EVENT_CONSTRAINT_END -}; - -static struct extra_reg intel_knl_extra_regs[] __read_mostly = { - INTEL_UEVENT_EXTRA_REG(0x01b7, - MSR_OFFCORE_RSP_0, 0x7f9ffbffffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x02b7, - MSR_OFFCORE_RSP_1, 0x3f9ffbffffull, RSP_1), - EVENT_EXTRA_END -}; - -static struct extra_reg intel_snb_extra_regs[] __read_mostly = { - /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1), - INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), - EVENT_EXTRA_END -}; - -static struct extra_reg intel_snbep_extra_regs[] __read_mostly = { - /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), - INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), - EVENT_EXTRA_END -}; - -static struct extra_reg intel_skl_extra_regs[] __read_mostly = { - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), - INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), - /* - * Note the low 8 bits eventsel code is not a continuous field, containing - * some #GPing bits. These are masked out. - */ - INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE), - EVENT_EXTRA_END -}; - -EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); -EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); -EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); - -struct attribute *nhm_events_attrs[] = { - EVENT_PTR(mem_ld_nhm), - NULL, -}; - -struct attribute *snb_events_attrs[] = { - EVENT_PTR(mem_ld_snb), - EVENT_PTR(mem_st_snb), - NULL, -}; - -static struct event_constraint intel_hsw_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ - INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ - /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), - /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), - /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ - INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), - - INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ - - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_bdw_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */ - EVENT_CONSTRAINT_END -}; - -static u64 intel_pmu_event_map(int hw_event) -{ - return intel_perfmon_event_map[hw_event]; -} - -/* - * Notes on the events: - * - data reads do not include code reads (comparable to earlier tables) - * - data counts include speculative execution (except L1 write, dtlb, bpu) - * - remote node access includes remote memory, remote cache, remote mmio. - * - prefetches are not included in the counts. - * - icache miss does not include decoded icache - */ - -#define SKL_DEMAND_DATA_RD BIT_ULL(0) -#define SKL_DEMAND_RFO BIT_ULL(1) -#define SKL_ANY_RESPONSE BIT_ULL(16) -#define SKL_SUPPLIER_NONE BIT_ULL(17) -#define SKL_L3_MISS_LOCAL_DRAM BIT_ULL(26) -#define SKL_L3_MISS_REMOTE_HOP0_DRAM BIT_ULL(27) -#define SKL_L3_MISS_REMOTE_HOP1_DRAM BIT_ULL(28) -#define SKL_L3_MISS_REMOTE_HOP2P_DRAM BIT_ULL(29) -#define SKL_L3_MISS (SKL_L3_MISS_LOCAL_DRAM| \ - SKL_L3_MISS_REMOTE_HOP0_DRAM| \ - SKL_L3_MISS_REMOTE_HOP1_DRAM| \ - SKL_L3_MISS_REMOTE_HOP2P_DRAM) -#define SKL_SPL_HIT BIT_ULL(30) -#define SKL_SNOOP_NONE BIT_ULL(31) -#define SKL_SNOOP_NOT_NEEDED BIT_ULL(32) -#define SKL_SNOOP_MISS BIT_ULL(33) -#define SKL_SNOOP_HIT_NO_FWD BIT_ULL(34) -#define SKL_SNOOP_HIT_WITH_FWD BIT_ULL(35) -#define SKL_SNOOP_HITM BIT_ULL(36) -#define SKL_SNOOP_NON_DRAM BIT_ULL(37) -#define SKL_ANY_SNOOP (SKL_SPL_HIT|SKL_SNOOP_NONE| \ - SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \ - SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \ - SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM) -#define SKL_DEMAND_READ SKL_DEMAND_DATA_RD -#define SKL_SNOOP_DRAM (SKL_SNOOP_NONE| \ - SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \ - SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \ - SKL_SNOOP_HITM|SKL_SPL_HIT) -#define SKL_DEMAND_WRITE SKL_DEMAND_RFO -#define SKL_LLC_ACCESS SKL_ANY_RESPONSE -#define SKL_L3_MISS_REMOTE (SKL_L3_MISS_REMOTE_HOP0_DRAM| \ - SKL_L3_MISS_REMOTE_HOP1_DRAM| \ - SKL_L3_MISS_REMOTE_HOP2P_DRAM) - -static __initconst const u64 skl_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */ - [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */ - [ C(RESULT_MISS) ] = 0x0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x283, /* ICACHE_64B.MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */ - [ C(RESULT_MISS) ] = 0x608, /* DTLB_LOAD_MISSES.WALK_COMPLETED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */ - [ C(RESULT_MISS) ] = 0x649, /* DTLB_STORE_MISSES.WALK_COMPLETED */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2085, /* ITLB_MISSES.STLB_HIT */ - [ C(RESULT_MISS) ] = 0xe85, /* ITLB_MISSES.WALK_COMPLETED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, -}; - -static __initconst const u64 skl_hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ| - SKL_LLC_ACCESS|SKL_ANY_SNOOP, - [ C(RESULT_MISS) ] = SKL_DEMAND_READ| - SKL_L3_MISS|SKL_ANY_SNOOP| - SKL_SUPPLIER_NONE, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE| - SKL_LLC_ACCESS|SKL_ANY_SNOOP, - [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE| - SKL_L3_MISS|SKL_ANY_SNOOP| - SKL_SUPPLIER_NONE, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ| - SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM, - [ C(RESULT_MISS) ] = SKL_DEMAND_READ| - SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE| - SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM, - [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE| - SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, -}; - -#define SNB_DMND_DATA_RD (1ULL << 0) -#define SNB_DMND_RFO (1ULL << 1) -#define SNB_DMND_IFETCH (1ULL << 2) -#define SNB_DMND_WB (1ULL << 3) -#define SNB_PF_DATA_RD (1ULL << 4) -#define SNB_PF_RFO (1ULL << 5) -#define SNB_PF_IFETCH (1ULL << 6) -#define SNB_LLC_DATA_RD (1ULL << 7) -#define SNB_LLC_RFO (1ULL << 8) -#define SNB_LLC_IFETCH (1ULL << 9) -#define SNB_BUS_LOCKS (1ULL << 10) -#define SNB_STRM_ST (1ULL << 11) -#define SNB_OTHER (1ULL << 15) -#define SNB_RESP_ANY (1ULL << 16) -#define SNB_NO_SUPP (1ULL << 17) -#define SNB_LLC_HITM (1ULL << 18) -#define SNB_LLC_HITE (1ULL << 19) -#define SNB_LLC_HITS (1ULL << 20) -#define SNB_LLC_HITF (1ULL << 21) -#define SNB_LOCAL (1ULL << 22) -#define SNB_REMOTE (0xffULL << 23) -#define SNB_SNP_NONE (1ULL << 31) -#define SNB_SNP_NOT_NEEDED (1ULL << 32) -#define SNB_SNP_MISS (1ULL << 33) -#define SNB_NO_FWD (1ULL << 34) -#define SNB_SNP_FWD (1ULL << 35) -#define SNB_HITM (1ULL << 36) -#define SNB_NON_DRAM (1ULL << 37) - -#define SNB_DMND_READ (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD) -#define SNB_DMND_WRITE (SNB_DMND_RFO|SNB_LLC_RFO) -#define SNB_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) - -#define SNB_SNP_ANY (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \ - SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \ - SNB_HITM) - -#define SNB_DRAM_ANY (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY) -#define SNB_DRAM_REMOTE (SNB_REMOTE|SNB_SNP_ANY) - -#define SNB_L3_ACCESS SNB_RESP_ANY -#define SNB_L3_MISS (SNB_DRAM_ANY|SNB_NON_DRAM) - -static __initconst const u64 snb_hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS, - [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_L3_MISS, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS, - [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_L3_MISS, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS, - [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_L3_MISS, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY, - [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_DRAM_REMOTE, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY, - [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY, - [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE, - }, - }, -}; - -static __initconst const u64 snb_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */ - [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */ - [ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_WRITE) ] = { - /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_PREFETCH) ] = { - /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */ - [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */ - [ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */ - [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - }, - -}; - -/* - * Notes on the events: - * - data reads do not include code reads (comparable to earlier tables) - * - data counts include speculative execution (except L1 write, dtlb, bpu) - * - remote node access includes remote memory, remote cache, remote mmio. - * - prefetches are not included in the counts because they are not - * reliably counted. - */ - -#define HSW_DEMAND_DATA_RD BIT_ULL(0) -#define HSW_DEMAND_RFO BIT_ULL(1) -#define HSW_ANY_RESPONSE BIT_ULL(16) -#define HSW_SUPPLIER_NONE BIT_ULL(17) -#define HSW_L3_MISS_LOCAL_DRAM BIT_ULL(22) -#define HSW_L3_MISS_REMOTE_HOP0 BIT_ULL(27) -#define HSW_L3_MISS_REMOTE_HOP1 BIT_ULL(28) -#define HSW_L3_MISS_REMOTE_HOP2P BIT_ULL(29) -#define HSW_L3_MISS (HSW_L3_MISS_LOCAL_DRAM| \ - HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \ - HSW_L3_MISS_REMOTE_HOP2P) -#define HSW_SNOOP_NONE BIT_ULL(31) -#define HSW_SNOOP_NOT_NEEDED BIT_ULL(32) -#define HSW_SNOOP_MISS BIT_ULL(33) -#define HSW_SNOOP_HIT_NO_FWD BIT_ULL(34) -#define HSW_SNOOP_HIT_WITH_FWD BIT_ULL(35) -#define HSW_SNOOP_HITM BIT_ULL(36) -#define HSW_SNOOP_NON_DRAM BIT_ULL(37) -#define HSW_ANY_SNOOP (HSW_SNOOP_NONE| \ - HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \ - HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \ - HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM) -#define HSW_SNOOP_DRAM (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM) -#define HSW_DEMAND_READ HSW_DEMAND_DATA_RD -#define HSW_DEMAND_WRITE HSW_DEMAND_RFO -#define HSW_L3_MISS_REMOTE (HSW_L3_MISS_REMOTE_HOP0|\ - HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P) -#define HSW_LLC_ACCESS HSW_ANY_RESPONSE - -#define BDW_L3_MISS_LOCAL BIT(26) -#define BDW_L3_MISS (BDW_L3_MISS_LOCAL| \ - HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \ - HSW_L3_MISS_REMOTE_HOP2P) - - -static __initconst const u64 hsw_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ - [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ - [ C(RESULT_MISS) ] = 0x0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ - [ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ - [ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */ - [ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, -}; - -static __initconst const u64 hsw_hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ| - HSW_LLC_ACCESS, - [ C(RESULT_MISS) ] = HSW_DEMAND_READ| - HSW_L3_MISS|HSW_ANY_SNOOP, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE| - HSW_LLC_ACCESS, - [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE| - HSW_L3_MISS|HSW_ANY_SNOOP, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ| - HSW_L3_MISS_LOCAL_DRAM| - HSW_SNOOP_DRAM, - [ C(RESULT_MISS) ] = HSW_DEMAND_READ| - HSW_L3_MISS_REMOTE| - HSW_SNOOP_DRAM, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE| - HSW_L3_MISS_LOCAL_DRAM| - HSW_SNOOP_DRAM, - [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE| - HSW_L3_MISS_REMOTE| - HSW_SNOOP_DRAM, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, -}; - -static __initconst const u64 westmere_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ - [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ - [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - /* - * Use RFO, not WRITEBACK, because a write miss would typically occur - * on RFO. - */ - [ C(OP_WRITE) ] = { - /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_PREFETCH) ] = { - /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ - [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ - [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - }, -}; - -/* - * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits; - * See IA32 SDM Vol 3B 30.6.1.3 - */ - -#define NHM_DMND_DATA_RD (1 << 0) -#define NHM_DMND_RFO (1 << 1) -#define NHM_DMND_IFETCH (1 << 2) -#define NHM_DMND_WB (1 << 3) -#define NHM_PF_DATA_RD (1 << 4) -#define NHM_PF_DATA_RFO (1 << 5) -#define NHM_PF_IFETCH (1 << 6) -#define NHM_OFFCORE_OTHER (1 << 7) -#define NHM_UNCORE_HIT (1 << 8) -#define NHM_OTHER_CORE_HIT_SNP (1 << 9) -#define NHM_OTHER_CORE_HITM (1 << 10) - /* reserved */ -#define NHM_REMOTE_CACHE_FWD (1 << 12) -#define NHM_REMOTE_DRAM (1 << 13) -#define NHM_LOCAL_DRAM (1 << 14) -#define NHM_NON_DRAM (1 << 15) - -#define NHM_LOCAL (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD) -#define NHM_REMOTE (NHM_REMOTE_DRAM) - -#define NHM_DMND_READ (NHM_DMND_DATA_RD) -#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB) -#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO) - -#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM) -#define NHM_L3_MISS (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD) -#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS) - -static __initconst const u64 nehalem_hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS, - [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS, - [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS, - [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE, - [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE, - [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE, - [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE, - }, - }, -}; - -static __initconst const u64 nehalem_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ - [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ - [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - /* - * Use RFO, not WRITEBACK, because a write miss would typically occur - * on RFO. - */ - [ C(OP_WRITE) ] = { - /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_PREFETCH) ] = { - /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - }, -}; - -static __initconst const u64 core2_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static __initconst const u64 atom_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static struct extra_reg intel_slm_extra_regs[] __read_mostly = -{ - /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1), - EVENT_EXTRA_END -}; - -#define SLM_DMND_READ SNB_DMND_DATA_RD -#define SLM_DMND_WRITE SNB_DMND_RFO -#define SLM_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) - -#define SLM_SNP_ANY (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM) -#define SLM_LLC_ACCESS SNB_RESP_ANY -#define SLM_LLC_MISS (SLM_SNP_ANY|SNB_NON_DRAM) - -static __initconst const u64 slm_hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS, - [ C(RESULT_MISS) ] = SLM_DMND_WRITE|SLM_LLC_MISS, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS, - [ C(RESULT_MISS) ] = SLM_DMND_PREFETCH|SLM_LLC_MISS, - }, - }, -}; - -static __initconst const u64 slm_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0x0104, /* LD_DCU_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */ - [ C(RESULT_MISS) ] = 0x0280, /* ICACGE.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_PREFETCH) ] = { - /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0x0804, /* LD_DTLB_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x40205, /* PAGE_WALKS.I_SIDE_WALKS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -#define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */ -#define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */ -#define KNL_MCDRAM_LOCAL BIT_ULL(21) -#define KNL_MCDRAM_FAR BIT_ULL(22) -#define KNL_DDR_LOCAL BIT_ULL(23) -#define KNL_DDR_FAR BIT_ULL(24) -#define KNL_DRAM_ANY (KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \ - KNL_DDR_LOCAL | KNL_DDR_FAR) -#define KNL_L2_READ SLM_DMND_READ -#define KNL_L2_WRITE SLM_DMND_WRITE -#define KNL_L2_PREFETCH SLM_DMND_PREFETCH -#define KNL_L2_ACCESS SLM_LLC_ACCESS -#define KNL_L2_MISS (KNL_OT_L2_HITE | KNL_OT_L2_HITF | \ - KNL_DRAM_ANY | SNB_SNP_ANY | \ - SNB_NON_DRAM) - -static __initconst const u64 knl_hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = { - [C(LL)] = { - [C(OP_READ)] = { - [C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS, - [C(RESULT_MISS)] = 0, - }, - [C(OP_WRITE)] = { - [C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS, - [C(RESULT_MISS)] = KNL_L2_WRITE | KNL_L2_MISS, - }, - [C(OP_PREFETCH)] = { - [C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS, - [C(RESULT_MISS)] = KNL_L2_PREFETCH | KNL_L2_MISS, - }, - }, -}; - -/* - * Use from PMIs where the LBRs are already disabled. - */ -static void __intel_pmu_disable_all(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - - if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) - intel_pmu_disable_bts(); - else - intel_bts_disable_local(); - - intel_pmu_pebs_disable_all(); -} - -static void intel_pmu_disable_all(void) -{ - __intel_pmu_disable_all(); - intel_pmu_lbr_disable_all(); -} - -static void __intel_pmu_enable_all(int added, bool pmi) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - intel_pmu_pebs_enable_all(); - intel_pmu_lbr_enable_all(pmi); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, - x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); - - if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { - struct perf_event *event = - cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; - - if (WARN_ON_ONCE(!event)) - return; - - intel_pmu_enable_bts(event->hw.config); - } else - intel_bts_enable_local(); -} - -static void intel_pmu_enable_all(int added) -{ - __intel_pmu_enable_all(added, false); -} - -/* - * Workaround for: - * Intel Errata AAK100 (model 26) - * Intel Errata AAP53 (model 30) - * Intel Errata BD53 (model 44) - * - * The official story: - * These chips need to be 'reset' when adding counters by programming the - * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either - * in sequence on the same PMC or on different PMCs. - * - * In practise it appears some of these events do in fact count, and - * we need to programm all 4 events. - */ -static void intel_pmu_nhm_workaround(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - static const unsigned long nhm_magic[4] = { - 0x4300B5, - 0x4300D2, - 0x4300B1, - 0x4300B1 - }; - struct perf_event *event; - int i; - - /* - * The Errata requires below steps: - * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL; - * 2) Configure 4 PERFEVTSELx with the magic events and clear - * the corresponding PMCx; - * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL; - * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL; - * 5) Clear 4 pairs of ERFEVTSELx and PMCx; - */ - - /* - * The real steps we choose are a little different from above. - * A) To reduce MSR operations, we don't run step 1) as they - * are already cleared before this function is called; - * B) Call x86_perf_event_update to save PMCx before configuring - * PERFEVTSELx with magic number; - * C) With step 5), we do clear only when the PERFEVTSELx is - * not used currently. - * D) Call x86_perf_event_set_period to restore PMCx; - */ - - /* We always operate 4 pairs of PERF Counters */ - for (i = 0; i < 4; i++) { - event = cpuc->events[i]; - if (event) - x86_perf_event_update(event); - } - - for (i = 0; i < 4; i++) { - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]); - wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0); - } - - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0); - - for (i = 0; i < 4; i++) { - event = cpuc->events[i]; - - if (event) { - x86_perf_event_set_period(event); - __x86_pmu_enable_event(&event->hw, - ARCH_PERFMON_EVENTSEL_ENABLE); - } else - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0); - } -} - -static void intel_pmu_nhm_enable_all(int added) -{ - if (added) - intel_pmu_nhm_workaround(); - intel_pmu_enable_all(added); -} - -static inline u64 intel_pmu_get_status(void) -{ - u64 status; - - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - - return status; -} - -static inline void intel_pmu_ack_status(u64 ack) -{ - wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); -} - -static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) -{ - int idx = hwc->idx - INTEL_PMC_IDX_FIXED; - u64 ctrl_val, mask; - - mask = 0xfULL << (idx * 4); - - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - wrmsrl(hwc->config_base, ctrl_val); -} - -static inline bool event_is_checkpointed(struct perf_event *event) -{ - return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0; -} - -static void intel_pmu_disable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { - intel_pmu_disable_bts(); - intel_pmu_drain_bts_buffer(); - return; - } - - cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx); - cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); - cpuc->intel_cp_status &= ~(1ull << hwc->idx); - - /* - * must disable before any actual event - * because any event may be combined with LBR - */ - if (needs_branch_stack(event)) - intel_pmu_lbr_disable(event); - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_disable_fixed(hwc); - return; - } - - x86_pmu_disable_event(event); - - if (unlikely(event->attr.precise_ip)) - intel_pmu_pebs_disable(event); -} - -static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) -{ - int idx = hwc->idx - INTEL_PMC_IDX_FIXED; - u64 ctrl_val, bits, mask; - - /* - * Enable IRQ generation (0x8), - * and enable ring-3 counting (0x2) and ring-0 counting (0x1) - * if requested: - */ - bits = 0x8ULL; - if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) - bits |= 0x2; - if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) - bits |= 0x1; - - /* - * ANY bit is supported in v3 and up - */ - if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY) - bits |= 0x4; - - bits <<= (idx * 4); - mask = 0xfULL << (idx * 4); - - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - ctrl_val |= bits; - wrmsrl(hwc->config_base, ctrl_val); -} - -static void intel_pmu_enable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { - if (!__this_cpu_read(cpu_hw_events.enabled)) - return; - - intel_pmu_enable_bts(hwc->config); - return; - } - /* - * must enabled before any actual event - * because any event may be combined with LBR - */ - if (needs_branch_stack(event)) - intel_pmu_lbr_enable(event); - - if (event->attr.exclude_host) - cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); - if (event->attr.exclude_guest) - cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx); - - if (unlikely(event_is_checkpointed(event))) - cpuc->intel_cp_status |= (1ull << hwc->idx); - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_enable_fixed(hwc); - return; - } - - if (unlikely(event->attr.precise_ip)) - intel_pmu_pebs_enable(event); - - __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); -} - -/* - * Save and restart an expired event. Called by NMI contexts, - * so it has to be careful about preempting normal event ops: - */ -int intel_pmu_save_and_restart(struct perf_event *event) -{ - x86_perf_event_update(event); - /* - * For a checkpointed counter always reset back to 0. This - * avoids a situation where the counter overflows, aborts the - * transaction and is then set back to shortly before the - * overflow, and overflows and aborts again. - */ - if (unlikely(event_is_checkpointed(event))) { - /* No race with NMIs because the counter should not be armed */ - wrmsrl(event->hw.event_base, 0); - local64_set(&event->hw.prev_count, 0); - } - return x86_perf_event_set_period(event); -} - -static void intel_pmu_reset(void) -{ - struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); - unsigned long flags; - int idx; - - if (!x86_pmu.num_counters) - return; - - local_irq_save(flags); - - pr_info("clearing PMU state on CPU#%d\n", smp_processor_id()); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); - wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); - } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) - wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); - - if (ds) - ds->bts_index = ds->bts_buffer_base; - - /* Ack all overflows and disable fixed counters */ - if (x86_pmu.version >= 2) { - intel_pmu_ack_status(intel_pmu_get_status()); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - } - - /* Reset LBRs and LBR freezing */ - if (x86_pmu.lbr_nr) { - update_debugctlmsr(get_debugctlmsr() & - ~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR)); - } - - local_irq_restore(flags); -} - -/* - * This handler is triggered by the local APIC, so the APIC IRQ handling - * rules apply: - */ -static int intel_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - int bit, loops; - u64 status; - int handled; - - cpuc = this_cpu_ptr(&cpu_hw_events); - - /* - * No known reason to not always do late ACK, - * but just in case do it opt-in. - */ - if (!x86_pmu.late_ack) - apic_write(APIC_LVTPC, APIC_DM_NMI); - __intel_pmu_disable_all(); - handled = intel_pmu_drain_bts_buffer(); - handled += intel_bts_interrupt(); - status = intel_pmu_get_status(); - if (!status) - goto done; - - loops = 0; -again: - intel_pmu_lbr_read(); - intel_pmu_ack_status(status); - if (++loops > 100) { - static bool warned = false; - if (!warned) { - WARN(1, "perfevents: irq loop stuck!\n"); - perf_event_print_debug(); - warned = true; - } - intel_pmu_reset(); - goto done; - } - - inc_irq_stat(apic_perf_irqs); - - - /* - * Ignore a range of extra bits in status that do not indicate - * overflow by themselves. - */ - status &= ~(GLOBAL_STATUS_COND_CHG | - GLOBAL_STATUS_ASIF | - GLOBAL_STATUS_LBRS_FROZEN); - if (!status) - goto done; - - /* - * PEBS overflow sets bit 62 in the global status register - */ - if (__test_and_clear_bit(62, (unsigned long *)&status)) { - handled++; - x86_pmu.drain_pebs(regs); - } - - /* - * Intel PT - */ - if (__test_and_clear_bit(55, (unsigned long *)&status)) { - handled++; - intel_pt_interrupt(); - } - - /* - * Checkpointed counters can lead to 'spurious' PMIs because the - * rollback caused by the PMI will have cleared the overflow status - * bit. Therefore always force probe these counters. - */ - status |= cpuc->intel_cp_status; - - for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { - struct perf_event *event = cpuc->events[bit]; - - handled++; - - if (!test_bit(bit, cpuc->active_mask)) - continue; - - if (!intel_pmu_save_and_restart(event)) - continue; - - perf_sample_data_init(&data, 0, event->hw.last_period); - - if (has_branch_stack(event)) - data.br_stack = &cpuc->lbr_stack; - - if (perf_event_overflow(event, &data, regs)) - x86_pmu_stop(event, 0); - } - - /* - * Repeat if there is more work to be done: - */ - status = intel_pmu_get_status(); - if (status) - goto again; - -done: - __intel_pmu_enable_all(0, true); - /* - * Only unmask the NMI after the overflow counters - * have been reset. This avoids spurious NMIs on - * Haswell CPUs. - */ - if (x86_pmu.late_ack) - apic_write(APIC_LVTPC, APIC_DM_NMI); - return handled; -} - -static struct event_constraint * -intel_bts_constraints(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - unsigned int hw_event, bts_event; - - if (event->attr.freq) - return NULL; - - hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; - bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); - - if (unlikely(hw_event == bts_event && hwc->sample_period == 1)) - return &bts_constraint; - - return NULL; -} - -static int intel_alt_er(int idx, u64 config) -{ - int alt_idx = idx; - - if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) - return idx; - - if (idx == EXTRA_REG_RSP_0) - alt_idx = EXTRA_REG_RSP_1; - - if (idx == EXTRA_REG_RSP_1) - alt_idx = EXTRA_REG_RSP_0; - - if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask) - return idx; - - return alt_idx; -} - -static void intel_fixup_er(struct perf_event *event, int idx) -{ - event->hw.extra_reg.idx = idx; - - if (idx == EXTRA_REG_RSP_0) { - event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event; - event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; - } else if (idx == EXTRA_REG_RSP_1) { - event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event; - event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; - } -} - -/* - * manage allocation of shared extra msr for certain events - * - * sharing can be: - * per-cpu: to be shared between the various events on a single PMU - * per-core: per-cpu + shared by HT threads - */ -static struct event_constraint * -__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event, - struct hw_perf_event_extra *reg) -{ - struct event_constraint *c = &emptyconstraint; - struct er_account *era; - unsigned long flags; - int idx = reg->idx; - - /* - * reg->alloc can be set due to existing state, so for fake cpuc we - * need to ignore this, otherwise we might fail to allocate proper fake - * state for this extra reg constraint. Also see the comment below. - */ - if (reg->alloc && !cpuc->is_fake) - return NULL; /* call x86_get_event_constraint() */ - -again: - era = &cpuc->shared_regs->regs[idx]; - /* - * we use spin_lock_irqsave() to avoid lockdep issues when - * passing a fake cpuc - */ - raw_spin_lock_irqsave(&era->lock, flags); - - if (!atomic_read(&era->ref) || era->config == reg->config) { - - /* - * If its a fake cpuc -- as per validate_{group,event}() we - * shouldn't touch event state and we can avoid doing so - * since both will only call get_event_constraints() once - * on each event, this avoids the need for reg->alloc. - * - * Not doing the ER fixup will only result in era->reg being - * wrong, but since we won't actually try and program hardware - * this isn't a problem either. - */ - if (!cpuc->is_fake) { - if (idx != reg->idx) - intel_fixup_er(event, idx); - - /* - * x86_schedule_events() can call get_event_constraints() - * multiple times on events in the case of incremental - * scheduling(). reg->alloc ensures we only do the ER - * allocation once. - */ - reg->alloc = 1; - } - - /* lock in msr value */ - era->config = reg->config; - era->reg = reg->reg; - - /* one more user */ - atomic_inc(&era->ref); - - /* - * need to call x86_get_event_constraint() - * to check if associated event has constraints - */ - c = NULL; - } else { - idx = intel_alt_er(idx, reg->config); - if (idx != reg->idx) { - raw_spin_unlock_irqrestore(&era->lock, flags); - goto again; - } - } - raw_spin_unlock_irqrestore(&era->lock, flags); - - return c; -} - -static void -__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc, - struct hw_perf_event_extra *reg) -{ - struct er_account *era; - - /* - * Only put constraint if extra reg was actually allocated. Also takes - * care of event which do not use an extra shared reg. - * - * Also, if this is a fake cpuc we shouldn't touch any event state - * (reg->alloc) and we don't care about leaving inconsistent cpuc state - * either since it'll be thrown out. - */ - if (!reg->alloc || cpuc->is_fake) - return; - - era = &cpuc->shared_regs->regs[reg->idx]; - - /* one fewer user */ - atomic_dec(&era->ref); - - /* allocate again next time */ - reg->alloc = 0; -} - -static struct event_constraint * -intel_shared_regs_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) -{ - struct event_constraint *c = NULL, *d; - struct hw_perf_event_extra *xreg, *breg; - - xreg = &event->hw.extra_reg; - if (xreg->idx != EXTRA_REG_NONE) { - c = __intel_shared_reg_get_constraints(cpuc, event, xreg); - if (c == &emptyconstraint) - return c; - } - breg = &event->hw.branch_reg; - if (breg->idx != EXTRA_REG_NONE) { - d = __intel_shared_reg_get_constraints(cpuc, event, breg); - if (d == &emptyconstraint) { - __intel_shared_reg_put_constraints(cpuc, xreg); - c = d; - } - } - return c; -} - -struct event_constraint * -x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, - struct perf_event *event) -{ - struct event_constraint *c; - - if (x86_pmu.event_constraints) { - for_each_event_constraint(c, x86_pmu.event_constraints) { - if ((event->hw.config & c->cmask) == c->code) { - event->hw.flags |= c->flags; - return c; - } - } - } - - return &unconstrained; -} - -static struct event_constraint * -__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, - struct perf_event *event) -{ - struct event_constraint *c; - - c = intel_bts_constraints(event); - if (c) - return c; - - c = intel_shared_regs_constraints(cpuc, event); - if (c) - return c; - - c = intel_pebs_constraints(event); - if (c) - return c; - - return x86_get_event_constraints(cpuc, idx, event); -} - -static void -intel_start_scheduling(struct cpu_hw_events *cpuc) -{ - struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct intel_excl_states *xl; - int tid = cpuc->excl_thread_id; - - /* - * nothing needed if in group validation mode - */ - if (cpuc->is_fake || !is_ht_workaround_enabled()) - return; - - /* - * no exclusion needed - */ - if (WARN_ON_ONCE(!excl_cntrs)) - return; - - xl = &excl_cntrs->states[tid]; - - xl->sched_started = true; - /* - * lock shared state until we are done scheduling - * in stop_event_scheduling() - * makes scheduling appear as a transaction - */ - raw_spin_lock(&excl_cntrs->lock); -} - -static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr) -{ - struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct event_constraint *c = cpuc->event_constraint[idx]; - struct intel_excl_states *xl; - int tid = cpuc->excl_thread_id; - - if (cpuc->is_fake || !is_ht_workaround_enabled()) - return; - - if (WARN_ON_ONCE(!excl_cntrs)) - return; - - if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) - return; - - xl = &excl_cntrs->states[tid]; - - lockdep_assert_held(&excl_cntrs->lock); - - if (c->flags & PERF_X86_EVENT_EXCL) - xl->state[cntr] = INTEL_EXCL_EXCLUSIVE; - else - xl->state[cntr] = INTEL_EXCL_SHARED; -} - -static void -intel_stop_scheduling(struct cpu_hw_events *cpuc) -{ - struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct intel_excl_states *xl; - int tid = cpuc->excl_thread_id; - - /* - * nothing needed if in group validation mode - */ - if (cpuc->is_fake || !is_ht_workaround_enabled()) - return; - /* - * no exclusion needed - */ - if (WARN_ON_ONCE(!excl_cntrs)) - return; - - xl = &excl_cntrs->states[tid]; - - xl->sched_started = false; - /* - * release shared state lock (acquired in intel_start_scheduling()) - */ - raw_spin_unlock(&excl_cntrs->lock); -} - -static struct event_constraint * -intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, - int idx, struct event_constraint *c) -{ - struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct intel_excl_states *xlo; - int tid = cpuc->excl_thread_id; - int is_excl, i; - - /* - * validating a group does not require - * enforcing cross-thread exclusion - */ - if (cpuc->is_fake || !is_ht_workaround_enabled()) - return c; - - /* - * no exclusion needed - */ - if (WARN_ON_ONCE(!excl_cntrs)) - return c; - - /* - * because we modify the constraint, we need - * to make a copy. Static constraints come - * from static const tables. - * - * only needed when constraint has not yet - * been cloned (marked dynamic) - */ - if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) { - struct event_constraint *cx; - - /* - * grab pre-allocated constraint entry - */ - cx = &cpuc->constraint_list[idx]; - - /* - * initialize dynamic constraint - * with static constraint - */ - *cx = *c; - - /* - * mark constraint as dynamic, so we - * can free it later on - */ - cx->flags |= PERF_X86_EVENT_DYNAMIC; - c = cx; - } - - /* - * From here on, the constraint is dynamic. - * Either it was just allocated above, or it - * was allocated during a earlier invocation - * of this function - */ - - /* - * state of sibling HT - */ - xlo = &excl_cntrs->states[tid ^ 1]; - - /* - * event requires exclusive counter access - * across HT threads - */ - is_excl = c->flags & PERF_X86_EVENT_EXCL; - if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) { - event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT; - if (!cpuc->n_excl++) - WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1); - } - - /* - * Modify static constraint with current dynamic - * state of thread - * - * EXCLUSIVE: sibling counter measuring exclusive event - * SHARED : sibling counter measuring non-exclusive event - * UNUSED : sibling counter unused - */ - for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) { - /* - * exclusive event in sibling counter - * our corresponding counter cannot be used - * regardless of our event - */ - if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE) - __clear_bit(i, c->idxmsk); - /* - * if measuring an exclusive event, sibling - * measuring non-exclusive, then counter cannot - * be used - */ - if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED) - __clear_bit(i, c->idxmsk); - } - - /* - * recompute actual bit weight for scheduling algorithm - */ - c->weight = hweight64(c->idxmsk64); - - /* - * if we return an empty mask, then switch - * back to static empty constraint to avoid - * the cost of freeing later on - */ - if (c->weight == 0) - c = &emptyconstraint; - - return c; -} - -static struct event_constraint * -intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, - struct perf_event *event) -{ - struct event_constraint *c1 = NULL; - struct event_constraint *c2; - - if (idx >= 0) /* fake does < 0 */ - c1 = cpuc->event_constraint[idx]; - - /* - * first time only - * - static constraint: no change across incremental scheduling calls - * - dynamic constraint: handled by intel_get_excl_constraints() - */ - c2 = __intel_get_event_constraints(cpuc, idx, event); - if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) { - bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX); - c1->weight = c2->weight; - c2 = c1; - } - - if (cpuc->excl_cntrs) - return intel_get_excl_constraints(cpuc, event, idx, c2); - - return c2; -} - -static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - int tid = cpuc->excl_thread_id; - struct intel_excl_states *xl; - - /* - * nothing needed if in group validation mode - */ - if (cpuc->is_fake) - return; - - if (WARN_ON_ONCE(!excl_cntrs)) - return; - - if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) { - hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT; - if (!--cpuc->n_excl) - WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0); - } - - /* - * If event was actually assigned, then mark the counter state as - * unused now. - */ - if (hwc->idx >= 0) { - xl = &excl_cntrs->states[tid]; - - /* - * put_constraint may be called from x86_schedule_events() - * which already has the lock held so here make locking - * conditional. - */ - if (!xl->sched_started) - raw_spin_lock(&excl_cntrs->lock); - - xl->state[hwc->idx] = INTEL_EXCL_UNUSED; - - if (!xl->sched_started) - raw_spin_unlock(&excl_cntrs->lock); - } -} - -static void -intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) -{ - struct hw_perf_event_extra *reg; - - reg = &event->hw.extra_reg; - if (reg->idx != EXTRA_REG_NONE) - __intel_shared_reg_put_constraints(cpuc, reg); - - reg = &event->hw.branch_reg; - if (reg->idx != EXTRA_REG_NONE) - __intel_shared_reg_put_constraints(cpuc, reg); -} - -static void intel_put_event_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) -{ - intel_put_shared_regs_event_constraints(cpuc, event); - - /* - * is PMU has exclusive counter restrictions, then - * all events are subject to and must call the - * put_excl_constraints() routine - */ - if (cpuc->excl_cntrs) - intel_put_excl_constraints(cpuc, event); -} - -static void intel_pebs_aliases_core2(struct perf_event *event) -{ - if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { - /* - * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P - * (0x003c) so that we can use it with PEBS. - * - * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't - * PEBS capable. However we can use INST_RETIRED.ANY_P - * (0x00c0), which is a PEBS capable event, to get the same - * count. - * - * INST_RETIRED.ANY_P counts the number of cycles that retires - * CNTMASK instructions. By setting CNTMASK to a value (16) - * larger than the maximum number of instructions that can be - * retired per cycle (4) and then inverting the condition, we - * count all cycles that retire 16 or less instructions, which - * is every cycle. - * - * Thereby we gain a PEBS capable cycle counter. - */ - u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16); - - alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); - event->hw.config = alt_config; - } -} - -static void intel_pebs_aliases_snb(struct perf_event *event) -{ - if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { - /* - * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P - * (0x003c) so that we can use it with PEBS. - * - * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't - * PEBS capable. However we can use UOPS_RETIRED.ALL - * (0x01c2), which is a PEBS capable event, to get the same - * count. - * - * UOPS_RETIRED.ALL counts the number of cycles that retires - * CNTMASK micro-ops. By setting CNTMASK to a value (16) - * larger than the maximum number of micro-ops that can be - * retired per cycle (4) and then inverting the condition, we - * count all cycles that retire 16 or less micro-ops, which - * is every cycle. - * - * Thereby we gain a PEBS capable cycle counter. - */ - u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16); - - alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); - event->hw.config = alt_config; - } -} - -static void intel_pebs_aliases_precdist(struct perf_event *event) -{ - if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { - /* - * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P - * (0x003c) so that we can use it with PEBS. - * - * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't - * PEBS capable. However we can use INST_RETIRED.PREC_DIST - * (0x01c0), which is a PEBS capable event, to get the same - * count. - * - * The PREC_DIST event has special support to minimize sample - * shadowing effects. One drawback is that it can be - * only programmed on counter 1, but that seems like an - * acceptable trade off. - */ - u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16); - - alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); - event->hw.config = alt_config; - } -} - -static void intel_pebs_aliases_ivb(struct perf_event *event) -{ - if (event->attr.precise_ip < 3) - return intel_pebs_aliases_snb(event); - return intel_pebs_aliases_precdist(event); -} - -static void intel_pebs_aliases_skl(struct perf_event *event) -{ - if (event->attr.precise_ip < 3) - return intel_pebs_aliases_core2(event); - return intel_pebs_aliases_precdist(event); -} - -static unsigned long intel_pmu_free_running_flags(struct perf_event *event) -{ - unsigned long flags = x86_pmu.free_running_flags; - - if (event->attr.use_clockid) - flags &= ~PERF_SAMPLE_TIME; - return flags; -} - -static int intel_pmu_hw_config(struct perf_event *event) -{ - int ret = x86_pmu_hw_config(event); - - if (ret) - return ret; - - if (event->attr.precise_ip) { - if (!event->attr.freq) { - event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; - if (!(event->attr.sample_type & - ~intel_pmu_free_running_flags(event))) - event->hw.flags |= PERF_X86_EVENT_FREERUNNING; - } - if (x86_pmu.pebs_aliases) - x86_pmu.pebs_aliases(event); - } - - if (needs_branch_stack(event)) { - ret = intel_pmu_setup_lbr_filter(event); - if (ret) - return ret; - - /* - * BTS is set up earlier in this path, so don't account twice - */ - if (!intel_pmu_has_bts(event)) { - /* disallow lbr if conflicting events are present */ - if (x86_add_exclusive(x86_lbr_exclusive_lbr)) - return -EBUSY; - - event->destroy = hw_perf_lbr_event_destroy; - } - } - - if (event->attr.type != PERF_TYPE_RAW) - return 0; - - if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY)) - return 0; - - if (x86_pmu.version < 3) - return -EINVAL; - - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return -EACCES; - - event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY; - - return 0; -} - -struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) -{ - if (x86_pmu.guest_get_msrs) - return x86_pmu.guest_get_msrs(nr); - *nr = 0; - return NULL; -} -EXPORT_SYMBOL_GPL(perf_guest_get_msrs); - -static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; - - arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; - arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask; - arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask; - /* - * If PMU counter has PEBS enabled it is not enough to disable counter - * on a guest entry since PEBS memory write can overshoot guest entry - * and corrupt guest memory. Disabling PEBS solves the problem. - */ - arr[1].msr = MSR_IA32_PEBS_ENABLE; - arr[1].host = cpuc->pebs_enabled; - arr[1].guest = 0; - - *nr = 2; - return arr; -} - -static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; - int idx; - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct perf_event *event = cpuc->events[idx]; - - arr[idx].msr = x86_pmu_config_addr(idx); - arr[idx].host = arr[idx].guest = 0; - - if (!test_bit(idx, cpuc->active_mask)) - continue; - - arr[idx].host = arr[idx].guest = - event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE; - - if (event->attr.exclude_host) - arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - else if (event->attr.exclude_guest) - arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - } - - *nr = x86_pmu.num_counters; - return arr; -} - -static void core_pmu_enable_event(struct perf_event *event) -{ - if (!event->attr.exclude_host) - x86_pmu_enable_event(event); -} - -static void core_pmu_enable_all(int added) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int idx; - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct hw_perf_event *hwc = &cpuc->events[idx]->hw; - - if (!test_bit(idx, cpuc->active_mask) || - cpuc->events[idx]->attr.exclude_host) - continue; - - __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); - } -} - -static int hsw_hw_config(struct perf_event *event) -{ - int ret = intel_pmu_hw_config(event); - - if (ret) - return ret; - if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE)) - return 0; - event->hw.config |= event->attr.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED); - - /* - * IN_TX/IN_TX-CP filters are not supported by the Haswell PMU with - * PEBS or in ANY thread mode. Since the results are non-sensical forbid - * this combination. - */ - if ((event->hw.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)) && - ((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) || - event->attr.precise_ip > 0)) - return -EOPNOTSUPP; - - if (event_is_checkpointed(event)) { - /* - * Sampling of checkpointed events can cause situations where - * the CPU constantly aborts because of a overflow, which is - * then checkpointed back and ignored. Forbid checkpointing - * for sampling. - * - * But still allow a long sampling period, so that perf stat - * from KVM works. - */ - if (event->attr.sample_period > 0 && - event->attr.sample_period < 0x7fffffff) - return -EOPNOTSUPP; - } - return 0; -} - -static struct event_constraint counter2_constraint = - EVENT_CONSTRAINT(0, 0x4, 0); - -static struct event_constraint * -hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx, - struct perf_event *event) -{ - struct event_constraint *c; - - c = intel_get_event_constraints(cpuc, idx, event); - - /* Handle special quirk on in_tx_checkpointed only in counter 2 */ - if (event->hw.config & HSW_IN_TX_CHECKPOINTED) { - if (c->idxmsk64 & (1U << 2)) - return &counter2_constraint; - return &emptyconstraint; - } - - return c; -} - -/* - * Broadwell: - * - * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared - * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine - * the two to enforce a minimum period of 128 (the smallest value that has bits - * 0-5 cleared and >= 100). - * - * Because of how the code in x86_perf_event_set_period() works, the truncation - * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period - * to make up for the 'lost' events due to carrying the 'error' in period_left. - * - * Therefore the effective (average) period matches the requested period, - * despite coarser hardware granularity. - */ -static unsigned bdw_limit_period(struct perf_event *event, unsigned left) -{ - if ((event->hw.config & INTEL_ARCH_EVENT_MASK) == - X86_CONFIG(.event=0xc0, .umask=0x01)) { - if (left < 128) - left = 128; - left &= ~0x3fu; - } - return left; -} - -PMU_FORMAT_ATTR(event, "config:0-7" ); -PMU_FORMAT_ATTR(umask, "config:8-15" ); -PMU_FORMAT_ATTR(edge, "config:18" ); -PMU_FORMAT_ATTR(pc, "config:19" ); -PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */ -PMU_FORMAT_ATTR(inv, "config:23" ); -PMU_FORMAT_ATTR(cmask, "config:24-31" ); -PMU_FORMAT_ATTR(in_tx, "config:32"); -PMU_FORMAT_ATTR(in_tx_cp, "config:33"); - -static struct attribute *intel_arch_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_pc.attr, - &format_attr_inv.attr, - &format_attr_cmask.attr, - NULL, -}; - -ssize_t intel_event_sysfs_show(char *page, u64 config) -{ - u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT); - - return x86_event_sysfs_show(page, config, event); -} - -struct intel_shared_regs *allocate_shared_regs(int cpu) -{ - struct intel_shared_regs *regs; - int i; - - regs = kzalloc_node(sizeof(struct intel_shared_regs), - GFP_KERNEL, cpu_to_node(cpu)); - if (regs) { - /* - * initialize the locks to keep lockdep happy - */ - for (i = 0; i < EXTRA_REG_MAX; i++) - raw_spin_lock_init(®s->regs[i].lock); - - regs->core_id = -1; - } - return regs; -} - -static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu) -{ - struct intel_excl_cntrs *c; - - c = kzalloc_node(sizeof(struct intel_excl_cntrs), - GFP_KERNEL, cpu_to_node(cpu)); - if (c) { - raw_spin_lock_init(&c->lock); - c->core_id = -1; - } - return c; -} - -static int intel_pmu_cpu_prepare(int cpu) -{ - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - - if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) { - cpuc->shared_regs = allocate_shared_regs(cpu); - if (!cpuc->shared_regs) - goto err; - } - - if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { - size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint); - - cpuc->constraint_list = kzalloc(sz, GFP_KERNEL); - if (!cpuc->constraint_list) - goto err_shared_regs; - - cpuc->excl_cntrs = allocate_excl_cntrs(cpu); - if (!cpuc->excl_cntrs) - goto err_constraint_list; - - cpuc->excl_thread_id = 0; - } - - return NOTIFY_OK; - -err_constraint_list: - kfree(cpuc->constraint_list); - cpuc->constraint_list = NULL; - -err_shared_regs: - kfree(cpuc->shared_regs); - cpuc->shared_regs = NULL; - -err: - return NOTIFY_BAD; -} - -static void intel_pmu_cpu_starting(int cpu) -{ - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - int core_id = topology_core_id(cpu); - int i; - - init_debug_store_on_cpu(cpu); - /* - * Deal with CPUs that don't clear their LBRs on power-up. - */ - intel_pmu_lbr_reset(); - - cpuc->lbr_sel = NULL; - - if (!cpuc->shared_regs) - return; - - if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) { - for_each_cpu(i, topology_sibling_cpumask(cpu)) { - struct intel_shared_regs *pc; - - pc = per_cpu(cpu_hw_events, i).shared_regs; - if (pc && pc->core_id == core_id) { - cpuc->kfree_on_online[0] = cpuc->shared_regs; - cpuc->shared_regs = pc; - break; - } - } - cpuc->shared_regs->core_id = core_id; - cpuc->shared_regs->refcnt++; - } - - if (x86_pmu.lbr_sel_map) - cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; - - if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { - for_each_cpu(i, topology_sibling_cpumask(cpu)) { - struct intel_excl_cntrs *c; - - c = per_cpu(cpu_hw_events, i).excl_cntrs; - if (c && c->core_id == core_id) { - cpuc->kfree_on_online[1] = cpuc->excl_cntrs; - cpuc->excl_cntrs = c; - cpuc->excl_thread_id = 1; - break; - } - } - cpuc->excl_cntrs->core_id = core_id; - cpuc->excl_cntrs->refcnt++; - } -} - -static void free_excl_cntrs(int cpu) -{ - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - struct intel_excl_cntrs *c; - - c = cpuc->excl_cntrs; - if (c) { - if (c->core_id == -1 || --c->refcnt == 0) - kfree(c); - cpuc->excl_cntrs = NULL; - kfree(cpuc->constraint_list); - cpuc->constraint_list = NULL; - } -} - -static void intel_pmu_cpu_dying(int cpu) -{ - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - struct intel_shared_regs *pc; - - pc = cpuc->shared_regs; - if (pc) { - if (pc->core_id == -1 || --pc->refcnt == 0) - kfree(pc); - cpuc->shared_regs = NULL; - } - - free_excl_cntrs(cpu); - - fini_debug_store_on_cpu(cpu); -} - -static void intel_pmu_sched_task(struct perf_event_context *ctx, - bool sched_in) -{ - if (x86_pmu.pebs_active) - intel_pmu_pebs_sched_task(ctx, sched_in); - if (x86_pmu.lbr_nr) - intel_pmu_lbr_sched_task(ctx, sched_in); -} - -PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); - -PMU_FORMAT_ATTR(ldlat, "config1:0-15"); - -PMU_FORMAT_ATTR(frontend, "config1:0-23"); - -static struct attribute *intel_arch3_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_pc.attr, - &format_attr_any.attr, - &format_attr_inv.attr, - &format_attr_cmask.attr, - &format_attr_in_tx.attr, - &format_attr_in_tx_cp.attr, - - &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */ - &format_attr_ldlat.attr, /* PEBS load latency */ - NULL, -}; - -static struct attribute *skl_format_attr[] = { - &format_attr_frontend.attr, - NULL, -}; - -static __initconst const struct x86_pmu core_pmu = { - .name = "core", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, - .enable_all = core_pmu_enable_all, - .enable = core_pmu_enable_event, - .disable = x86_pmu_disable_event, - .hw_config = x86_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = intel_pmu_event_map, - .max_events = ARRAY_SIZE(intel_perfmon_event_map), - .apic = 1, - .free_running_flags = PEBS_FREERUNNING_FLAGS, - - /* - * Intel PMCs cannot be accessed sanely above 32-bit width, - * so we install an artificial 1<<31 period regardless of - * the generic event period: - */ - .max_period = (1ULL<<31) - 1, - .get_event_constraints = intel_get_event_constraints, - .put_event_constraints = intel_put_event_constraints, - .event_constraints = intel_core_event_constraints, - .guest_get_msrs = core_guest_get_msrs, - .format_attrs = intel_arch_formats_attr, - .events_sysfs_show = intel_event_sysfs_show, - - /* - * Virtual (or funny metal) CPU can define x86_pmu.extra_regs - * together with PMU version 1 and thus be using core_pmu with - * shared_regs. We need following callbacks here to allocate - * it properly. - */ - .cpu_prepare = intel_pmu_cpu_prepare, - .cpu_starting = intel_pmu_cpu_starting, - .cpu_dying = intel_pmu_cpu_dying, -}; - -static __initconst const struct x86_pmu intel_pmu = { - .name = "Intel", - .handle_irq = intel_pmu_handle_irq, - .disable_all = intel_pmu_disable_all, - .enable_all = intel_pmu_enable_all, - .enable = intel_pmu_enable_event, - .disable = intel_pmu_disable_event, - .hw_config = intel_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = intel_pmu_event_map, - .max_events = ARRAY_SIZE(intel_perfmon_event_map), - .apic = 1, - .free_running_flags = PEBS_FREERUNNING_FLAGS, - /* - * Intel PMCs cannot be accessed sanely above 32 bit width, - * so we install an artificial 1<<31 period regardless of - * the generic event period: - */ - .max_period = (1ULL << 31) - 1, - .get_event_constraints = intel_get_event_constraints, - .put_event_constraints = intel_put_event_constraints, - .pebs_aliases = intel_pebs_aliases_core2, - - .format_attrs = intel_arch3_formats_attr, - .events_sysfs_show = intel_event_sysfs_show, - - .cpu_prepare = intel_pmu_cpu_prepare, - .cpu_starting = intel_pmu_cpu_starting, - .cpu_dying = intel_pmu_cpu_dying, - .guest_get_msrs = intel_guest_get_msrs, - .sched_task = intel_pmu_sched_task, -}; - -static __init void intel_clovertown_quirk(void) -{ - /* - * PEBS is unreliable due to: - * - * AJ67 - PEBS may experience CPL leaks - * AJ68 - PEBS PMI may be delayed by one event - * AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12] - * AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS - * - * AJ67 could be worked around by restricting the OS/USR flags. - * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI. - * - * AJ106 could possibly be worked around by not allowing LBR - * usage from PEBS, including the fixup. - * AJ68 could possibly be worked around by always programming - * a pebs_event_reset[0] value and coping with the lost events. - * - * But taken together it might just make sense to not enable PEBS on - * these chips. - */ - pr_warn("PEBS disabled due to CPU errata\n"); - x86_pmu.pebs = 0; - x86_pmu.pebs_constraints = NULL; -} - -static int intel_snb_pebs_broken(int cpu) -{ - u32 rev = UINT_MAX; /* default to broken for unknown models */ - - switch (cpu_data(cpu).x86_model) { - case 42: /* SNB */ - rev = 0x28; - break; - - case 45: /* SNB-EP */ - switch (cpu_data(cpu).x86_mask) { - case 6: rev = 0x618; break; - case 7: rev = 0x70c; break; - } - } - - return (cpu_data(cpu).microcode < rev); -} - -static void intel_snb_check_microcode(void) -{ - int pebs_broken = 0; - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) { - if ((pebs_broken = intel_snb_pebs_broken(cpu))) - break; - } - put_online_cpus(); - - if (pebs_broken == x86_pmu.pebs_broken) - return; - - /* - * Serialized by the microcode lock.. - */ - if (x86_pmu.pebs_broken) { - pr_info("PEBS enabled due to microcode update\n"); - x86_pmu.pebs_broken = 0; - } else { - pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n"); - x86_pmu.pebs_broken = 1; - } -} - -/* - * Under certain circumstances, access certain MSR may cause #GP. - * The function tests if the input MSR can be safely accessed. - */ -static bool check_msr(unsigned long msr, u64 mask) -{ - u64 val_old, val_new, val_tmp; - - /* - * Read the current value, change it and read it back to see if it - * matches, this is needed to detect certain hardware emulators - * (qemu/kvm) that don't trap on the MSR access and always return 0s. - */ - if (rdmsrl_safe(msr, &val_old)) - return false; - - /* - * Only change the bits which can be updated by wrmsrl. - */ - val_tmp = val_old ^ mask; - if (wrmsrl_safe(msr, val_tmp) || - rdmsrl_safe(msr, &val_new)) - return false; - - if (val_new != val_tmp) - return false; - - /* Here it's sure that the MSR can be safely accessed. - * Restore the old value and return. - */ - wrmsrl(msr, val_old); - - return true; -} - -static __init void intel_sandybridge_quirk(void) -{ - x86_pmu.check_microcode = intel_snb_check_microcode; - intel_snb_check_microcode(); -} - -static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { - { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" }, - { PERF_COUNT_HW_INSTRUCTIONS, "instructions" }, - { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" }, - { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" }, - { PERF_COUNT_HW_CACHE_MISSES, "cache misses" }, - { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" }, - { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" }, -}; - -static __init void intel_arch_events_quirk(void) -{ - int bit; - - /* disable event that reported as not presend by cpuid */ - for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { - intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; - pr_warn("CPUID marked event: \'%s\' unavailable\n", - intel_arch_events_map[bit].name); - } -} - -static __init void intel_nehalem_quirk(void) -{ - union cpuid10_ebx ebx; - - ebx.full = x86_pmu.events_maskl; - if (ebx.split.no_branch_misses_retired) { - /* - * Erratum AAJ80 detected, we work it around by using - * the BR_MISP_EXEC.ANY event. This will over-count - * branch-misses, but it's still much better than the - * architectural event which is often completely bogus: - */ - intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; - ebx.split.no_branch_misses_retired = 0; - x86_pmu.events_maskl = ebx.full; - pr_info("CPU erratum AAJ80 worked around\n"); - } -} - -/* - * enable software workaround for errata: - * SNB: BJ122 - * IVB: BV98 - * HSW: HSD29 - * - * Only needed when HT is enabled. However detecting - * if HT is enabled is difficult (model specific). So instead, - * we enable the workaround in the early boot, and verify if - * it is needed in a later initcall phase once we have valid - * topology information to check if HT is actually enabled - */ -static __init void intel_ht_bug(void) -{ - x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED; - - x86_pmu.start_scheduling = intel_start_scheduling; - x86_pmu.commit_scheduling = intel_commit_scheduling; - x86_pmu.stop_scheduling = intel_stop_scheduling; -} - -EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3"); -EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82") - -/* Haswell special events */ -EVENT_ATTR_STR(tx-start, tx_start, "event=0xc9,umask=0x1"); -EVENT_ATTR_STR(tx-commit, tx_commit, "event=0xc9,umask=0x2"); -EVENT_ATTR_STR(tx-abort, tx_abort, "event=0xc9,umask=0x4"); -EVENT_ATTR_STR(tx-capacity, tx_capacity, "event=0x54,umask=0x2"); -EVENT_ATTR_STR(tx-conflict, tx_conflict, "event=0x54,umask=0x1"); -EVENT_ATTR_STR(el-start, el_start, "event=0xc8,umask=0x1"); -EVENT_ATTR_STR(el-commit, el_commit, "event=0xc8,umask=0x2"); -EVENT_ATTR_STR(el-abort, el_abort, "event=0xc8,umask=0x4"); -EVENT_ATTR_STR(el-capacity, el_capacity, "event=0x54,umask=0x2"); -EVENT_ATTR_STR(el-conflict, el_conflict, "event=0x54,umask=0x1"); -EVENT_ATTR_STR(cycles-t, cycles_t, "event=0x3c,in_tx=1"); -EVENT_ATTR_STR(cycles-ct, cycles_ct, "event=0x3c,in_tx=1,in_tx_cp=1"); - -static struct attribute *hsw_events_attrs[] = { - EVENT_PTR(tx_start), - EVENT_PTR(tx_commit), - EVENT_PTR(tx_abort), - EVENT_PTR(tx_capacity), - EVENT_PTR(tx_conflict), - EVENT_PTR(el_start), - EVENT_PTR(el_commit), - EVENT_PTR(el_abort), - EVENT_PTR(el_capacity), - EVENT_PTR(el_conflict), - EVENT_PTR(cycles_t), - EVENT_PTR(cycles_ct), - EVENT_PTR(mem_ld_hsw), - EVENT_PTR(mem_st_hsw), - NULL -}; - -__init int intel_pmu_init(void) -{ - union cpuid10_edx edx; - union cpuid10_eax eax; - union cpuid10_ebx ebx; - struct event_constraint *c; - unsigned int unused; - struct extra_reg *er; - int version, i; - - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - switch (boot_cpu_data.x86) { - case 0x6: - return p6_pmu_init(); - case 0xb: - return knc_pmu_init(); - case 0xf: - return p4_pmu_init(); - } - return -ENODEV; - } - - /* - * Check whether the Architectural PerfMon supports - * Branch Misses Retired hw_event or not. - */ - cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); - if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT) - return -ENODEV; - - version = eax.split.version_id; - if (version < 2) - x86_pmu = core_pmu; - else - x86_pmu = intel_pmu; - - x86_pmu.version = version; - x86_pmu.num_counters = eax.split.num_counters; - x86_pmu.cntval_bits = eax.split.bit_width; - x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; - - x86_pmu.events_maskl = ebx.full; - x86_pmu.events_mask_len = eax.split.mask_length; - - x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); - - /* - * Quirk: v2 perfmon does not report fixed-purpose events, so - * assume at least 3 events: - */ - if (version > 1) - x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); - - if (boot_cpu_has(X86_FEATURE_PDCM)) { - u64 capabilities; - - rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities); - x86_pmu.intel_cap.capabilities = capabilities; - } - - intel_ds_init(); - - x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ - - /* - * Install the hw-cache-events table: - */ - switch (boot_cpu_data.x86_model) { - case 14: /* 65nm Core "Yonah" */ - pr_cont("Core events, "); - break; - - case 15: /* 65nm Core2 "Merom" */ - x86_add_quirk(intel_clovertown_quirk); - case 22: /* 65nm Core2 "Merom-L" */ - case 23: /* 45nm Core2 "Penryn" */ - case 29: /* 45nm Core2 "Dunnington (MP) */ - memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - intel_pmu_lbr_init_core(); - - x86_pmu.event_constraints = intel_core2_event_constraints; - x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints; - pr_cont("Core2 events, "); - break; - - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ - memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, - sizeof(hw_cache_extra_regs)); - - intel_pmu_lbr_init_nhm(); - - x86_pmu.event_constraints = intel_nehalem_event_constraints; - x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; - x86_pmu.enable_all = intel_pmu_nhm_enable_all; - x86_pmu.extra_regs = intel_nehalem_extra_regs; - - x86_pmu.cpu_events = nhm_events_attrs; - - /* UOPS_ISSUED.STALLED_CYCLES */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = - X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); - /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = - X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); - - x86_add_quirk(intel_nehalem_quirk); - - pr_cont("Nehalem events, "); - break; - - case 28: /* 45nm Atom "Pineview" */ - case 38: /* 45nm Atom "Lincroft" */ - case 39: /* 32nm Atom "Penwell" */ - case 53: /* 32nm Atom "Cloverview" */ - case 54: /* 32nm Atom "Cedarview" */ - memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - intel_pmu_lbr_init_atom(); - - x86_pmu.event_constraints = intel_gen_event_constraints; - x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints; - x86_pmu.pebs_aliases = intel_pebs_aliases_core2; - pr_cont("Atom events, "); - break; - - case 55: /* 22nm Atom "Silvermont" */ - case 76: /* 14nm Atom "Airmont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ - memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, - sizeof(hw_cache_extra_regs)); - - intel_pmu_lbr_init_atom(); - - x86_pmu.event_constraints = intel_slm_event_constraints; - x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; - x86_pmu.extra_regs = intel_slm_extra_regs; - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - pr_cont("Silvermont events, "); - break; - - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ - memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, - sizeof(hw_cache_extra_regs)); - - intel_pmu_lbr_init_nhm(); - - x86_pmu.event_constraints = intel_westmere_event_constraints; - x86_pmu.enable_all = intel_pmu_nhm_enable_all; - x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; - x86_pmu.extra_regs = intel_westmere_extra_regs; - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - - x86_pmu.cpu_events = nhm_events_attrs; - - /* UOPS_ISSUED.STALLED_CYCLES */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = - X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); - /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = - X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); - - pr_cont("Westmere events, "); - break; - - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm SandyBridge-E/EN/EP */ - x86_add_quirk(intel_sandybridge_quirk); - x86_add_quirk(intel_ht_bug); - memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, - sizeof(hw_cache_extra_regs)); - - intel_pmu_lbr_init_snb(); - - x86_pmu.event_constraints = intel_snb_event_constraints; - x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; - x86_pmu.pebs_aliases = intel_pebs_aliases_snb; - if (boot_cpu_data.x86_model == 45) - x86_pmu.extra_regs = intel_snbep_extra_regs; - else - x86_pmu.extra_regs = intel_snb_extra_regs; - - - /* all extra regs are per-cpu when HT is on */ - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - - x86_pmu.cpu_events = snb_events_attrs; - - /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = - X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); - /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = - X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1); - - pr_cont("SandyBridge events, "); - break; - - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ - x86_add_quirk(intel_ht_bug); - memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - /* dTLB-load-misses on IVB is different than SNB */ - hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */ - - memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, - sizeof(hw_cache_extra_regs)); - - intel_pmu_lbr_init_snb(); - - x86_pmu.event_constraints = intel_ivb_event_constraints; - x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints; - x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; - x86_pmu.pebs_prec_dist = true; - if (boot_cpu_data.x86_model == 62) - x86_pmu.extra_regs = intel_snbep_extra_regs; - else - x86_pmu.extra_regs = intel_snb_extra_regs; - /* all extra regs are per-cpu when HT is on */ - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - - x86_pmu.cpu_events = snb_events_attrs; - - /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = - X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); - - pr_cont("IvyBridge events, "); - break; - - - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 69: /* 22nm Haswell ULT */ - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ - x86_add_quirk(intel_ht_bug); - x86_pmu.late_ack = true; - memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); - - intel_pmu_lbr_init_hsw(); - - x86_pmu.event_constraints = intel_hsw_event_constraints; - x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; - x86_pmu.extra_regs = intel_snbep_extra_regs; - x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; - x86_pmu.pebs_prec_dist = true; - /* all extra regs are per-cpu when HT is on */ - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - - x86_pmu.hw_config = hsw_hw_config; - x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.cpu_events = hsw_events_attrs; - x86_pmu.lbr_double_abort = true; - pr_cont("Haswell events, "); - break; - - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 79: /* 14nm Broadwell Server */ - x86_pmu.late_ack = true; - memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); - - /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */ - hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ | - BDW_L3_MISS|HSW_SNOOP_DRAM; - hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS| - HSW_SNOOP_DRAM; - hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ| - BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM; - hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE| - BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM; - - intel_pmu_lbr_init_hsw(); - - x86_pmu.event_constraints = intel_bdw_event_constraints; - x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; - x86_pmu.extra_regs = intel_snbep_extra_regs; - x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; - x86_pmu.pebs_prec_dist = true; - /* all extra regs are per-cpu when HT is on */ - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - - x86_pmu.hw_config = hsw_hw_config; - x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.cpu_events = hsw_events_attrs; - x86_pmu.limit_period = bdw_limit_period; - pr_cont("Broadwell events, "); - break; - - case 87: /* Knights Landing Xeon Phi */ - memcpy(hw_cache_event_ids, - slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, - knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); - intel_pmu_lbr_init_knl(); - - x86_pmu.event_constraints = intel_slm_event_constraints; - x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; - x86_pmu.extra_regs = intel_knl_extra_regs; - - /* all extra regs are per-cpu when HT is on */ - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - - pr_cont("Knights Landing events, "); - break; - - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ - x86_pmu.late_ack = true; - memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); - intel_pmu_lbr_init_skl(); - - x86_pmu.event_constraints = intel_skl_event_constraints; - x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints; - x86_pmu.extra_regs = intel_skl_extra_regs; - x86_pmu.pebs_aliases = intel_pebs_aliases_skl; - x86_pmu.pebs_prec_dist = true; - /* all extra regs are per-cpu when HT is on */ - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - - x86_pmu.hw_config = hsw_hw_config; - x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr, - skl_format_attr); - WARN_ON(!x86_pmu.format_attrs); - x86_pmu.cpu_events = hsw_events_attrs; - pr_cont("Skylake events, "); - break; - - default: - switch (x86_pmu.version) { - case 1: - x86_pmu.event_constraints = intel_v1_event_constraints; - pr_cont("generic architected perfmon v1, "); - break; - default: - /* - * default constraints for v2 and up - */ - x86_pmu.event_constraints = intel_gen_event_constraints; - pr_cont("generic architected perfmon, "); - break; - } - } - - if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", - x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); - x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; - } - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; - - if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", - x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED); - x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; - } - - x86_pmu.intel_ctrl |= - ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; - - if (x86_pmu.event_constraints) { - /* - * event on fixed counter2 (REF_CYCLES) only works on this - * counter, so do not extend mask to generic counters - */ - for_each_event_constraint(c, x86_pmu.event_constraints) { - if (c->cmask == FIXED_EVENT_FLAGS - && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) { - c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; - } - c->idxmsk64 &= - ~(~0UL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed)); - c->weight = hweight64(c->idxmsk64); - } - } - - /* - * Access LBR MSR may cause #GP under certain circumstances. - * E.g. KVM doesn't support LBR MSR - * Check all LBT MSR here. - * Disable LBR access if any LBR MSRs can not be accessed. - */ - if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL)) - x86_pmu.lbr_nr = 0; - for (i = 0; i < x86_pmu.lbr_nr; i++) { - if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) && - check_msr(x86_pmu.lbr_to + i, 0xffffUL))) - x86_pmu.lbr_nr = 0; - } - - /* - * Access extra MSR may cause #GP under certain circumstances. - * E.g. KVM doesn't support offcore event - * Check all extra_regs here. - */ - if (x86_pmu.extra_regs) { - for (er = x86_pmu.extra_regs; er->msr; er++) { - er->extra_msr_access = check_msr(er->msr, 0x11UL); - /* Disable LBR select mapping */ - if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access) - x86_pmu.lbr_sel_map = NULL; - } - } - - /* Support full width counters using alternative MSR range */ - if (x86_pmu.intel_cap.full_width_write) { - x86_pmu.max_period = x86_pmu.cntval_mask; - x86_pmu.perfctr = MSR_IA32_PMC0; - pr_cont("full-width counters, "); - } - - return 0; -} - -/* - * HT bug: phase 2 init - * Called once we have valid topology information to check - * whether or not HT is enabled - * If HT is off, then we disable the workaround - */ -static __init int fixup_ht_bug(void) -{ - int cpu = smp_processor_id(); - int w, c; - /* - * problem not present on this CPU model, nothing to do - */ - if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED)) - return 0; - - w = cpumask_weight(topology_sibling_cpumask(cpu)); - if (w > 1) { - pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n"); - return 0; - } - - if (lockup_detector_suspend() != 0) { - pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n"); - return 0; - } - - x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); - - x86_pmu.start_scheduling = NULL; - x86_pmu.commit_scheduling = NULL; - x86_pmu.stop_scheduling = NULL; - - lockup_detector_resume(); - - get_online_cpus(); - - for_each_online_cpu(c) { - free_excl_cntrs(c); - } - - put_online_cpus(); - pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n"); - return 0; -} -subsys_initcall(fixup_ht_bug) diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c deleted file mode 100644 index 2cad71d1b14c..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_bts.c +++ /dev/null @@ -1,544 +0,0 @@ -/* - * BTS PMU driver for perf - * Copyright (c) 2013-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - */ - -#undef DEBUG - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/bitops.h> -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/debugfs.h> -#include <linux/device.h> -#include <linux/coredump.h> - -#include <asm-generic/sizes.h> -#include <asm/perf_event.h> - -#include "perf_event.h" - -struct bts_ctx { - struct perf_output_handle handle; - struct debug_store ds_back; - int started; -}; - -static DEFINE_PER_CPU(struct bts_ctx, bts_ctx); - -#define BTS_RECORD_SIZE 24 -#define BTS_SAFETY_MARGIN 4080 - -struct bts_phys { - struct page *page; - unsigned long size; - unsigned long offset; - unsigned long displacement; -}; - -struct bts_buffer { - size_t real_size; /* multiple of BTS_RECORD_SIZE */ - unsigned int nr_pages; - unsigned int nr_bufs; - unsigned int cur_buf; - bool snapshot; - local_t data_size; - local_t lost; - local_t head; - unsigned long end; - void **data_pages; - struct bts_phys buf[0]; -}; - -struct pmu bts_pmu; - -static size_t buf_size(struct page *page) -{ - return 1 << (PAGE_SHIFT + page_private(page)); -} - -static void * -bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite) -{ - struct bts_buffer *buf; - struct page *page; - int node = (cpu == -1) ? cpu : cpu_to_node(cpu); - unsigned long offset; - size_t size = nr_pages << PAGE_SHIFT; - int pg, nbuf, pad; - - /* count all the high order buffers */ - for (pg = 0, nbuf = 0; pg < nr_pages;) { - page = virt_to_page(pages[pg]); - if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1)) - return NULL; - pg += 1 << page_private(page); - nbuf++; - } - - /* - * to avoid interrupts in overwrite mode, only allow one physical - */ - if (overwrite && nbuf > 1) - return NULL; - - buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node); - if (!buf) - return NULL; - - buf->nr_pages = nr_pages; - buf->nr_bufs = nbuf; - buf->snapshot = overwrite; - buf->data_pages = pages; - buf->real_size = size - size % BTS_RECORD_SIZE; - - for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) { - unsigned int __nr_pages; - - page = virt_to_page(pages[pg]); - __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1; - buf->buf[nbuf].page = page; - buf->buf[nbuf].offset = offset; - buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0); - buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement; - pad = buf->buf[nbuf].size % BTS_RECORD_SIZE; - buf->buf[nbuf].size -= pad; - - pg += __nr_pages; - offset += __nr_pages << PAGE_SHIFT; - } - - return buf; -} - -static void bts_buffer_free_aux(void *data) -{ - kfree(data); -} - -static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx) -{ - return buf->buf[idx].offset + buf->buf[idx].displacement; -} - -static void -bts_config_buffer(struct bts_buffer *buf) -{ - int cpu = raw_smp_processor_id(); - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - struct bts_phys *phys = &buf->buf[buf->cur_buf]; - unsigned long index, thresh = 0, end = phys->size; - struct page *page = phys->page; - - index = local_read(&buf->head); - - if (!buf->snapshot) { - if (buf->end < phys->offset + buf_size(page)) - end = buf->end - phys->offset - phys->displacement; - - index -= phys->offset + phys->displacement; - - if (end - index > BTS_SAFETY_MARGIN) - thresh = end - BTS_SAFETY_MARGIN; - else if (end - index > BTS_RECORD_SIZE) - thresh = end - BTS_RECORD_SIZE; - else - thresh = end; - } - - ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement; - ds->bts_index = ds->bts_buffer_base + index; - ds->bts_absolute_maximum = ds->bts_buffer_base + end; - ds->bts_interrupt_threshold = !buf->snapshot - ? ds->bts_buffer_base + thresh - : ds->bts_absolute_maximum + BTS_RECORD_SIZE; -} - -static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head) -{ - unsigned long index = head - phys->offset; - - memset(page_address(phys->page) + index, 0, phys->size - index); -} - -static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts) -{ - if (buf->snapshot) - return false; - - if (local_read(&buf->data_size) >= bts->handle.size || - bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE) - return true; - - return false; -} - -static void bts_update(struct bts_ctx *bts) -{ - int cpu = raw_smp_processor_id(); - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - struct bts_buffer *buf = perf_get_aux(&bts->handle); - unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head; - - if (!buf) - return; - - head = index + bts_buffer_offset(buf, buf->cur_buf); - old = local_xchg(&buf->head, head); - - if (!buf->snapshot) { - if (old == head) - return; - - if (ds->bts_index >= ds->bts_absolute_maximum) - local_inc(&buf->lost); - - /* - * old and head are always in the same physical buffer, so we - * can subtract them to get the data size. - */ - local_add(head - old, &buf->data_size); - } else { - local_set(&buf->data_size, head); - } -} - -static void __bts_event_start(struct perf_event *event) -{ - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - struct bts_buffer *buf = perf_get_aux(&bts->handle); - u64 config = 0; - - if (!buf || bts_buffer_is_full(buf, bts)) - return; - - event->hw.itrace_started = 1; - event->hw.state = 0; - - if (!buf->snapshot) - config |= ARCH_PERFMON_EVENTSEL_INT; - if (!event->attr.exclude_kernel) - config |= ARCH_PERFMON_EVENTSEL_OS; - if (!event->attr.exclude_user) - config |= ARCH_PERFMON_EVENTSEL_USR; - - bts_config_buffer(buf); - - /* - * local barrier to make sure that ds configuration made it - * before we enable BTS - */ - wmb(); - - intel_pmu_enable_bts(config); -} - -static void bts_event_start(struct perf_event *event, int flags) -{ - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - - __bts_event_start(event); - - /* PMI handler: this counter is running and likely generating PMIs */ - ACCESS_ONCE(bts->started) = 1; -} - -static void __bts_event_stop(struct perf_event *event) -{ - /* - * No extra synchronization is mandated by the documentation to have - * BTS data stores globally visible. - */ - intel_pmu_disable_bts(); - - if (event->hw.state & PERF_HES_STOPPED) - return; - - ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED; -} - -static void bts_event_stop(struct perf_event *event, int flags) -{ - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - - /* PMI handler: don't restart this counter */ - ACCESS_ONCE(bts->started) = 0; - - __bts_event_stop(event); - - if (flags & PERF_EF_UPDATE) - bts_update(bts); -} - -void intel_bts_enable_local(void) -{ - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - - if (bts->handle.event && bts->started) - __bts_event_start(bts->handle.event); -} - -void intel_bts_disable_local(void) -{ - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - - if (bts->handle.event) - __bts_event_stop(bts->handle.event); -} - -static int -bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) -{ - unsigned long head, space, next_space, pad, gap, skip, wakeup; - unsigned int next_buf; - struct bts_phys *phys, *next_phys; - int ret; - - if (buf->snapshot) - return 0; - - head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1); - if (WARN_ON_ONCE(head != local_read(&buf->head))) - return -EINVAL; - - phys = &buf->buf[buf->cur_buf]; - space = phys->offset + phys->displacement + phys->size - head; - pad = space; - if (space > handle->size) { - space = handle->size; - space -= space % BTS_RECORD_SIZE; - } - if (space <= BTS_SAFETY_MARGIN) { - /* See if next phys buffer has more space */ - next_buf = buf->cur_buf + 1; - if (next_buf >= buf->nr_bufs) - next_buf = 0; - next_phys = &buf->buf[next_buf]; - gap = buf_size(phys->page) - phys->displacement - phys->size + - next_phys->displacement; - skip = pad + gap; - if (handle->size >= skip) { - next_space = next_phys->size; - if (next_space + skip > handle->size) { - next_space = handle->size - skip; - next_space -= next_space % BTS_RECORD_SIZE; - } - if (next_space > space || !space) { - if (pad) - bts_buffer_pad_out(phys, head); - ret = perf_aux_output_skip(handle, skip); - if (ret) - return ret; - /* Advance to next phys buffer */ - phys = next_phys; - space = next_space; - head = phys->offset + phys->displacement; - /* - * After this, cur_buf and head won't match ds - * anymore, so we must not be racing with - * bts_update(). - */ - buf->cur_buf = next_buf; - local_set(&buf->head, head); - } - } - } - - /* Don't go far beyond wakeup watermark */ - wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup - - handle->head; - if (space > wakeup) { - space = wakeup; - space -= space % BTS_RECORD_SIZE; - } - - buf->end = head + space; - - /* - * If we have no space, the lost notification would have been sent when - * we hit absolute_maximum - see bts_update() - */ - if (!space) - return -ENOSPC; - - return 0; -} - -int intel_bts_interrupt(void) -{ - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - struct perf_event *event = bts->handle.event; - struct bts_buffer *buf; - s64 old_head; - int err; - - if (!event || !bts->started) - return 0; - - buf = perf_get_aux(&bts->handle); - /* - * Skip snapshot counters: they don't use the interrupt, but - * there's no other way of telling, because the pointer will - * keep moving - */ - if (!buf || buf->snapshot) - return 0; - - old_head = local_read(&buf->head); - bts_update(bts); - - /* no new data */ - if (old_head == local_read(&buf->head)) - return 0; - - perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), - !!local_xchg(&buf->lost, 0)); - - buf = perf_aux_output_begin(&bts->handle, event); - if (!buf) - return 1; - - err = bts_buffer_reset(buf, &bts->handle); - if (err) - perf_aux_output_end(&bts->handle, 0, false); - - return 1; -} - -static void bts_event_del(struct perf_event *event, int mode) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - struct bts_buffer *buf = perf_get_aux(&bts->handle); - - bts_event_stop(event, PERF_EF_UPDATE); - - if (buf) { - if (buf->snapshot) - bts->handle.head = - local_xchg(&buf->data_size, - buf->nr_pages << PAGE_SHIFT); - perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), - !!local_xchg(&buf->lost, 0)); - } - - cpuc->ds->bts_index = bts->ds_back.bts_buffer_base; - cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base; - cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum; - cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold; -} - -static int bts_event_add(struct perf_event *event, int mode) -{ - struct bts_buffer *buf; - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - int ret = -EBUSY; - - event->hw.state = PERF_HES_STOPPED; - - if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) - return -EBUSY; - - if (bts->handle.event) - return -EBUSY; - - buf = perf_aux_output_begin(&bts->handle, event); - if (!buf) - return -EINVAL; - - ret = bts_buffer_reset(buf, &bts->handle); - if (ret) { - perf_aux_output_end(&bts->handle, 0, false); - return ret; - } - - bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base; - bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; - bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; - - if (mode & PERF_EF_START) { - bts_event_start(event, 0); - if (hwc->state & PERF_HES_STOPPED) { - bts_event_del(event, 0); - return -EBUSY; - } - } - - return 0; -} - -static void bts_event_destroy(struct perf_event *event) -{ - x86_release_hardware(); - x86_del_exclusive(x86_lbr_exclusive_bts); -} - -static int bts_event_init(struct perf_event *event) -{ - int ret; - - if (event->attr.type != bts_pmu.type) - return -ENOENT; - - if (x86_add_exclusive(x86_lbr_exclusive_bts)) - return -EBUSY; - - /* - * BTS leaks kernel addresses even when CPL0 tracing is - * disabled, so disallow intel_bts driver for unprivileged - * users on paranoid systems since it provides trace data - * to the user in a zero-copy fashion. - * - * Note that the default paranoia setting permits unprivileged - * users to profile the kernel. - */ - if (event->attr.exclude_kernel && perf_paranoid_kernel() && - !capable(CAP_SYS_ADMIN)) - return -EACCES; - - ret = x86_reserve_hardware(); - if (ret) { - x86_del_exclusive(x86_lbr_exclusive_bts); - return ret; - } - - event->destroy = bts_event_destroy; - - return 0; -} - -static void bts_event_read(struct perf_event *event) -{ -} - -static __init int bts_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) - return -ENODEV; - - bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE; - bts_pmu.task_ctx_nr = perf_sw_context; - bts_pmu.event_init = bts_event_init; - bts_pmu.add = bts_event_add; - bts_pmu.del = bts_event_del; - bts_pmu.start = bts_event_start; - bts_pmu.stop = bts_event_stop; - bts_pmu.read = bts_event_read; - bts_pmu.setup_aux = bts_buffer_setup_aux; - bts_pmu.free_aux = bts_buffer_free_aux; - - return perf_pmu_register(&bts_pmu, "intel_bts", -1); -} -arch_initcall(bts_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c deleted file mode 100644 index a316ca96f1b6..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ /dev/null @@ -1,1391 +0,0 @@ -/* - * Intel Cache Quality-of-Service Monitoring (CQM) support. - * - * Based very, very heavily on work by Peter Zijlstra. - */ - -#include <linux/perf_event.h> -#include <linux/slab.h> -#include <asm/cpu_device_id.h> -#include "perf_event.h" - -#define MSR_IA32_PQR_ASSOC 0x0c8f -#define MSR_IA32_QM_CTR 0x0c8e -#define MSR_IA32_QM_EVTSEL 0x0c8d - -static u32 cqm_max_rmid = -1; -static unsigned int cqm_l3_scale; /* supposedly cacheline size */ - -/** - * struct intel_pqr_state - State cache for the PQR MSR - * @rmid: The cached Resource Monitoring ID - * @closid: The cached Class Of Service ID - * @rmid_usecnt: The usage counter for rmid - * - * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the - * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always - * contains both parts, so we need to cache them. - * - * The cache also helps to avoid pointless updates if the value does - * not change. - */ -struct intel_pqr_state { - u32 rmid; - u32 closid; - int rmid_usecnt; -}; - -/* - * The cached intel_pqr_state is strictly per CPU and can never be - * updated from a remote CPU. Both functions which modify the state - * (intel_cqm_event_start and intel_cqm_event_stop) are called with - * interrupts disabled, which is sufficient for the protection. - */ -static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); - -/* - * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. - * Also protects event->hw.cqm_rmid - * - * Hold either for stability, both for modification of ->hw.cqm_rmid. - */ -static DEFINE_MUTEX(cache_mutex); -static DEFINE_RAW_SPINLOCK(cache_lock); - -/* - * Groups of events that have the same target(s), one RMID per group. - */ -static LIST_HEAD(cache_groups); - -/* - * Mask of CPUs for reading CQM values. We only need one per-socket. - */ -static cpumask_t cqm_cpumask; - -#define RMID_VAL_ERROR (1ULL << 63) -#define RMID_VAL_UNAVAIL (1ULL << 62) - -#define QOS_L3_OCCUP_EVENT_ID (1 << 0) - -#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID - -/* - * This is central to the rotation algorithm in __intel_cqm_rmid_rotate(). - * - * This rmid is always free and is guaranteed to have an associated - * near-zero occupancy value, i.e. no cachelines are tagged with this - * RMID, once __intel_cqm_rmid_rotate() returns. - */ -static u32 intel_cqm_rotation_rmid; - -#define INVALID_RMID (-1) - -/* - * Is @rmid valid for programming the hardware? - * - * rmid 0 is reserved by the hardware for all non-monitored tasks, which - * means that we should never come across an rmid with that value. - * Likewise, an rmid value of -1 is used to indicate "no rmid currently - * assigned" and is used as part of the rotation code. - */ -static inline bool __rmid_valid(u32 rmid) -{ - if (!rmid || rmid == INVALID_RMID) - return false; - - return true; -} - -static u64 __rmid_read(u32 rmid) -{ - u64 val; - - /* - * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt, - * it just says that to increase confusion. - */ - wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid); - rdmsrl(MSR_IA32_QM_CTR, val); - - /* - * Aside from the ERROR and UNAVAIL bits, assume this thing returns - * the number of cachelines tagged with @rmid. - */ - return val; -} - -enum rmid_recycle_state { - RMID_YOUNG = 0, - RMID_AVAILABLE, - RMID_DIRTY, -}; - -struct cqm_rmid_entry { - u32 rmid; - enum rmid_recycle_state state; - struct list_head list; - unsigned long queue_time; -}; - -/* - * cqm_rmid_free_lru - A least recently used list of RMIDs. - * - * Oldest entry at the head, newest (most recently used) entry at the - * tail. This list is never traversed, it's only used to keep track of - * the lru order. That is, we only pick entries of the head or insert - * them on the tail. - * - * All entries on the list are 'free', and their RMIDs are not currently - * in use. To mark an RMID as in use, remove its entry from the lru - * list. - * - * - * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs. - * - * This list is contains RMIDs that no one is currently using but that - * may have a non-zero occupancy value associated with them. The - * rotation worker moves RMIDs from the limbo list to the free list once - * the occupancy value drops below __intel_cqm_threshold. - * - * Both lists are protected by cache_mutex. - */ -static LIST_HEAD(cqm_rmid_free_lru); -static LIST_HEAD(cqm_rmid_limbo_lru); - -/* - * We use a simple array of pointers so that we can lookup a struct - * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid() - * and __put_rmid() from having to worry about dealing with struct - * cqm_rmid_entry - they just deal with rmids, i.e. integers. - * - * Once this array is initialized it is read-only. No locks are required - * to access it. - * - * All entries for all RMIDs can be looked up in the this array at all - * times. - */ -static struct cqm_rmid_entry **cqm_rmid_ptrs; - -static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid) -{ - struct cqm_rmid_entry *entry; - - entry = cqm_rmid_ptrs[rmid]; - WARN_ON(entry->rmid != rmid); - - return entry; -} - -/* - * Returns < 0 on fail. - * - * We expect to be called with cache_mutex held. - */ -static u32 __get_rmid(void) -{ - struct cqm_rmid_entry *entry; - - lockdep_assert_held(&cache_mutex); - - if (list_empty(&cqm_rmid_free_lru)) - return INVALID_RMID; - - entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list); - list_del(&entry->list); - - return entry->rmid; -} - -static void __put_rmid(u32 rmid) -{ - struct cqm_rmid_entry *entry; - - lockdep_assert_held(&cache_mutex); - - WARN_ON(!__rmid_valid(rmid)); - entry = __rmid_entry(rmid); - - entry->queue_time = jiffies; - entry->state = RMID_YOUNG; - - list_add_tail(&entry->list, &cqm_rmid_limbo_lru); -} - -static int intel_cqm_setup_rmid_cache(void) -{ - struct cqm_rmid_entry *entry; - unsigned int nr_rmids; - int r = 0; - - nr_rmids = cqm_max_rmid + 1; - cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) * - nr_rmids, GFP_KERNEL); - if (!cqm_rmid_ptrs) - return -ENOMEM; - - for (; r <= cqm_max_rmid; r++) { - struct cqm_rmid_entry *entry; - - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - goto fail; - - INIT_LIST_HEAD(&entry->list); - entry->rmid = r; - cqm_rmid_ptrs[r] = entry; - - list_add_tail(&entry->list, &cqm_rmid_free_lru); - } - - /* - * RMID 0 is special and is always allocated. It's used for all - * tasks that are not monitored. - */ - entry = __rmid_entry(0); - list_del(&entry->list); - - mutex_lock(&cache_mutex); - intel_cqm_rotation_rmid = __get_rmid(); - mutex_unlock(&cache_mutex); - - return 0; -fail: - while (r--) - kfree(cqm_rmid_ptrs[r]); - - kfree(cqm_rmid_ptrs); - return -ENOMEM; -} - -/* - * Determine if @a and @b measure the same set of tasks. - * - * If @a and @b measure the same set of tasks then we want to share a - * single RMID. - */ -static bool __match_event(struct perf_event *a, struct perf_event *b) -{ - /* Per-cpu and task events don't mix */ - if ((a->attach_state & PERF_ATTACH_TASK) != - (b->attach_state & PERF_ATTACH_TASK)) - return false; - -#ifdef CONFIG_CGROUP_PERF - if (a->cgrp != b->cgrp) - return false; -#endif - - /* If not task event, we're machine wide */ - if (!(b->attach_state & PERF_ATTACH_TASK)) - return true; - - /* - * Events that target same task are placed into the same cache group. - */ - if (a->hw.target == b->hw.target) - return true; - - /* - * Are we an inherited event? - */ - if (b->parent == a) - return true; - - return false; -} - -#ifdef CONFIG_CGROUP_PERF -static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) -{ - if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.target, event->ctx); - - return event->cgrp; -} -#endif - -/* - * Determine if @a's tasks intersect with @b's tasks - * - * There are combinations of events that we explicitly prohibit, - * - * PROHIBITS - * system-wide -> cgroup and task - * cgroup -> system-wide - * -> task in cgroup - * task -> system-wide - * -> task in cgroup - * - * Call this function before allocating an RMID. - */ -static bool __conflict_event(struct perf_event *a, struct perf_event *b) -{ -#ifdef CONFIG_CGROUP_PERF - /* - * We can have any number of cgroups but only one system-wide - * event at a time. - */ - if (a->cgrp && b->cgrp) { - struct perf_cgroup *ac = a->cgrp; - struct perf_cgroup *bc = b->cgrp; - - /* - * This condition should have been caught in - * __match_event() and we should be sharing an RMID. - */ - WARN_ON_ONCE(ac == bc); - - if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || - cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) - return true; - - return false; - } - - if (a->cgrp || b->cgrp) { - struct perf_cgroup *ac, *bc; - - /* - * cgroup and system-wide events are mutually exclusive - */ - if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) || - (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK))) - return true; - - /* - * Ensure neither event is part of the other's cgroup - */ - ac = event_to_cgroup(a); - bc = event_to_cgroup(b); - if (ac == bc) - return true; - - /* - * Must have cgroup and non-intersecting task events. - */ - if (!ac || !bc) - return false; - - /* - * We have cgroup and task events, and the task belongs - * to a cgroup. Check for for overlap. - */ - if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || - cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) - return true; - - return false; - } -#endif - /* - * If one of them is not a task, same story as above with cgroups. - */ - if (!(a->attach_state & PERF_ATTACH_TASK) || - !(b->attach_state & PERF_ATTACH_TASK)) - return true; - - /* - * Must be non-overlapping. - */ - return false; -} - -struct rmid_read { - u32 rmid; - atomic64_t value; -}; - -static void __intel_cqm_event_count(void *info); - -/* - * Exchange the RMID of a group of events. - */ -static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) -{ - struct perf_event *event; - struct list_head *head = &group->hw.cqm_group_entry; - u32 old_rmid = group->hw.cqm_rmid; - - lockdep_assert_held(&cache_mutex); - - /* - * If our RMID is being deallocated, perform a read now. - */ - if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) { - struct rmid_read rr = { - .value = ATOMIC64_INIT(0), - .rmid = old_rmid, - }; - - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, - &rr, 1); - local64_set(&group->count, atomic64_read(&rr.value)); - } - - raw_spin_lock_irq(&cache_lock); - - group->hw.cqm_rmid = rmid; - list_for_each_entry(event, head, hw.cqm_group_entry) - event->hw.cqm_rmid = rmid; - - raw_spin_unlock_irq(&cache_lock); - - return old_rmid; -} - -/* - * If we fail to assign a new RMID for intel_cqm_rotation_rmid because - * cachelines are still tagged with RMIDs in limbo, we progressively - * increment the threshold until we find an RMID in limbo with <= - * __intel_cqm_threshold lines tagged. This is designed to mitigate the - * problem where cachelines tagged with an RMID are not steadily being - * evicted. - * - * On successful rotations we decrease the threshold back towards zero. - * - * __intel_cqm_max_threshold provides an upper bound on the threshold, - * and is measured in bytes because it's exposed to userland. - */ -static unsigned int __intel_cqm_threshold; -static unsigned int __intel_cqm_max_threshold; - -/* - * Test whether an RMID has a zero occupancy value on this cpu. - */ -static void intel_cqm_stable(void *arg) -{ - struct cqm_rmid_entry *entry; - - list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { - if (entry->state != RMID_AVAILABLE) - break; - - if (__rmid_read(entry->rmid) > __intel_cqm_threshold) - entry->state = RMID_DIRTY; - } -} - -/* - * If we have group events waiting for an RMID that don't conflict with - * events already running, assign @rmid. - */ -static bool intel_cqm_sched_in_event(u32 rmid) -{ - struct perf_event *leader, *event; - - lockdep_assert_held(&cache_mutex); - - leader = list_first_entry(&cache_groups, struct perf_event, - hw.cqm_groups_entry); - event = leader; - - list_for_each_entry_continue(event, &cache_groups, - hw.cqm_groups_entry) { - if (__rmid_valid(event->hw.cqm_rmid)) - continue; - - if (__conflict_event(event, leader)) - continue; - - intel_cqm_xchg_rmid(event, rmid); - return true; - } - - return false; -} - -/* - * Initially use this constant for both the limbo queue time and the - * rotation timer interval, pmu::hrtimer_interval_ms. - * - * They don't need to be the same, but the two are related since if you - * rotate faster than you recycle RMIDs, you may run out of available - * RMIDs. - */ -#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */ - -static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME; - -/* - * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list - * @nr_available: number of freeable RMIDs on the limbo list - * - * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no - * cachelines are tagged with those RMIDs. After this we can reuse them - * and know that the current set of active RMIDs is stable. - * - * Return %true or %false depending on whether stabilization needs to be - * reattempted. - * - * If we return %true then @nr_available is updated to indicate the - * number of RMIDs on the limbo list that have been queued for the - * minimum queue time (RMID_AVAILABLE), but whose data occupancy values - * are above __intel_cqm_threshold. - */ -static bool intel_cqm_rmid_stabilize(unsigned int *available) -{ - struct cqm_rmid_entry *entry, *tmp; - - lockdep_assert_held(&cache_mutex); - - *available = 0; - list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { - unsigned long min_queue_time; - unsigned long now = jiffies; - - /* - * We hold RMIDs placed into limbo for a minimum queue - * time. Before the minimum queue time has elapsed we do - * not recycle RMIDs. - * - * The reasoning is that until a sufficient time has - * passed since we stopped using an RMID, any RMID - * placed onto the limbo list will likely still have - * data tagged in the cache, which means we'll probably - * fail to recycle it anyway. - * - * We can save ourselves an expensive IPI by skipping - * any RMIDs that have not been queued for the minimum - * time. - */ - min_queue_time = entry->queue_time + - msecs_to_jiffies(__rmid_queue_time_ms); - - if (time_after(min_queue_time, now)) - break; - - entry->state = RMID_AVAILABLE; - (*available)++; - } - - /* - * Fast return if none of the RMIDs on the limbo list have been - * sitting on the queue for the minimum queue time. - */ - if (!*available) - return false; - - /* - * Test whether an RMID is free for each package. - */ - on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true); - - list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) { - /* - * Exhausted all RMIDs that have waited min queue time. - */ - if (entry->state == RMID_YOUNG) - break; - - if (entry->state == RMID_DIRTY) - continue; - - list_del(&entry->list); /* remove from limbo */ - - /* - * The rotation RMID gets priority if it's - * currently invalid. In which case, skip adding - * the RMID to the the free lru. - */ - if (!__rmid_valid(intel_cqm_rotation_rmid)) { - intel_cqm_rotation_rmid = entry->rmid; - continue; - } - - /* - * If we have groups waiting for RMIDs, hand - * them one now provided they don't conflict. - */ - if (intel_cqm_sched_in_event(entry->rmid)) - continue; - - /* - * Otherwise place it onto the free list. - */ - list_add_tail(&entry->list, &cqm_rmid_free_lru); - } - - - return __rmid_valid(intel_cqm_rotation_rmid); -} - -/* - * Pick a victim group and move it to the tail of the group list. - * @next: The first group without an RMID - */ -static void __intel_cqm_pick_and_rotate(struct perf_event *next) -{ - struct perf_event *rotor; - u32 rmid; - - lockdep_assert_held(&cache_mutex); - - rotor = list_first_entry(&cache_groups, struct perf_event, - hw.cqm_groups_entry); - - /* - * The group at the front of the list should always have a valid - * RMID. If it doesn't then no groups have RMIDs assigned and we - * don't need to rotate the list. - */ - if (next == rotor) - return; - - rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID); - __put_rmid(rmid); - - list_rotate_left(&cache_groups); -} - -/* - * Deallocate the RMIDs from any events that conflict with @event, and - * place them on the back of the group list. - */ -static void intel_cqm_sched_out_conflicting_events(struct perf_event *event) -{ - struct perf_event *group, *g; - u32 rmid; - - lockdep_assert_held(&cache_mutex); - - list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) { - if (group == event) - continue; - - rmid = group->hw.cqm_rmid; - - /* - * Skip events that don't have a valid RMID. - */ - if (!__rmid_valid(rmid)) - continue; - - /* - * No conflict? No problem! Leave the event alone. - */ - if (!__conflict_event(group, event)) - continue; - - intel_cqm_xchg_rmid(group, INVALID_RMID); - __put_rmid(rmid); - } -} - -/* - * Attempt to rotate the groups and assign new RMIDs. - * - * We rotate for two reasons, - * 1. To handle the scheduling of conflicting events - * 2. To recycle RMIDs - * - * Rotating RMIDs is complicated because the hardware doesn't give us - * any clues. - * - * There's problems with the hardware interface; when you change the - * task:RMID map cachelines retain their 'old' tags, giving a skewed - * picture. In order to work around this, we must always keep one free - * RMID - intel_cqm_rotation_rmid. - * - * Rotation works by taking away an RMID from a group (the old RMID), - * and assigning the free RMID to another group (the new RMID). We must - * then wait for the old RMID to not be used (no cachelines tagged). - * This ensure that all cachelines are tagged with 'active' RMIDs. At - * this point we can start reading values for the new RMID and treat the - * old RMID as the free RMID for the next rotation. - * - * Return %true or %false depending on whether we did any rotating. - */ -static bool __intel_cqm_rmid_rotate(void) -{ - struct perf_event *group, *start = NULL; - unsigned int threshold_limit; - unsigned int nr_needed = 0; - unsigned int nr_available; - bool rotated = false; - - mutex_lock(&cache_mutex); - -again: - /* - * Fast path through this function if there are no groups and no - * RMIDs that need cleaning. - */ - if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru)) - goto out; - - list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) { - if (!__rmid_valid(group->hw.cqm_rmid)) { - if (!start) - start = group; - nr_needed++; - } - } - - /* - * We have some event groups, but they all have RMIDs assigned - * and no RMIDs need cleaning. - */ - if (!nr_needed && list_empty(&cqm_rmid_limbo_lru)) - goto out; - - if (!nr_needed) - goto stabilize; - - /* - * We have more event groups without RMIDs than available RMIDs, - * or we have event groups that conflict with the ones currently - * scheduled. - * - * We force deallocate the rmid of the group at the head of - * cache_groups. The first event group without an RMID then gets - * assigned intel_cqm_rotation_rmid. This ensures we always make - * forward progress. - * - * Rotate the cache_groups list so the previous head is now the - * tail. - */ - __intel_cqm_pick_and_rotate(start); - - /* - * If the rotation is going to succeed, reduce the threshold so - * that we don't needlessly reuse dirty RMIDs. - */ - if (__rmid_valid(intel_cqm_rotation_rmid)) { - intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid); - intel_cqm_rotation_rmid = __get_rmid(); - - intel_cqm_sched_out_conflicting_events(start); - - if (__intel_cqm_threshold) - __intel_cqm_threshold--; - } - - rotated = true; - -stabilize: - /* - * We now need to stablize the RMID we freed above (if any) to - * ensure that the next time we rotate we have an RMID with zero - * occupancy value. - * - * Alternatively, if we didn't need to perform any rotation, - * we'll have a bunch of RMIDs in limbo that need stabilizing. - */ - threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale; - - while (intel_cqm_rmid_stabilize(&nr_available) && - __intel_cqm_threshold < threshold_limit) { - unsigned int steal_limit; - - /* - * Don't spin if nobody is actively waiting for an RMID, - * the rotation worker will be kicked as soon as an - * event needs an RMID anyway. - */ - if (!nr_needed) - break; - - /* Allow max 25% of RMIDs to be in limbo. */ - steal_limit = (cqm_max_rmid + 1) / 4; - - /* - * We failed to stabilize any RMIDs so our rotation - * logic is now stuck. In order to make forward progress - * we have a few options: - * - * 1. rotate ("steal") another RMID - * 2. increase the threshold - * 3. do nothing - * - * We do both of 1. and 2. until we hit the steal limit. - * - * The steal limit prevents all RMIDs ending up on the - * limbo list. This can happen if every RMID has a - * non-zero occupancy above threshold_limit, and the - * occupancy values aren't dropping fast enough. - * - * Note that there is prioritisation at work here - we'd - * rather increase the number of RMIDs on the limbo list - * than increase the threshold, because increasing the - * threshold skews the event data (because we reuse - * dirty RMIDs) - threshold bumps are a last resort. - */ - if (nr_available < steal_limit) - goto again; - - __intel_cqm_threshold++; - } - -out: - mutex_unlock(&cache_mutex); - return rotated; -} - -static void intel_cqm_rmid_rotate(struct work_struct *work); - -static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate); - -static struct pmu intel_cqm_pmu; - -static void intel_cqm_rmid_rotate(struct work_struct *work) -{ - unsigned long delay; - - __intel_cqm_rmid_rotate(); - - delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms); - schedule_delayed_work(&intel_cqm_rmid_work, delay); -} - -/* - * Find a group and setup RMID. - * - * If we're part of a group, we use the group's RMID. - */ -static void intel_cqm_setup_event(struct perf_event *event, - struct perf_event **group) -{ - struct perf_event *iter; - bool conflict = false; - u32 rmid; - - list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { - rmid = iter->hw.cqm_rmid; - - if (__match_event(iter, event)) { - /* All tasks in a group share an RMID */ - event->hw.cqm_rmid = rmid; - *group = iter; - return; - } - - /* - * We only care about conflicts for events that are - * actually scheduled in (and hence have a valid RMID). - */ - if (__conflict_event(iter, event) && __rmid_valid(rmid)) - conflict = true; - } - - if (conflict) - rmid = INVALID_RMID; - else - rmid = __get_rmid(); - - event->hw.cqm_rmid = rmid; -} - -static void intel_cqm_event_read(struct perf_event *event) -{ - unsigned long flags; - u32 rmid; - u64 val; - - /* - * Task events are handled by intel_cqm_event_count(). - */ - if (event->cpu == -1) - return; - - raw_spin_lock_irqsave(&cache_lock, flags); - rmid = event->hw.cqm_rmid; - - if (!__rmid_valid(rmid)) - goto out; - - val = __rmid_read(rmid); - - /* - * Ignore this reading on error states and do not update the value. - */ - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - goto out; - - local64_set(&event->count, val); -out: - raw_spin_unlock_irqrestore(&cache_lock, flags); -} - -static void __intel_cqm_event_count(void *info) -{ - struct rmid_read *rr = info; - u64 val; - - val = __rmid_read(rr->rmid); - - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return; - - atomic64_add(val, &rr->value); -} - -static inline bool cqm_group_leader(struct perf_event *event) -{ - return !list_empty(&event->hw.cqm_groups_entry); -} - -static u64 intel_cqm_event_count(struct perf_event *event) -{ - unsigned long flags; - struct rmid_read rr = { - .value = ATOMIC64_INIT(0), - }; - - /* - * We only need to worry about task events. System-wide events - * are handled like usual, i.e. entirely with - * intel_cqm_event_read(). - */ - if (event->cpu != -1) - return __perf_event_count(event); - - /* - * Only the group leader gets to report values. This stops us - * reporting duplicate values to userspace, and gives us a clear - * rule for which task gets to report the values. - * - * Note that it is impossible to attribute these values to - * specific packages - we forfeit that ability when we create - * task events. - */ - if (!cqm_group_leader(event)) - return 0; - - /* - * Getting up-to-date values requires an SMP IPI which is not - * possible if we're being called in interrupt context. Return - * the cached values instead. - */ - if (unlikely(in_interrupt())) - goto out; - - /* - * Notice that we don't perform the reading of an RMID - * atomically, because we can't hold a spin lock across the - * IPIs. - * - * Speculatively perform the read, since @event might be - * assigned a different (possibly invalid) RMID while we're - * busying performing the IPI calls. It's therefore necessary to - * check @event's RMID afterwards, and if it has changed, - * discard the result of the read. - */ - rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid); - - if (!__rmid_valid(rr.rmid)) - goto out; - - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1); - - raw_spin_lock_irqsave(&cache_lock, flags); - if (event->hw.cqm_rmid == rr.rmid) - local64_set(&event->count, atomic64_read(&rr.value)); - raw_spin_unlock_irqrestore(&cache_lock, flags); -out: - return __perf_event_count(event); -} - -static void intel_cqm_event_start(struct perf_event *event, int mode) -{ - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - u32 rmid = event->hw.cqm_rmid; - - if (!(event->hw.cqm_state & PERF_HES_STOPPED)) - return; - - event->hw.cqm_state &= ~PERF_HES_STOPPED; - - if (state->rmid_usecnt++) { - if (!WARN_ON_ONCE(state->rmid != rmid)) - return; - } else { - WARN_ON_ONCE(state->rmid); - } - - state->rmid = rmid; - wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid); -} - -static void intel_cqm_event_stop(struct perf_event *event, int mode) -{ - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - - if (event->hw.cqm_state & PERF_HES_STOPPED) - return; - - event->hw.cqm_state |= PERF_HES_STOPPED; - - intel_cqm_event_read(event); - - if (!--state->rmid_usecnt) { - state->rmid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid); - } else { - WARN_ON_ONCE(!state->rmid); - } -} - -static int intel_cqm_event_add(struct perf_event *event, int mode) -{ - unsigned long flags; - u32 rmid; - - raw_spin_lock_irqsave(&cache_lock, flags); - - event->hw.cqm_state = PERF_HES_STOPPED; - rmid = event->hw.cqm_rmid; - - if (__rmid_valid(rmid) && (mode & PERF_EF_START)) - intel_cqm_event_start(event, mode); - - raw_spin_unlock_irqrestore(&cache_lock, flags); - - return 0; -} - -static void intel_cqm_event_destroy(struct perf_event *event) -{ - struct perf_event *group_other = NULL; - - mutex_lock(&cache_mutex); - - /* - * If there's another event in this group... - */ - if (!list_empty(&event->hw.cqm_group_entry)) { - group_other = list_first_entry(&event->hw.cqm_group_entry, - struct perf_event, - hw.cqm_group_entry); - list_del(&event->hw.cqm_group_entry); - } - - /* - * And we're the group leader.. - */ - if (cqm_group_leader(event)) { - /* - * If there was a group_other, make that leader, otherwise - * destroy the group and return the RMID. - */ - if (group_other) { - list_replace(&event->hw.cqm_groups_entry, - &group_other->hw.cqm_groups_entry); - } else { - u32 rmid = event->hw.cqm_rmid; - - if (__rmid_valid(rmid)) - __put_rmid(rmid); - list_del(&event->hw.cqm_groups_entry); - } - } - - mutex_unlock(&cache_mutex); -} - -static int intel_cqm_event_init(struct perf_event *event) -{ - struct perf_event *group = NULL; - bool rotate = false; - - if (event->attr.type != intel_cqm_pmu.type) - return -ENOENT; - - if (event->attr.config & ~QOS_EVENT_MASK) - return -EINVAL; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - INIT_LIST_HEAD(&event->hw.cqm_group_entry); - INIT_LIST_HEAD(&event->hw.cqm_groups_entry); - - event->destroy = intel_cqm_event_destroy; - - mutex_lock(&cache_mutex); - - /* Will also set rmid */ - intel_cqm_setup_event(event, &group); - - if (group) { - list_add_tail(&event->hw.cqm_group_entry, - &group->hw.cqm_group_entry); - } else { - list_add_tail(&event->hw.cqm_groups_entry, - &cache_groups); - - /* - * All RMIDs are either in use or have recently been - * used. Kick the rotation worker to clean/free some. - * - * We only do this for the group leader, rather than for - * every event in a group to save on needless work. - */ - if (!__rmid_valid(event->hw.cqm_rmid)) - rotate = true; - } - - mutex_unlock(&cache_mutex); - - if (rotate) - schedule_delayed_work(&intel_cqm_rmid_work, 0); - - return 0; -} - -EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01"); -EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1"); -EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes"); -EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL); -EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1"); - -static struct attribute *intel_cqm_events_attr[] = { - EVENT_PTR(intel_cqm_llc), - EVENT_PTR(intel_cqm_llc_pkg), - EVENT_PTR(intel_cqm_llc_unit), - EVENT_PTR(intel_cqm_llc_scale), - EVENT_PTR(intel_cqm_llc_snapshot), - NULL, -}; - -static struct attribute_group intel_cqm_events_group = { - .name = "events", - .attrs = intel_cqm_events_attr, -}; - -PMU_FORMAT_ATTR(event, "config:0-7"); -static struct attribute *intel_cqm_formats_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group intel_cqm_format_group = { - .name = "format", - .attrs = intel_cqm_formats_attr, -}; - -static ssize_t -max_recycle_threshold_show(struct device *dev, struct device_attribute *attr, - char *page) -{ - ssize_t rv; - - mutex_lock(&cache_mutex); - rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold); - mutex_unlock(&cache_mutex); - - return rv; -} - -static ssize_t -max_recycle_threshold_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - unsigned int bytes, cachelines; - int ret; - - ret = kstrtouint(buf, 0, &bytes); - if (ret) - return ret; - - mutex_lock(&cache_mutex); - - __intel_cqm_max_threshold = bytes; - cachelines = bytes / cqm_l3_scale; - - /* - * The new maximum takes effect immediately. - */ - if (__intel_cqm_threshold > cachelines) - __intel_cqm_threshold = cachelines; - - mutex_unlock(&cache_mutex); - - return count; -} - -static DEVICE_ATTR_RW(max_recycle_threshold); - -static struct attribute *intel_cqm_attrs[] = { - &dev_attr_max_recycle_threshold.attr, - NULL, -}; - -static const struct attribute_group intel_cqm_group = { - .attrs = intel_cqm_attrs, -}; - -static const struct attribute_group *intel_cqm_attr_groups[] = { - &intel_cqm_events_group, - &intel_cqm_format_group, - &intel_cqm_group, - NULL, -}; - -static struct pmu intel_cqm_pmu = { - .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME, - .attr_groups = intel_cqm_attr_groups, - .task_ctx_nr = perf_sw_context, - .event_init = intel_cqm_event_init, - .add = intel_cqm_event_add, - .del = intel_cqm_event_stop, - .start = intel_cqm_event_start, - .stop = intel_cqm_event_stop, - .read = intel_cqm_event_read, - .count = intel_cqm_event_count, -}; - -static inline void cqm_pick_event_reader(int cpu) -{ - int phys_id = topology_physical_package_id(cpu); - int i; - - for_each_cpu(i, &cqm_cpumask) { - if (phys_id == topology_physical_package_id(i)) - return; /* already got reader for this socket */ - } - - cpumask_set_cpu(cpu, &cqm_cpumask); -} - -static void intel_cqm_cpu_starting(unsigned int cpu) -{ - struct intel_pqr_state *state = &per_cpu(pqr_state, cpu); - struct cpuinfo_x86 *c = &cpu_data(cpu); - - state->rmid = 0; - state->closid = 0; - state->rmid_usecnt = 0; - - WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); - WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); -} - -static void intel_cqm_cpu_exit(unsigned int cpu) -{ - int phys_id = topology_physical_package_id(cpu); - int i; - - /* - * Is @cpu a designated cqm reader? - */ - if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) - return; - - for_each_online_cpu(i) { - if (i == cpu) - continue; - - if (phys_id == topology_physical_package_id(i)) { - cpumask_set_cpu(i, &cqm_cpumask); - break; - } - } -} - -static int intel_cqm_cpu_notifier(struct notifier_block *nb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - intel_cqm_cpu_exit(cpu); - break; - case CPU_STARTING: - intel_cqm_cpu_starting(cpu); - cqm_pick_event_reader(cpu); - break; - } - - return NOTIFY_OK; -} - -static const struct x86_cpu_id intel_cqm_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC }, - {} -}; - -static int __init intel_cqm_init(void) -{ - char *str, scale[20]; - int i, cpu, ret; - - if (!x86_match_cpu(intel_cqm_match)) - return -ENODEV; - - cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; - - /* - * It's possible that not all resources support the same number - * of RMIDs. Instead of making scheduling much more complicated - * (where we have to match a task's RMID to a cpu that supports - * that many RMIDs) just find the minimum RMIDs supported across - * all cpus. - * - * Also, check that the scales match on all cpus. - */ - cpu_notifier_register_begin(); - - for_each_online_cpu(cpu) { - struct cpuinfo_x86 *c = &cpu_data(cpu); - - if (c->x86_cache_max_rmid < cqm_max_rmid) - cqm_max_rmid = c->x86_cache_max_rmid; - - if (c->x86_cache_occ_scale != cqm_l3_scale) { - pr_err("Multiple LLC scale values, disabling\n"); - ret = -EINVAL; - goto out; - } - } - - /* - * A reasonable upper limit on the max threshold is the number - * of lines tagged per RMID if all RMIDs have the same number of - * lines tagged in the LLC. - * - * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. - */ - __intel_cqm_max_threshold = - boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1); - - snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); - str = kstrdup(scale, GFP_KERNEL); - if (!str) { - ret = -ENOMEM; - goto out; - } - - event_attr_intel_cqm_llc_scale.event_str = str; - - ret = intel_cqm_setup_rmid_cache(); - if (ret) - goto out; - - for_each_online_cpu(i) { - intel_cqm_cpu_starting(i); - cqm_pick_event_reader(i); - } - - __perf_cpu_notifier(intel_cqm_cpu_notifier); - - ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); - if (ret) - pr_err("Intel CQM perf registration failed: %d\n", ret); - else - pr_info("Intel CQM monitoring enabled\n"); - -out: - cpu_notifier_register_done(); - - return ret; -} -device_initcall(intel_cqm_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_cstate.c b/arch/x86/kernel/cpu/perf_event_intel_cstate.c deleted file mode 100644 index 75a38b5a2e26..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_cstate.c +++ /dev/null @@ -1,694 +0,0 @@ -/* - * perf_event_intel_cstate.c: support cstate residency counters - * - * Copyright (C) 2015, Intel Corp. - * Author: Kan Liang (kan.liang@intel.com) - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - */ - -/* - * This file export cstate related free running (read-only) counters - * for perf. These counters may be use simultaneously by other tools, - * such as turbostat. However, it still make sense to implement them - * in perf. Because we can conveniently collect them together with - * other events, and allow to use them from tools without special MSR - * access code. - * - * The events only support system-wide mode counting. There is no - * sampling support because it is not supported by the hardware. - * - * According to counters' scope and category, two PMUs are registered - * with the perf_event core subsystem. - * - 'cstate_core': The counter is available for each physical core. - * The counters include CORE_C*_RESIDENCY. - * - 'cstate_pkg': The counter is available for each physical package. - * The counters include PKG_C*_RESIDENCY. - * - * All of these counters are specified in the Intel® 64 and IA-32 - * Architectures Software Developer.s Manual Vol3b. - * - * Model specific counters: - * MSR_CORE_C1_RES: CORE C1 Residency Counter - * perf code: 0x00 - * Available model: SLM,AMT - * Scope: Core (each processor core has a MSR) - * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter - * perf code: 0x01 - * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL - * Scope: Core - * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter - * perf code: 0x02 - * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL - * Scope: Core - * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter - * perf code: 0x03 - * Available model: SNB,IVB,HSW,BDW,SKL - * Scope: Core - * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. - * perf code: 0x00 - * Available model: SNB,IVB,HSW,BDW,SKL - * Scope: Package (physical package) - * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. - * perf code: 0x01 - * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL - * Scope: Package (physical package) - * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. - * perf code: 0x02 - * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL - * Scope: Package (physical package) - * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. - * perf code: 0x03 - * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL - * Scope: Package (physical package) - * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. - * perf code: 0x04 - * Available model: HSW ULT only - * Scope: Package (physical package) - * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. - * perf code: 0x05 - * Available model: HSW ULT only - * Scope: Package (physical package) - * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. - * perf code: 0x06 - * Available model: HSW ULT only - * Scope: Package (physical package) - * - */ - -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/perf_event.h> -#include <asm/cpu_device_id.h> -#include "perf_event.h" - -#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \ -static ssize_t __cstate_##_var##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, \ - char *page) \ -{ \ - BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ - return sprintf(page, _format "\n"); \ -} \ -static struct kobj_attribute format_attr_##_var = \ - __ATTR(_name, 0444, __cstate_##_var##_show, NULL) - -static ssize_t cstate_get_attr_cpumask(struct device *dev, - struct device_attribute *attr, - char *buf); - -struct perf_cstate_msr { - u64 msr; - struct perf_pmu_events_attr *attr; - bool (*test)(int idx); -}; - - -/* cstate_core PMU */ - -static struct pmu cstate_core_pmu; -static bool has_cstate_core; - -enum perf_cstate_core_id { - /* - * cstate_core events - */ - PERF_CSTATE_CORE_C1_RES = 0, - PERF_CSTATE_CORE_C3_RES, - PERF_CSTATE_CORE_C6_RES, - PERF_CSTATE_CORE_C7_RES, - - PERF_CSTATE_CORE_EVENT_MAX, -}; - -bool test_core(int idx) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 != 6) - return false; - - switch (boot_cpu_data.x86_model) { - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ - - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ - if (idx == PERF_CSTATE_CORE_C3_RES || - idx == PERF_CSTATE_CORE_C6_RES) - return true; - break; - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm SandyBridge-E/EN/EP */ - - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ - - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 69: /* 22nm Haswell ULT */ - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ - - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 79: /* 14nm Broadwell Server */ - - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ - if (idx == PERF_CSTATE_CORE_C3_RES || - idx == PERF_CSTATE_CORE_C6_RES || - idx == PERF_CSTATE_CORE_C7_RES) - return true; - break; - case 55: /* 22nm Atom "Silvermont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ - case 76: /* 14nm Atom "Airmont" */ - if (idx == PERF_CSTATE_CORE_C1_RES || - idx == PERF_CSTATE_CORE_C6_RES) - return true; - break; - } - - return false; -} - -PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00"); -PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01"); -PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02"); -PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03"); - -static struct perf_cstate_msr core_msr[] = { - [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1, test_core, }, - [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3, test_core, }, - [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6, test_core, }, - [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7, test_core, }, -}; - -static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = { - NULL, -}; - -static struct attribute_group core_events_attr_group = { - .name = "events", - .attrs = core_events_attrs, -}; - -DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63"); -static struct attribute *core_format_attrs[] = { - &format_attr_core_event.attr, - NULL, -}; - -static struct attribute_group core_format_attr_group = { - .name = "format", - .attrs = core_format_attrs, -}; - -static cpumask_t cstate_core_cpu_mask; -static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL); - -static struct attribute *cstate_cpumask_attrs[] = { - &dev_attr_cpumask.attr, - NULL, -}; - -static struct attribute_group cpumask_attr_group = { - .attrs = cstate_cpumask_attrs, -}; - -static const struct attribute_group *core_attr_groups[] = { - &core_events_attr_group, - &core_format_attr_group, - &cpumask_attr_group, - NULL, -}; - -/* cstate_core PMU end */ - - -/* cstate_pkg PMU */ - -static struct pmu cstate_pkg_pmu; -static bool has_cstate_pkg; - -enum perf_cstate_pkg_id { - /* - * cstate_pkg events - */ - PERF_CSTATE_PKG_C2_RES = 0, - PERF_CSTATE_PKG_C3_RES, - PERF_CSTATE_PKG_C6_RES, - PERF_CSTATE_PKG_C7_RES, - PERF_CSTATE_PKG_C8_RES, - PERF_CSTATE_PKG_C9_RES, - PERF_CSTATE_PKG_C10_RES, - - PERF_CSTATE_PKG_EVENT_MAX, -}; - -bool test_pkg(int idx) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 != 6) - return false; - - switch (boot_cpu_data.x86_model) { - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ - - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ - if (idx == PERF_CSTATE_CORE_C3_RES || - idx == PERF_CSTATE_CORE_C6_RES || - idx == PERF_CSTATE_CORE_C7_RES) - return true; - break; - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm SandyBridge-E/EN/EP */ - - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ - - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ - - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 79: /* 14nm Broadwell Server */ - - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ - if (idx == PERF_CSTATE_PKG_C2_RES || - idx == PERF_CSTATE_PKG_C3_RES || - idx == PERF_CSTATE_PKG_C6_RES || - idx == PERF_CSTATE_PKG_C7_RES) - return true; - break; - case 55: /* 22nm Atom "Silvermont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ - case 76: /* 14nm Atom "Airmont" */ - if (idx == PERF_CSTATE_CORE_C6_RES) - return true; - break; - case 69: /* 22nm Haswell ULT */ - if (idx == PERF_CSTATE_PKG_C2_RES || - idx == PERF_CSTATE_PKG_C3_RES || - idx == PERF_CSTATE_PKG_C6_RES || - idx == PERF_CSTATE_PKG_C7_RES || - idx == PERF_CSTATE_PKG_C8_RES || - idx == PERF_CSTATE_PKG_C9_RES || - idx == PERF_CSTATE_PKG_C10_RES) - return true; - break; - } - - return false; -} - -PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00"); -PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01"); -PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02"); -PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_pkg_c7, "event=0x03"); -PMU_EVENT_ATTR_STRING(c8-residency, evattr_cstate_pkg_c8, "event=0x04"); -PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05"); -PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06"); - -static struct perf_cstate_msr pkg_msr[] = { - [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2, test_pkg, }, - [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3, test_pkg, }, - [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &evattr_cstate_pkg_c6, test_pkg, }, - [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7, test_pkg, }, - [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8, test_pkg, }, - [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9, test_pkg, }, - [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10, test_pkg, }, -}; - -static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = { - NULL, -}; - -static struct attribute_group pkg_events_attr_group = { - .name = "events", - .attrs = pkg_events_attrs, -}; - -DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63"); -static struct attribute *pkg_format_attrs[] = { - &format_attr_pkg_event.attr, - NULL, -}; -static struct attribute_group pkg_format_attr_group = { - .name = "format", - .attrs = pkg_format_attrs, -}; - -static cpumask_t cstate_pkg_cpu_mask; - -static const struct attribute_group *pkg_attr_groups[] = { - &pkg_events_attr_group, - &pkg_format_attr_group, - &cpumask_attr_group, - NULL, -}; - -/* cstate_pkg PMU end*/ - -static ssize_t cstate_get_attr_cpumask(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct pmu *pmu = dev_get_drvdata(dev); - - if (pmu == &cstate_core_pmu) - return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask); - else if (pmu == &cstate_pkg_pmu) - return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask); - else - return 0; -} - -static int cstate_pmu_event_init(struct perf_event *event) -{ - u64 cfg = event->attr.config; - int ret = 0; - - if (event->attr.type != event->pmu->type) - return -ENOENT; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - if (event->pmu == &cstate_core_pmu) { - if (cfg >= PERF_CSTATE_CORE_EVENT_MAX) - return -EINVAL; - if (!core_msr[cfg].attr) - return -EINVAL; - event->hw.event_base = core_msr[cfg].msr; - } else if (event->pmu == &cstate_pkg_pmu) { - if (cfg >= PERF_CSTATE_PKG_EVENT_MAX) - return -EINVAL; - if (!pkg_msr[cfg].attr) - return -EINVAL; - event->hw.event_base = pkg_msr[cfg].msr; - } else - return -ENOENT; - - /* must be done before validate_group */ - event->hw.config = cfg; - event->hw.idx = -1; - - return ret; -} - -static inline u64 cstate_pmu_read_counter(struct perf_event *event) -{ - u64 val; - - rdmsrl(event->hw.event_base, val); - return val; -} - -static void cstate_pmu_event_update(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 prev_raw_count, new_raw_count; - -again: - prev_raw_count = local64_read(&hwc->prev_count); - new_raw_count = cstate_pmu_read_counter(event); - - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) - goto again; - - local64_add(new_raw_count - prev_raw_count, &event->count); -} - -static void cstate_pmu_event_start(struct perf_event *event, int mode) -{ - local64_set(&event->hw.prev_count, cstate_pmu_read_counter(event)); -} - -static void cstate_pmu_event_stop(struct perf_event *event, int mode) -{ - cstate_pmu_event_update(event); -} - -static void cstate_pmu_event_del(struct perf_event *event, int mode) -{ - cstate_pmu_event_stop(event, PERF_EF_UPDATE); -} - -static int cstate_pmu_event_add(struct perf_event *event, int mode) -{ - if (mode & PERF_EF_START) - cstate_pmu_event_start(event, mode); - - return 0; -} - -static void cstate_cpu_exit(int cpu) -{ - int i, id, target; - - /* cpu exit for cstate core */ - if (has_cstate_core) { - id = topology_core_id(cpu); - target = -1; - - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (id == topology_core_id(i)) { - target = i; - break; - } - } - if (cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask) && target >= 0) - cpumask_set_cpu(target, &cstate_core_cpu_mask); - WARN_ON(cpumask_empty(&cstate_core_cpu_mask)); - if (target >= 0) - perf_pmu_migrate_context(&cstate_core_pmu, cpu, target); - } - - /* cpu exit for cstate pkg */ - if (has_cstate_pkg) { - id = topology_physical_package_id(cpu); - target = -1; - - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (id == topology_physical_package_id(i)) { - target = i; - break; - } - } - if (cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask) && target >= 0) - cpumask_set_cpu(target, &cstate_pkg_cpu_mask); - WARN_ON(cpumask_empty(&cstate_pkg_cpu_mask)); - if (target >= 0) - perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target); - } -} - -static void cstate_cpu_init(int cpu) -{ - int i, id; - - /* cpu init for cstate core */ - if (has_cstate_core) { - id = topology_core_id(cpu); - for_each_cpu(i, &cstate_core_cpu_mask) { - if (id == topology_core_id(i)) - break; - } - if (i >= nr_cpu_ids) - cpumask_set_cpu(cpu, &cstate_core_cpu_mask); - } - - /* cpu init for cstate pkg */ - if (has_cstate_pkg) { - id = topology_physical_package_id(cpu); - for_each_cpu(i, &cstate_pkg_cpu_mask) { - if (id == topology_physical_package_id(i)) - break; - } - if (i >= nr_cpu_ids) - cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); - } -} - -static int cstate_cpu_notifier(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - break; - case CPU_STARTING: - cstate_cpu_init(cpu); - break; - case CPU_UP_CANCELED: - case CPU_DYING: - break; - case CPU_ONLINE: - case CPU_DEAD: - break; - case CPU_DOWN_PREPARE: - cstate_cpu_exit(cpu); - break; - default: - break; - } - - return NOTIFY_OK; -} - -/* - * Probe the cstate events and insert the available one into sysfs attrs - * Return false if there is no available events. - */ -static bool cstate_probe_msr(struct perf_cstate_msr *msr, - struct attribute **events_attrs, - int max_event_nr) -{ - int i, j = 0; - u64 val; - - /* Probe the cstate events. */ - for (i = 0; i < max_event_nr; i++) { - if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) - msr[i].attr = NULL; - } - - /* List remaining events in the sysfs attrs. */ - for (i = 0; i < max_event_nr; i++) { - if (msr[i].attr) - events_attrs[j++] = &msr[i].attr->attr.attr; - } - events_attrs[j] = NULL; - - return (j > 0) ? true : false; -} - -static int __init cstate_init(void) -{ - /* SLM has different MSR for PKG C6 */ - switch (boot_cpu_data.x86_model) { - case 55: - case 76: - case 77: - pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY; - } - - if (cstate_probe_msr(core_msr, core_events_attrs, PERF_CSTATE_CORE_EVENT_MAX)) - has_cstate_core = true; - - if (cstate_probe_msr(pkg_msr, pkg_events_attrs, PERF_CSTATE_PKG_EVENT_MAX)) - has_cstate_pkg = true; - - return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV; -} - -static void __init cstate_cpumask_init(void) -{ - int cpu; - - cpu_notifier_register_begin(); - - for_each_online_cpu(cpu) - cstate_cpu_init(cpu); - - __perf_cpu_notifier(cstate_cpu_notifier); - - cpu_notifier_register_done(); -} - -static struct pmu cstate_core_pmu = { - .attr_groups = core_attr_groups, - .name = "cstate_core", - .task_ctx_nr = perf_invalid_context, - .event_init = cstate_pmu_event_init, - .add = cstate_pmu_event_add, /* must have */ - .del = cstate_pmu_event_del, /* must have */ - .start = cstate_pmu_event_start, - .stop = cstate_pmu_event_stop, - .read = cstate_pmu_event_update, - .capabilities = PERF_PMU_CAP_NO_INTERRUPT, -}; - -static struct pmu cstate_pkg_pmu = { - .attr_groups = pkg_attr_groups, - .name = "cstate_pkg", - .task_ctx_nr = perf_invalid_context, - .event_init = cstate_pmu_event_init, - .add = cstate_pmu_event_add, /* must have */ - .del = cstate_pmu_event_del, /* must have */ - .start = cstate_pmu_event_start, - .stop = cstate_pmu_event_stop, - .read = cstate_pmu_event_update, - .capabilities = PERF_PMU_CAP_NO_INTERRUPT, -}; - -static void __init cstate_pmus_register(void) -{ - int err; - - if (has_cstate_core) { - err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1); - if (WARN_ON(err)) - pr_info("Failed to register PMU %s error %d\n", - cstate_core_pmu.name, err); - } - - if (has_cstate_pkg) { - err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1); - if (WARN_ON(err)) - pr_info("Failed to register PMU %s error %d\n", - cstate_pkg_pmu.name, err); - } -} - -static int __init cstate_pmu_init(void) -{ - int err; - - if (cpu_has_hypervisor) - return -ENODEV; - - err = cstate_init(); - if (err) - return err; - - cstate_cpumask_init(); - - cstate_pmus_register(); - - return 0; -} - -device_initcall(cstate_pmu_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c deleted file mode 100644 index 10602f0a438f..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ /dev/null @@ -1,1368 +0,0 @@ -#include <linux/bitops.h> -#include <linux/types.h> -#include <linux/slab.h> - -#include <asm/perf_event.h> -#include <asm/insn.h> - -#include "perf_event.h" - -/* The size of a BTS record in bytes: */ -#define BTS_RECORD_SIZE 24 - -#define BTS_BUFFER_SIZE (PAGE_SIZE << 4) -#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) -#define PEBS_FIXUP_SIZE PAGE_SIZE - -/* - * pebs_record_32 for p4 and core not supported - -struct pebs_record_32 { - u32 flags, ip; - u32 ax, bc, cx, dx; - u32 si, di, bp, sp; -}; - - */ - -union intel_x86_pebs_dse { - u64 val; - struct { - unsigned int ld_dse:4; - unsigned int ld_stlb_miss:1; - unsigned int ld_locked:1; - unsigned int ld_reserved:26; - }; - struct { - unsigned int st_l1d_hit:1; - unsigned int st_reserved1:3; - unsigned int st_stlb_miss:1; - unsigned int st_locked:1; - unsigned int st_reserved2:26; - }; -}; - - -/* - * Map PEBS Load Latency Data Source encodings to generic - * memory data source information - */ -#define P(a, b) PERF_MEM_S(a, b) -#define OP_LH (P(OP, LOAD) | P(LVL, HIT)) -#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS)) - -static const u64 pebs_data_source[] = { - P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ - OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */ - OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ - OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ - OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ - OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ - OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ - OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ - OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ - OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ - OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ - OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */ - OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */ - OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */ - OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */ - OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */ -}; - -static u64 precise_store_data(u64 status) -{ - union intel_x86_pebs_dse dse; - u64 val = P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(TLB, L2); - - dse.val = status; - - /* - * bit 4: TLB access - * 1 = stored missed 2nd level TLB - * - * so it either hit the walker or the OS - * otherwise hit 2nd level TLB - */ - if (dse.st_stlb_miss) - val |= P(TLB, MISS); - else - val |= P(TLB, HIT); - - /* - * bit 0: hit L1 data cache - * if not set, then all we know is that - * it missed L1D - */ - if (dse.st_l1d_hit) - val |= P(LVL, HIT); - else - val |= P(LVL, MISS); - - /* - * bit 5: Locked prefix - */ - if (dse.st_locked) - val |= P(LOCK, LOCKED); - - return val; -} - -static u64 precise_datala_hsw(struct perf_event *event, u64 status) -{ - union perf_mem_data_src dse; - - dse.val = PERF_MEM_NA; - - if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) - dse.mem_op = PERF_MEM_OP_STORE; - else if (event->hw.flags & PERF_X86_EVENT_PEBS_LD_HSW) - dse.mem_op = PERF_MEM_OP_LOAD; - - /* - * L1 info only valid for following events: - * - * MEM_UOPS_RETIRED.STLB_MISS_STORES - * MEM_UOPS_RETIRED.LOCK_STORES - * MEM_UOPS_RETIRED.SPLIT_STORES - * MEM_UOPS_RETIRED.ALL_STORES - */ - if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) { - if (status & 1) - dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; - else - dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS; - } - return dse.val; -} - -static u64 load_latency_data(u64 status) -{ - union intel_x86_pebs_dse dse; - u64 val; - int model = boot_cpu_data.x86_model; - int fam = boot_cpu_data.x86; - - dse.val = status; - - /* - * use the mapping table for bit 0-3 - */ - val = pebs_data_source[dse.ld_dse]; - - /* - * Nehalem models do not support TLB, Lock infos - */ - if (fam == 0x6 && (model == 26 || model == 30 - || model == 31 || model == 46)) { - val |= P(TLB, NA) | P(LOCK, NA); - return val; - } - /* - * bit 4: TLB access - * 0 = did not miss 2nd level TLB - * 1 = missed 2nd level TLB - */ - if (dse.ld_stlb_miss) - val |= P(TLB, MISS) | P(TLB, L2); - else - val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); - - /* - * bit 5: locked prefix - */ - if (dse.ld_locked) - val |= P(LOCK, LOCKED); - - return val; -} - -struct pebs_record_core { - u64 flags, ip; - u64 ax, bx, cx, dx; - u64 si, di, bp, sp; - u64 r8, r9, r10, r11; - u64 r12, r13, r14, r15; -}; - -struct pebs_record_nhm { - u64 flags, ip; - u64 ax, bx, cx, dx; - u64 si, di, bp, sp; - u64 r8, r9, r10, r11; - u64 r12, r13, r14, r15; - u64 status, dla, dse, lat; -}; - -/* - * Same as pebs_record_nhm, with two additional fields. - */ -struct pebs_record_hsw { - u64 flags, ip; - u64 ax, bx, cx, dx; - u64 si, di, bp, sp; - u64 r8, r9, r10, r11; - u64 r12, r13, r14, r15; - u64 status, dla, dse, lat; - u64 real_ip, tsx_tuning; -}; - -union hsw_tsx_tuning { - struct { - u32 cycles_last_block : 32, - hle_abort : 1, - rtm_abort : 1, - instruction_abort : 1, - non_instruction_abort : 1, - retry : 1, - data_conflict : 1, - capacity_writes : 1, - capacity_reads : 1; - }; - u64 value; -}; - -#define PEBS_HSW_TSX_FLAGS 0xff00000000ULL - -/* Same as HSW, plus TSC */ - -struct pebs_record_skl { - u64 flags, ip; - u64 ax, bx, cx, dx; - u64 si, di, bp, sp; - u64 r8, r9, r10, r11; - u64 r12, r13, r14, r15; - u64 status, dla, dse, lat; - u64 real_ip, tsx_tuning; - u64 tsc; -}; - -void init_debug_store_on_cpu(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds) - return; - - wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, - (u32)((u64)(unsigned long)ds), - (u32)((u64)(unsigned long)ds >> 32)); -} - -void fini_debug_store_on_cpu(int cpu) -{ - if (!per_cpu(cpu_hw_events, cpu).ds) - return; - - wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); -} - -static DEFINE_PER_CPU(void *, insn_buffer); - -static int alloc_pebs_buffer(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - int node = cpu_to_node(cpu); - int max; - void *buffer, *ibuffer; - - if (!x86_pmu.pebs) - return 0; - - buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node); - if (unlikely(!buffer)) - return -ENOMEM; - - /* - * HSW+ already provides us the eventing ip; no need to allocate this - * buffer then. - */ - if (x86_pmu.intel_cap.pebs_format < 2) { - ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); - if (!ibuffer) { - kfree(buffer); - return -ENOMEM; - } - per_cpu(insn_buffer, cpu) = ibuffer; - } - - max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; - - ds->pebs_buffer_base = (u64)(unsigned long)buffer; - ds->pebs_index = ds->pebs_buffer_base; - ds->pebs_absolute_maximum = ds->pebs_buffer_base + - max * x86_pmu.pebs_record_size; - - return 0; -} - -static void release_pebs_buffer(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds || !x86_pmu.pebs) - return; - - kfree(per_cpu(insn_buffer, cpu)); - per_cpu(insn_buffer, cpu) = NULL; - - kfree((void *)(unsigned long)ds->pebs_buffer_base); - ds->pebs_buffer_base = 0; -} - -static int alloc_bts_buffer(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - int node = cpu_to_node(cpu); - int max, thresh; - void *buffer; - - if (!x86_pmu.bts) - return 0; - - buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); - if (unlikely(!buffer)) { - WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); - return -ENOMEM; - } - - max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; - thresh = max / 16; - - ds->bts_buffer_base = (u64)(unsigned long)buffer; - ds->bts_index = ds->bts_buffer_base; - ds->bts_absolute_maximum = ds->bts_buffer_base + - max * BTS_RECORD_SIZE; - ds->bts_interrupt_threshold = ds->bts_absolute_maximum - - thresh * BTS_RECORD_SIZE; - - return 0; -} - -static void release_bts_buffer(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds || !x86_pmu.bts) - return; - - kfree((void *)(unsigned long)ds->bts_buffer_base); - ds->bts_buffer_base = 0; -} - -static int alloc_ds_buffer(int cpu) -{ - int node = cpu_to_node(cpu); - struct debug_store *ds; - - ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); - if (unlikely(!ds)) - return -ENOMEM; - - per_cpu(cpu_hw_events, cpu).ds = ds; - - return 0; -} - -static void release_ds_buffer(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds) - return; - - per_cpu(cpu_hw_events, cpu).ds = NULL; - kfree(ds); -} - -void release_ds_buffers(void) -{ - int cpu; - - if (!x86_pmu.bts && !x86_pmu.pebs) - return; - - get_online_cpus(); - for_each_online_cpu(cpu) - fini_debug_store_on_cpu(cpu); - - for_each_possible_cpu(cpu) { - release_pebs_buffer(cpu); - release_bts_buffer(cpu); - release_ds_buffer(cpu); - } - put_online_cpus(); -} - -void reserve_ds_buffers(void) -{ - int bts_err = 0, pebs_err = 0; - int cpu; - - x86_pmu.bts_active = 0; - x86_pmu.pebs_active = 0; - - if (!x86_pmu.bts && !x86_pmu.pebs) - return; - - if (!x86_pmu.bts) - bts_err = 1; - - if (!x86_pmu.pebs) - pebs_err = 1; - - get_online_cpus(); - - for_each_possible_cpu(cpu) { - if (alloc_ds_buffer(cpu)) { - bts_err = 1; - pebs_err = 1; - } - - if (!bts_err && alloc_bts_buffer(cpu)) - bts_err = 1; - - if (!pebs_err && alloc_pebs_buffer(cpu)) - pebs_err = 1; - - if (bts_err && pebs_err) - break; - } - - if (bts_err) { - for_each_possible_cpu(cpu) - release_bts_buffer(cpu); - } - - if (pebs_err) { - for_each_possible_cpu(cpu) - release_pebs_buffer(cpu); - } - - if (bts_err && pebs_err) { - for_each_possible_cpu(cpu) - release_ds_buffer(cpu); - } else { - if (x86_pmu.bts && !bts_err) - x86_pmu.bts_active = 1; - - if (x86_pmu.pebs && !pebs_err) - x86_pmu.pebs_active = 1; - - for_each_online_cpu(cpu) - init_debug_store_on_cpu(cpu); - } - - put_online_cpus(); -} - -/* - * BTS - */ - -struct event_constraint bts_constraint = - EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0); - -void intel_pmu_enable_bts(u64 config) -{ - unsigned long debugctlmsr; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr |= DEBUGCTLMSR_TR; - debugctlmsr |= DEBUGCTLMSR_BTS; - if (config & ARCH_PERFMON_EVENTSEL_INT) - debugctlmsr |= DEBUGCTLMSR_BTINT; - - if (!(config & ARCH_PERFMON_EVENTSEL_OS)) - debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS; - - if (!(config & ARCH_PERFMON_EVENTSEL_USR)) - debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR; - - update_debugctlmsr(debugctlmsr); -} - -void intel_pmu_disable_bts(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - unsigned long debugctlmsr; - - if (!cpuc->ds) - return; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr &= - ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT | - DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR); - - update_debugctlmsr(debugctlmsr); -} - -int intel_pmu_drain_bts_buffer(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct debug_store *ds = cpuc->ds; - struct bts_record { - u64 from; - u64 to; - u64 flags; - }; - struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; - struct bts_record *at, *base, *top; - struct perf_output_handle handle; - struct perf_event_header header; - struct perf_sample_data data; - unsigned long skip = 0; - struct pt_regs regs; - - if (!event) - return 0; - - if (!x86_pmu.bts_active) - return 0; - - base = (struct bts_record *)(unsigned long)ds->bts_buffer_base; - top = (struct bts_record *)(unsigned long)ds->bts_index; - - if (top <= base) - return 0; - - memset(®s, 0, sizeof(regs)); - - ds->bts_index = ds->bts_buffer_base; - - perf_sample_data_init(&data, 0, event->hw.last_period); - - /* - * BTS leaks kernel addresses in branches across the cpl boundary, - * such as traps or system calls, so unless the user is asking for - * kernel tracing (and right now it's not possible), we'd need to - * filter them out. But first we need to count how many of those we - * have in the current batch. This is an extra O(n) pass, however, - * it's much faster than the other one especially considering that - * n <= 2560 (BTS_BUFFER_SIZE / BTS_RECORD_SIZE * 15/16; see the - * alloc_bts_buffer()). - */ - for (at = base; at < top; at++) { - /* - * Note that right now *this* BTS code only works if - * attr::exclude_kernel is set, but let's keep this extra - * check here in case that changes. - */ - if (event->attr.exclude_kernel && - (kernel_ip(at->from) || kernel_ip(at->to))) - skip++; - } - - /* - * Prepare a generic sample, i.e. fill in the invariant fields. - * We will overwrite the from and to address before we output - * the sample. - */ - perf_prepare_sample(&header, &data, event, ®s); - - if (perf_output_begin(&handle, event, header.size * - (top - base - skip))) - return 1; - - for (at = base; at < top; at++) { - /* Filter out any records that contain kernel addresses. */ - if (event->attr.exclude_kernel && - (kernel_ip(at->from) || kernel_ip(at->to))) - continue; - - data.ip = at->from; - data.addr = at->to; - - perf_output_sample(&handle, &header, &data, event); - } - - perf_output_end(&handle); - - /* There's new data available. */ - event->hw.interrupts++; - event->pending_kill = POLL_IN; - return 1; -} - -static inline void intel_pmu_drain_pebs_buffer(void) -{ - struct pt_regs regs; - - x86_pmu.drain_pebs(®s); -} - -void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in) -{ - if (!sched_in) - intel_pmu_drain_pebs_buffer(); -} - -/* - * PEBS - */ -struct event_constraint intel_core2_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ - /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_atom_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ - /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), - /* Allow all events as PEBS with no flags */ - INTEL_ALL_EVENT_CONSTRAINT(0, 0x1), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_slm_pebs_event_constraints[] = { - /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x1), - /* Allow all events as PEBS with no flags */ - INTEL_ALL_EVENT_CONSTRAINT(0, 0x1), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_nehalem_pebs_event_constraints[] = { - INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */ - INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ - /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_westmere_pebs_event_constraints[] = { - INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ - /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_snb_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ - INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ - INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ - /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), - INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ - /* Allow all events as PEBS with no flags */ - INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_ivb_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ - INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ - INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ - /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), - /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), - INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ - /* Allow all events as PEBS with no flags */ - INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_hsw_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ - INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */ - /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), - /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */ - /* Allow all events as PEBS with no flags */ - INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_skl_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ - /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), - /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), - INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_L3_MISS_RETIRED.* */ - /* Allow all events as PEBS with no flags */ - INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), - EVENT_CONSTRAINT_END -}; - -struct event_constraint *intel_pebs_constraints(struct perf_event *event) -{ - struct event_constraint *c; - - if (!event->attr.precise_ip) - return NULL; - - if (x86_pmu.pebs_constraints) { - for_each_event_constraint(c, x86_pmu.pebs_constraints) { - if ((event->hw.config & c->cmask) == c->code) { - event->hw.flags |= c->flags; - return c; - } - } - } - - return &emptyconstraint; -} - -static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc) -{ - return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1)); -} - -void intel_pmu_pebs_enable(struct perf_event *event) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - struct debug_store *ds = cpuc->ds; - bool first_pebs; - u64 threshold; - - hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; - - first_pebs = !pebs_is_enabled(cpuc); - cpuc->pebs_enabled |= 1ULL << hwc->idx; - - if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) - cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32); - else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) - cpuc->pebs_enabled |= 1ULL << 63; - - /* - * When the event is constrained enough we can use a larger - * threshold and run the event with less frequent PMI. - */ - if (hwc->flags & PERF_X86_EVENT_FREERUNNING) { - threshold = ds->pebs_absolute_maximum - - x86_pmu.max_pebs_events * x86_pmu.pebs_record_size; - - if (first_pebs) - perf_sched_cb_inc(event->ctx->pmu); - } else { - threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size; - - /* - * If not all events can use larger buffer, - * roll back to threshold = 1 - */ - if (!first_pebs && - (ds->pebs_interrupt_threshold > threshold)) - perf_sched_cb_dec(event->ctx->pmu); - } - - /* Use auto-reload if possible to save a MSR write in the PMI */ - if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { - ds->pebs_event_reset[hwc->idx] = - (u64)(-hwc->sample_period) & x86_pmu.cntval_mask; - } - - if (first_pebs || ds->pebs_interrupt_threshold > threshold) - ds->pebs_interrupt_threshold = threshold; -} - -void intel_pmu_pebs_disable(struct perf_event *event) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - struct debug_store *ds = cpuc->ds; - bool large_pebs = ds->pebs_interrupt_threshold > - ds->pebs_buffer_base + x86_pmu.pebs_record_size; - - if (large_pebs) - intel_pmu_drain_pebs_buffer(); - - cpuc->pebs_enabled &= ~(1ULL << hwc->idx); - - if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) - cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32)); - else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) - cpuc->pebs_enabled &= ~(1ULL << 63); - - if (large_pebs && !pebs_is_enabled(cpuc)) - perf_sched_cb_dec(event->ctx->pmu); - - if (cpuc->enabled) - wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); - - hwc->config |= ARCH_PERFMON_EVENTSEL_INT; -} - -void intel_pmu_pebs_enable_all(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (cpuc->pebs_enabled) - wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); -} - -void intel_pmu_pebs_disable_all(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (cpuc->pebs_enabled) - wrmsrl(MSR_IA32_PEBS_ENABLE, 0); -} - -static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - unsigned long from = cpuc->lbr_entries[0].from; - unsigned long old_to, to = cpuc->lbr_entries[0].to; - unsigned long ip = regs->ip; - int is_64bit = 0; - void *kaddr; - int size; - - /* - * We don't need to fixup if the PEBS assist is fault like - */ - if (!x86_pmu.intel_cap.pebs_trap) - return 1; - - /* - * No LBR entry, no basic block, no rewinding - */ - if (!cpuc->lbr_stack.nr || !from || !to) - return 0; - - /* - * Basic blocks should never cross user/kernel boundaries - */ - if (kernel_ip(ip) != kernel_ip(to)) - return 0; - - /* - * unsigned math, either ip is before the start (impossible) or - * the basic block is larger than 1 page (sanity) - */ - if ((ip - to) > PEBS_FIXUP_SIZE) - return 0; - - /* - * We sampled a branch insn, rewind using the LBR stack - */ - if (ip == to) { - set_linear_ip(regs, from); - return 1; - } - - size = ip - to; - if (!kernel_ip(ip)) { - int bytes; - u8 *buf = this_cpu_read(insn_buffer); - - /* 'size' must fit our buffer, see above */ - bytes = copy_from_user_nmi(buf, (void __user *)to, size); - if (bytes != 0) - return 0; - - kaddr = buf; - } else { - kaddr = (void *)to; - } - - do { - struct insn insn; - - old_to = to; - -#ifdef CONFIG_X86_64 - is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32); -#endif - insn_init(&insn, kaddr, size, is_64bit); - insn_get_length(&insn); - /* - * Make sure there was not a problem decoding the - * instruction and getting the length. This is - * doubly important because we have an infinite - * loop if insn.length=0. - */ - if (!insn.length) - break; - - to += insn.length; - kaddr += insn.length; - size -= insn.length; - } while (to < ip); - - if (to == ip) { - set_linear_ip(regs, old_to); - return 1; - } - - /* - * Even though we decoded the basic block, the instruction stream - * never matched the given IP, either the TO or the IP got corrupted. - */ - return 0; -} - -static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs) -{ - if (pebs->tsx_tuning) { - union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning }; - return tsx.cycles_last_block; - } - return 0; -} - -static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs) -{ - u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32; - - /* For RTM XABORTs also log the abort code from AX */ - if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1)) - txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT; - return txn; -} - -static void setup_pebs_sample_data(struct perf_event *event, - struct pt_regs *iregs, void *__pebs, - struct perf_sample_data *data, - struct pt_regs *regs) -{ -#define PERF_X86_EVENT_PEBS_HSW_PREC \ - (PERF_X86_EVENT_PEBS_ST_HSW | \ - PERF_X86_EVENT_PEBS_LD_HSW | \ - PERF_X86_EVENT_PEBS_NA_HSW) - /* - * We cast to the biggest pebs_record but are careful not to - * unconditionally access the 'extra' entries. - */ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct pebs_record_skl *pebs = __pebs; - u64 sample_type; - int fll, fst, dsrc; - int fl = event->hw.flags; - - if (pebs == NULL) - return; - - sample_type = event->attr.sample_type; - dsrc = sample_type & PERF_SAMPLE_DATA_SRC; - - fll = fl & PERF_X86_EVENT_PEBS_LDLAT; - fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC); - - perf_sample_data_init(data, 0, event->hw.last_period); - - data->period = event->hw.last_period; - - /* - * Use latency for weight (only avail with PEBS-LL) - */ - if (fll && (sample_type & PERF_SAMPLE_WEIGHT)) - data->weight = pebs->lat; - - /* - * data.data_src encodes the data source - */ - if (dsrc) { - u64 val = PERF_MEM_NA; - if (fll) - val = load_latency_data(pebs->dse); - else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC)) - val = precise_datala_hsw(event, pebs->dse); - else if (fst) - val = precise_store_data(pebs->dse); - data->data_src.val = val; - } - - /* - * We use the interrupt regs as a base because the PEBS record - * does not contain a full regs set, specifically it seems to - * lack segment descriptors, which get used by things like - * user_mode(). - * - * In the simple case fix up only the IP and BP,SP regs, for - * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly. - * A possible PERF_SAMPLE_REGS will have to transfer all regs. - */ - *regs = *iregs; - regs->flags = pebs->flags; - set_linear_ip(regs, pebs->ip); - regs->bp = pebs->bp; - regs->sp = pebs->sp; - - if (sample_type & PERF_SAMPLE_REGS_INTR) { - regs->ax = pebs->ax; - regs->bx = pebs->bx; - regs->cx = pebs->cx; - regs->dx = pebs->dx; - regs->si = pebs->si; - regs->di = pebs->di; - regs->bp = pebs->bp; - regs->sp = pebs->sp; - - regs->flags = pebs->flags; -#ifndef CONFIG_X86_32 - regs->r8 = pebs->r8; - regs->r9 = pebs->r9; - regs->r10 = pebs->r10; - regs->r11 = pebs->r11; - regs->r12 = pebs->r12; - regs->r13 = pebs->r13; - regs->r14 = pebs->r14; - regs->r15 = pebs->r15; -#endif - } - - if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) { - regs->ip = pebs->real_ip; - regs->flags |= PERF_EFLAGS_EXACT; - } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(regs)) - regs->flags |= PERF_EFLAGS_EXACT; - else - regs->flags &= ~PERF_EFLAGS_EXACT; - - if ((sample_type & PERF_SAMPLE_ADDR) && - x86_pmu.intel_cap.pebs_format >= 1) - data->addr = pebs->dla; - - if (x86_pmu.intel_cap.pebs_format >= 2) { - /* Only set the TSX weight when no memory weight. */ - if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll) - data->weight = intel_hsw_weight(pebs); - - if (sample_type & PERF_SAMPLE_TRANSACTION) - data->txn = intel_hsw_transaction(pebs); - } - - /* - * v3 supplies an accurate time stamp, so we use that - * for the time stamp. - * - * We can only do this for the default trace clock. - */ - if (x86_pmu.intel_cap.pebs_format >= 3 && - event->attr.use_clockid == 0) - data->time = native_sched_clock_from_tsc(pebs->tsc); - - if (has_branch_stack(event)) - data->br_stack = &cpuc->lbr_stack; -} - -static inline void * -get_next_pebs_record_by_bit(void *base, void *top, int bit) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - void *at; - u64 pebs_status; - - /* - * fmt0 does not have a status bitfield (does not use - * perf_record_nhm format) - */ - if (x86_pmu.intel_cap.pebs_format < 1) - return base; - - if (base == NULL) - return NULL; - - for (at = base; at < top; at += x86_pmu.pebs_record_size) { - struct pebs_record_nhm *p = at; - - if (test_bit(bit, (unsigned long *)&p->status)) { - /* PEBS v3 has accurate status bits */ - if (x86_pmu.intel_cap.pebs_format >= 3) - return at; - - if (p->status == (1 << bit)) - return at; - - /* clear non-PEBS bit and re-check */ - pebs_status = p->status & cpuc->pebs_enabled; - pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1; - if (pebs_status == (1 << bit)) - return at; - } - } - return NULL; -} - -static void __intel_pmu_pebs_event(struct perf_event *event, - struct pt_regs *iregs, - void *base, void *top, - int bit, int count) -{ - struct perf_sample_data data; - struct pt_regs regs; - void *at = get_next_pebs_record_by_bit(base, top, bit); - - if (!intel_pmu_save_and_restart(event) && - !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)) - return; - - while (count > 1) { - setup_pebs_sample_data(event, iregs, at, &data, ®s); - perf_event_output(event, &data, ®s); - at += x86_pmu.pebs_record_size; - at = get_next_pebs_record_by_bit(at, top, bit); - count--; - } - - setup_pebs_sample_data(event, iregs, at, &data, ®s); - - /* - * All but the last records are processed. - * The last one is left to be able to call the overflow handler. - */ - if (perf_event_overflow(event, &data, ®s)) { - x86_pmu_stop(event, 0); - return; - } - -} - -static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct debug_store *ds = cpuc->ds; - struct perf_event *event = cpuc->events[0]; /* PMC0 only */ - struct pebs_record_core *at, *top; - int n; - - if (!x86_pmu.pebs_active) - return; - - at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base; - top = (struct pebs_record_core *)(unsigned long)ds->pebs_index; - - /* - * Whatever else happens, drain the thing - */ - ds->pebs_index = ds->pebs_buffer_base; - - if (!test_bit(0, cpuc->active_mask)) - return; - - WARN_ON_ONCE(!event); - - if (!event->attr.precise_ip) - return; - - n = top - at; - if (n <= 0) - return; - - __intel_pmu_pebs_event(event, iregs, at, top, 0, n); -} - -static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct debug_store *ds = cpuc->ds; - struct perf_event *event; - void *base, *at, *top; - short counts[MAX_PEBS_EVENTS] = {}; - short error[MAX_PEBS_EVENTS] = {}; - int bit, i; - - if (!x86_pmu.pebs_active) - return; - - base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; - top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index; - - ds->pebs_index = ds->pebs_buffer_base; - - if (unlikely(base >= top)) - return; - - for (at = base; at < top; at += x86_pmu.pebs_record_size) { - struct pebs_record_nhm *p = at; - u64 pebs_status; - - /* PEBS v3 has accurate status bits */ - if (x86_pmu.intel_cap.pebs_format >= 3) { - for_each_set_bit(bit, (unsigned long *)&p->status, - MAX_PEBS_EVENTS) - counts[bit]++; - - continue; - } - - pebs_status = p->status & cpuc->pebs_enabled; - pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1; - - /* - * On some CPUs the PEBS status can be zero when PEBS is - * racing with clearing of GLOBAL_STATUS. - * - * Normally we would drop that record, but in the - * case when there is only a single active PEBS event - * we can assume it's for that event. - */ - if (!pebs_status && cpuc->pebs_enabled && - !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1))) - pebs_status = cpuc->pebs_enabled; - - bit = find_first_bit((unsigned long *)&pebs_status, - x86_pmu.max_pebs_events); - if (bit >= x86_pmu.max_pebs_events) - continue; - - /* - * The PEBS hardware does not deal well with the situation - * when events happen near to each other and multiple bits - * are set. But it should happen rarely. - * - * If these events include one PEBS and multiple non-PEBS - * events, it doesn't impact PEBS record. The record will - * be handled normally. (slow path) - * - * If these events include two or more PEBS events, the - * records for the events can be collapsed into a single - * one, and it's not possible to reconstruct all events - * that caused the PEBS record. It's called collision. - * If collision happened, the record will be dropped. - */ - if (p->status != (1ULL << bit)) { - for_each_set_bit(i, (unsigned long *)&pebs_status, - x86_pmu.max_pebs_events) - error[i]++; - continue; - } - - counts[bit]++; - } - - for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) { - if ((counts[bit] == 0) && (error[bit] == 0)) - continue; - - event = cpuc->events[bit]; - WARN_ON_ONCE(!event); - WARN_ON_ONCE(!event->attr.precise_ip); - - /* log dropped samples number */ - if (error[bit]) - perf_log_lost_samples(event, error[bit]); - - if (counts[bit]) { - __intel_pmu_pebs_event(event, iregs, base, - top, bit, counts[bit]); - } - } -} - -/* - * BTS, PEBS probe and setup - */ - -void __init intel_ds_init(void) -{ - /* - * No support for 32bit formats - */ - if (!boot_cpu_has(X86_FEATURE_DTES64)) - return; - - x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); - x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); - if (x86_pmu.pebs) { - char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-'; - int format = x86_pmu.intel_cap.pebs_format; - - switch (format) { - case 0: - printk(KERN_CONT "PEBS fmt0%c, ", pebs_type); - x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); - x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; - break; - - case 1: - printk(KERN_CONT "PEBS fmt1%c, ", pebs_type); - x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); - x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; - break; - - case 2: - pr_cont("PEBS fmt2%c, ", pebs_type); - x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw); - x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; - break; - - case 3: - pr_cont("PEBS fmt3%c, ", pebs_type); - x86_pmu.pebs_record_size = - sizeof(struct pebs_record_skl); - x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; - x86_pmu.free_running_flags |= PERF_SAMPLE_TIME; - break; - - default: - printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); - x86_pmu.pebs = 0; - } - } -} - -void perf_restore_debug_store(void) -{ - struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); - - if (!x86_pmu.bts && !x86_pmu.pebs) - return; - - wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds); -} diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c deleted file mode 100644 index 653f88d25987..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ /dev/null @@ -1,1062 +0,0 @@ -#include <linux/perf_event.h> -#include <linux/types.h> - -#include <asm/perf_event.h> -#include <asm/msr.h> -#include <asm/insn.h> - -#include "perf_event.h" - -enum { - LBR_FORMAT_32 = 0x00, - LBR_FORMAT_LIP = 0x01, - LBR_FORMAT_EIP = 0x02, - LBR_FORMAT_EIP_FLAGS = 0x03, - LBR_FORMAT_EIP_FLAGS2 = 0x04, - LBR_FORMAT_INFO = 0x05, - LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO, -}; - -static enum { - LBR_EIP_FLAGS = 1, - LBR_TSX = 2, -} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = { - [LBR_FORMAT_EIP_FLAGS] = LBR_EIP_FLAGS, - [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX, -}; - -/* - * Intel LBR_SELECT bits - * Intel Vol3a, April 2011, Section 16.7 Table 16-10 - * - * Hardware branch filter (not available on all CPUs) - */ -#define LBR_KERNEL_BIT 0 /* do not capture at ring0 */ -#define LBR_USER_BIT 1 /* do not capture at ring > 0 */ -#define LBR_JCC_BIT 2 /* do not capture conditional branches */ -#define LBR_REL_CALL_BIT 3 /* do not capture relative calls */ -#define LBR_IND_CALL_BIT 4 /* do not capture indirect calls */ -#define LBR_RETURN_BIT 5 /* do not capture near returns */ -#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */ -#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */ -#define LBR_FAR_BIT 8 /* do not capture far branches */ -#define LBR_CALL_STACK_BIT 9 /* enable call stack */ - -/* - * Following bit only exists in Linux; we mask it out before writing it to - * the actual MSR. But it helps the constraint perf code to understand - * that this is a separate configuration. - */ -#define LBR_NO_INFO_BIT 63 /* don't read LBR_INFO. */ - -#define LBR_KERNEL (1 << LBR_KERNEL_BIT) -#define LBR_USER (1 << LBR_USER_BIT) -#define LBR_JCC (1 << LBR_JCC_BIT) -#define LBR_REL_CALL (1 << LBR_REL_CALL_BIT) -#define LBR_IND_CALL (1 << LBR_IND_CALL_BIT) -#define LBR_RETURN (1 << LBR_RETURN_BIT) -#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT) -#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT) -#define LBR_FAR (1 << LBR_FAR_BIT) -#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT) -#define LBR_NO_INFO (1ULL << LBR_NO_INFO_BIT) - -#define LBR_PLM (LBR_KERNEL | LBR_USER) - -#define LBR_SEL_MASK 0x1ff /* valid bits in LBR_SELECT */ -#define LBR_NOT_SUPP -1 /* LBR filter not supported */ -#define LBR_IGN 0 /* ignored */ - -#define LBR_ANY \ - (LBR_JCC |\ - LBR_REL_CALL |\ - LBR_IND_CALL |\ - LBR_RETURN |\ - LBR_REL_JMP |\ - LBR_IND_JMP |\ - LBR_FAR) - -#define LBR_FROM_FLAG_MISPRED (1ULL << 63) -#define LBR_FROM_FLAG_IN_TX (1ULL << 62) -#define LBR_FROM_FLAG_ABORT (1ULL << 61) - -/* - * x86control flow change classification - * x86control flow changes include branches, interrupts, traps, faults - */ -enum { - X86_BR_NONE = 0, /* unknown */ - - X86_BR_USER = 1 << 0, /* branch target is user */ - X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ - - X86_BR_CALL = 1 << 2, /* call */ - X86_BR_RET = 1 << 3, /* return */ - X86_BR_SYSCALL = 1 << 4, /* syscall */ - X86_BR_SYSRET = 1 << 5, /* syscall return */ - X86_BR_INT = 1 << 6, /* sw interrupt */ - X86_BR_IRET = 1 << 7, /* return from interrupt */ - X86_BR_JCC = 1 << 8, /* conditional */ - X86_BR_JMP = 1 << 9, /* jump */ - X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ - X86_BR_IND_CALL = 1 << 11,/* indirect calls */ - X86_BR_ABORT = 1 << 12,/* transaction abort */ - X86_BR_IN_TX = 1 << 13,/* in transaction */ - X86_BR_NO_TX = 1 << 14,/* not in transaction */ - X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ - X86_BR_CALL_STACK = 1 << 16,/* call stack */ - X86_BR_IND_JMP = 1 << 17,/* indirect jump */ -}; - -#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) -#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) - -#define X86_BR_ANY \ - (X86_BR_CALL |\ - X86_BR_RET |\ - X86_BR_SYSCALL |\ - X86_BR_SYSRET |\ - X86_BR_INT |\ - X86_BR_IRET |\ - X86_BR_JCC |\ - X86_BR_JMP |\ - X86_BR_IRQ |\ - X86_BR_ABORT |\ - X86_BR_IND_CALL |\ - X86_BR_IND_JMP |\ - X86_BR_ZERO_CALL) - -#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) - -#define X86_BR_ANY_CALL \ - (X86_BR_CALL |\ - X86_BR_IND_CALL |\ - X86_BR_ZERO_CALL |\ - X86_BR_SYSCALL |\ - X86_BR_IRQ |\ - X86_BR_INT) - -static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); - -/* - * We only support LBR implementations that have FREEZE_LBRS_ON_PMI - * otherwise it becomes near impossible to get a reliable stack. - */ - -static void __intel_pmu_lbr_enable(bool pmi) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - u64 debugctl, lbr_select = 0, orig_debugctl; - - /* - * No need to unfreeze manually, as v4 can do that as part - * of the GLOBAL_STATUS ack. - */ - if (pmi && x86_pmu.version >= 4) - return; - - /* - * No need to reprogram LBR_SELECT in a PMI, as it - * did not change. - */ - if (cpuc->lbr_sel) - lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask; - if (!pmi && cpuc->lbr_sel) - wrmsrl(MSR_LBR_SELECT, lbr_select); - - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); - orig_debugctl = debugctl; - debugctl |= DEBUGCTLMSR_LBR; - /* - * LBR callstack does not work well with FREEZE_LBRS_ON_PMI. - * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions - * may cause superfluous increase/decrease of LBR_TOS. - */ - if (!(lbr_select & LBR_CALL_STACK)) - debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; - if (orig_debugctl != debugctl) - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); -} - -static void __intel_pmu_lbr_disable(void) -{ - u64 debugctl; - - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); - debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); -} - -static void intel_pmu_lbr_reset_32(void) -{ - int i; - - for (i = 0; i < x86_pmu.lbr_nr; i++) - wrmsrl(x86_pmu.lbr_from + i, 0); -} - -static void intel_pmu_lbr_reset_64(void) -{ - int i; - - for (i = 0; i < x86_pmu.lbr_nr; i++) { - wrmsrl(x86_pmu.lbr_from + i, 0); - wrmsrl(x86_pmu.lbr_to + i, 0); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + i, 0); - } -} - -void intel_pmu_lbr_reset(void) -{ - if (!x86_pmu.lbr_nr) - return; - - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) - intel_pmu_lbr_reset_32(); - else - intel_pmu_lbr_reset_64(); -} - -/* - * TOS = most recently recorded branch - */ -static inline u64 intel_pmu_lbr_tos(void) -{ - u64 tos; - - rdmsrl(x86_pmu.lbr_tos, tos); - return tos; -} - -enum { - LBR_NONE, - LBR_VALID, -}; - -static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) -{ - int i; - unsigned lbr_idx, mask; - u64 tos; - - if (task_ctx->lbr_callstack_users == 0 || - task_ctx->lbr_stack_state == LBR_NONE) { - intel_pmu_lbr_reset(); - return; - } - - mask = x86_pmu.lbr_nr - 1; - tos = task_ctx->tos; - for (i = 0; i < tos; i++) { - lbr_idx = (tos - i) & mask; - wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); - wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); - } - wrmsrl(x86_pmu.lbr_tos, tos); - task_ctx->lbr_stack_state = LBR_NONE; -} - -static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) -{ - int i; - unsigned lbr_idx, mask; - u64 tos; - - if (task_ctx->lbr_callstack_users == 0) { - task_ctx->lbr_stack_state = LBR_NONE; - return; - } - - mask = x86_pmu.lbr_nr - 1; - tos = intel_pmu_lbr_tos(); - for (i = 0; i < tos; i++) { - lbr_idx = (tos - i) & mask; - rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); - rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); - } - task_ctx->tos = tos; - task_ctx->lbr_stack_state = LBR_VALID; -} - -void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; - - /* - * If LBR callstack feature is enabled and the stack was saved when - * the task was scheduled out, restore the stack. Otherwise flush - * the LBR stack. - */ - task_ctx = ctx ? ctx->task_ctx_data : NULL; - if (task_ctx) { - if (sched_in) { - __intel_pmu_lbr_restore(task_ctx); - cpuc->lbr_context = ctx; - } else { - __intel_pmu_lbr_save(task_ctx); - } - return; - } - - /* - * When sampling the branck stack in system-wide, it may be - * necessary to flush the stack on context switch. This happens - * when the branch stack does not tag its entries with the pid - * of the current task. Otherwise it becomes impossible to - * associate a branch entry with a task. This ambiguity is more - * likely to appear when the branch stack supports priv level - * filtering and the user sets it to monitor only at the user - * level (which could be a useful measurement in system-wide - * mode). In that case, the risk is high of having a branch - * stack with branch from multiple tasks. - */ - if (sched_in) { - intel_pmu_lbr_reset(); - cpuc->lbr_context = ctx; - } -} - -static inline bool branch_user_callstack(unsigned br_sel) -{ - return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK); -} - -void intel_pmu_lbr_enable(struct perf_event *event) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; - - if (!x86_pmu.lbr_nr) - return; - - /* - * Reset the LBR stack if we changed task context to - * avoid data leaks. - */ - if (event->ctx->task && cpuc->lbr_context != event->ctx) { - intel_pmu_lbr_reset(); - cpuc->lbr_context = event->ctx; - } - cpuc->br_sel = event->hw.branch_reg.reg; - - if (branch_user_callstack(cpuc->br_sel) && event->ctx && - event->ctx->task_ctx_data) { - task_ctx = event->ctx->task_ctx_data; - task_ctx->lbr_callstack_users++; - } - - cpuc->lbr_users++; - perf_sched_cb_inc(event->ctx->pmu); -} - -void intel_pmu_lbr_disable(struct perf_event *event) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; - - if (!x86_pmu.lbr_nr) - return; - - if (branch_user_callstack(cpuc->br_sel) && event->ctx && - event->ctx->task_ctx_data) { - task_ctx = event->ctx->task_ctx_data; - task_ctx->lbr_callstack_users--; - } - - cpuc->lbr_users--; - WARN_ON_ONCE(cpuc->lbr_users < 0); - perf_sched_cb_dec(event->ctx->pmu); - - if (cpuc->enabled && !cpuc->lbr_users) { - __intel_pmu_lbr_disable(); - /* avoid stale pointer */ - cpuc->lbr_context = NULL; - } -} - -void intel_pmu_lbr_enable_all(bool pmi) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (cpuc->lbr_users) - __intel_pmu_lbr_enable(pmi); -} - -void intel_pmu_lbr_disable_all(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (cpuc->lbr_users) - __intel_pmu_lbr_disable(); -} - -static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) -{ - unsigned long mask = x86_pmu.lbr_nr - 1; - u64 tos = intel_pmu_lbr_tos(); - int i; - - for (i = 0; i < x86_pmu.lbr_nr; i++) { - unsigned long lbr_idx = (tos - i) & mask; - union { - struct { - u32 from; - u32 to; - }; - u64 lbr; - } msr_lastbranch; - - rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); - - cpuc->lbr_entries[i].from = msr_lastbranch.from; - cpuc->lbr_entries[i].to = msr_lastbranch.to; - cpuc->lbr_entries[i].mispred = 0; - cpuc->lbr_entries[i].predicted = 0; - cpuc->lbr_entries[i].reserved = 0; - } - cpuc->lbr_stack.nr = i; -} - -/* - * Due to lack of segmentation in Linux the effective address (offset) - * is the same as the linear address, allowing us to merge the LIP and EIP - * LBR formats. - */ -static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) -{ - bool need_info = false; - unsigned long mask = x86_pmu.lbr_nr - 1; - int lbr_format = x86_pmu.intel_cap.lbr_format; - u64 tos = intel_pmu_lbr_tos(); - int i; - int out = 0; - int num = x86_pmu.lbr_nr; - - if (cpuc->lbr_sel) { - need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO); - if (cpuc->lbr_sel->config & LBR_CALL_STACK) - num = tos; - } - - for (i = 0; i < num; i++) { - unsigned long lbr_idx = (tos - i) & mask; - u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0; - int skip = 0; - u16 cycles = 0; - int lbr_flags = lbr_desc[lbr_format]; - - rdmsrl(x86_pmu.lbr_from + lbr_idx, from); - rdmsrl(x86_pmu.lbr_to + lbr_idx, to); - - if (lbr_format == LBR_FORMAT_INFO && need_info) { - u64 info; - - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info); - mis = !!(info & LBR_INFO_MISPRED); - pred = !mis; - in_tx = !!(info & LBR_INFO_IN_TX); - abort = !!(info & LBR_INFO_ABORT); - cycles = (info & LBR_INFO_CYCLES); - } - if (lbr_flags & LBR_EIP_FLAGS) { - mis = !!(from & LBR_FROM_FLAG_MISPRED); - pred = !mis; - skip = 1; - } - if (lbr_flags & LBR_TSX) { - in_tx = !!(from & LBR_FROM_FLAG_IN_TX); - abort = !!(from & LBR_FROM_FLAG_ABORT); - skip = 3; - } - from = (u64)((((s64)from) << skip) >> skip); - - /* - * Some CPUs report duplicated abort records, - * with the second entry not having an abort bit set. - * Skip them here. This loop runs backwards, - * so we need to undo the previous record. - * If the abort just happened outside the window - * the extra entry cannot be removed. - */ - if (abort && x86_pmu.lbr_double_abort && out > 0) - out--; - - cpuc->lbr_entries[out].from = from; - cpuc->lbr_entries[out].to = to; - cpuc->lbr_entries[out].mispred = mis; - cpuc->lbr_entries[out].predicted = pred; - cpuc->lbr_entries[out].in_tx = in_tx; - cpuc->lbr_entries[out].abort = abort; - cpuc->lbr_entries[out].cycles = cycles; - cpuc->lbr_entries[out].reserved = 0; - out++; - } - cpuc->lbr_stack.nr = out; -} - -void intel_pmu_lbr_read(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (!cpuc->lbr_users) - return; - - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) - intel_pmu_lbr_read_32(cpuc); - else - intel_pmu_lbr_read_64(cpuc); - - intel_pmu_lbr_filter(cpuc); -} - -/* - * SW filter is used: - * - in case there is no HW filter - * - in case the HW filter has errata or limitations - */ -static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) -{ - u64 br_type = event->attr.branch_sample_type; - int mask = 0; - - if (br_type & PERF_SAMPLE_BRANCH_USER) - mask |= X86_BR_USER; - - if (br_type & PERF_SAMPLE_BRANCH_KERNEL) - mask |= X86_BR_KERNEL; - - /* we ignore BRANCH_HV here */ - - if (br_type & PERF_SAMPLE_BRANCH_ANY) - mask |= X86_BR_ANY; - - if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL) - mask |= X86_BR_ANY_CALL; - - if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN) - mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET; - - if (br_type & PERF_SAMPLE_BRANCH_IND_CALL) - mask |= X86_BR_IND_CALL; - - if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX) - mask |= X86_BR_ABORT; - - if (br_type & PERF_SAMPLE_BRANCH_IN_TX) - mask |= X86_BR_IN_TX; - - if (br_type & PERF_SAMPLE_BRANCH_NO_TX) - mask |= X86_BR_NO_TX; - - if (br_type & PERF_SAMPLE_BRANCH_COND) - mask |= X86_BR_JCC; - - if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) { - if (!x86_pmu_has_lbr_callstack()) - return -EOPNOTSUPP; - if (mask & ~(X86_BR_USER | X86_BR_KERNEL)) - return -EINVAL; - mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET | - X86_BR_CALL_STACK; - } - - if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP) - mask |= X86_BR_IND_JMP; - - if (br_type & PERF_SAMPLE_BRANCH_CALL) - mask |= X86_BR_CALL | X86_BR_ZERO_CALL; - /* - * stash actual user request into reg, it may - * be used by fixup code for some CPU - */ - event->hw.branch_reg.reg = mask; - return 0; -} - -/* - * setup the HW LBR filter - * Used only when available, may not be enough to disambiguate - * all branches, may need the help of the SW filter - */ -static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) -{ - struct hw_perf_event_extra *reg; - u64 br_type = event->attr.branch_sample_type; - u64 mask = 0, v; - int i; - - for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) { - if (!(br_type & (1ULL << i))) - continue; - - v = x86_pmu.lbr_sel_map[i]; - if (v == LBR_NOT_SUPP) - return -EOPNOTSUPP; - - if (v != LBR_IGN) - mask |= v; - } - - reg = &event->hw.branch_reg; - reg->idx = EXTRA_REG_LBR; - - /* - * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate - * in suppress mode. So LBR_SELECT should be set to - * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK) - */ - reg->config = mask ^ x86_pmu.lbr_sel_mask; - - if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) && - (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) && - (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)) - reg->config |= LBR_NO_INFO; - - return 0; -} - -int intel_pmu_setup_lbr_filter(struct perf_event *event) -{ - int ret = 0; - - /* - * no LBR on this PMU - */ - if (!x86_pmu.lbr_nr) - return -EOPNOTSUPP; - - /* - * setup SW LBR filter - */ - ret = intel_pmu_setup_sw_lbr_filter(event); - if (ret) - return ret; - - /* - * setup HW LBR filter, if any - */ - if (x86_pmu.lbr_sel_map) - ret = intel_pmu_setup_hw_lbr_filter(event); - - return ret; -} - -/* - * return the type of control flow change at address "from" - * intruction is not necessarily a branch (in case of interrupt). - * - * The branch type returned also includes the priv level of the - * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). - * - * If a branch type is unknown OR the instruction cannot be - * decoded (e.g., text page not present), then X86_BR_NONE is - * returned. - */ -static int branch_type(unsigned long from, unsigned long to, int abort) -{ - struct insn insn; - void *addr; - int bytes_read, bytes_left; - int ret = X86_BR_NONE; - int ext, to_plm, from_plm; - u8 buf[MAX_INSN_SIZE]; - int is64 = 0; - - to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; - from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; - - /* - * maybe zero if lbr did not fill up after a reset by the time - * we get a PMU interrupt - */ - if (from == 0 || to == 0) - return X86_BR_NONE; - - if (abort) - return X86_BR_ABORT | to_plm; - - if (from_plm == X86_BR_USER) { - /* - * can happen if measuring at the user level only - * and we interrupt in a kernel thread, e.g., idle. - */ - if (!current->mm) - return X86_BR_NONE; - - /* may fail if text not present */ - bytes_left = copy_from_user_nmi(buf, (void __user *)from, - MAX_INSN_SIZE); - bytes_read = MAX_INSN_SIZE - bytes_left; - if (!bytes_read) - return X86_BR_NONE; - - addr = buf; - } else { - /* - * The LBR logs any address in the IP, even if the IP just - * faulted. This means userspace can control the from address. - * Ensure we don't blindy read any address by validating it is - * a known text address. - */ - if (kernel_text_address(from)) { - addr = (void *)from; - /* - * Assume we can get the maximum possible size - * when grabbing kernel data. This is not - * _strictly_ true since we could possibly be - * executing up next to a memory hole, but - * it is very unlikely to be a problem. - */ - bytes_read = MAX_INSN_SIZE; - } else { - return X86_BR_NONE; - } - } - - /* - * decoder needs to know the ABI especially - * on 64-bit systems running 32-bit apps - */ -#ifdef CONFIG_X86_64 - is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); -#endif - insn_init(&insn, addr, bytes_read, is64); - insn_get_opcode(&insn); - if (!insn.opcode.got) - return X86_BR_ABORT; - - switch (insn.opcode.bytes[0]) { - case 0xf: - switch (insn.opcode.bytes[1]) { - case 0x05: /* syscall */ - case 0x34: /* sysenter */ - ret = X86_BR_SYSCALL; - break; - case 0x07: /* sysret */ - case 0x35: /* sysexit */ - ret = X86_BR_SYSRET; - break; - case 0x80 ... 0x8f: /* conditional */ - ret = X86_BR_JCC; - break; - default: - ret = X86_BR_NONE; - } - break; - case 0x70 ... 0x7f: /* conditional */ - ret = X86_BR_JCC; - break; - case 0xc2: /* near ret */ - case 0xc3: /* near ret */ - case 0xca: /* far ret */ - case 0xcb: /* far ret */ - ret = X86_BR_RET; - break; - case 0xcf: /* iret */ - ret = X86_BR_IRET; - break; - case 0xcc ... 0xce: /* int */ - ret = X86_BR_INT; - break; - case 0xe8: /* call near rel */ - insn_get_immediate(&insn); - if (insn.immediate1.value == 0) { - /* zero length call */ - ret = X86_BR_ZERO_CALL; - break; - } - case 0x9a: /* call far absolute */ - ret = X86_BR_CALL; - break; - case 0xe0 ... 0xe3: /* loop jmp */ - ret = X86_BR_JCC; - break; - case 0xe9 ... 0xeb: /* jmp */ - ret = X86_BR_JMP; - break; - case 0xff: /* call near absolute, call far absolute ind */ - insn_get_modrm(&insn); - ext = (insn.modrm.bytes[0] >> 3) & 0x7; - switch (ext) { - case 2: /* near ind call */ - case 3: /* far ind call */ - ret = X86_BR_IND_CALL; - break; - case 4: - case 5: - ret = X86_BR_IND_JMP; - break; - } - break; - default: - ret = X86_BR_NONE; - } - /* - * interrupts, traps, faults (and thus ring transition) may - * occur on any instructions. Thus, to classify them correctly, - * we need to first look at the from and to priv levels. If they - * are different and to is in the kernel, then it indicates - * a ring transition. If the from instruction is not a ring - * transition instr (syscall, systenter, int), then it means - * it was a irq, trap or fault. - * - * we have no way of detecting kernel to kernel faults. - */ - if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL - && ret != X86_BR_SYSCALL && ret != X86_BR_INT) - ret = X86_BR_IRQ; - - /* - * branch priv level determined by target as - * is done by HW when LBR_SELECT is implemented - */ - if (ret != X86_BR_NONE) - ret |= to_plm; - - return ret; -} - -/* - * implement actual branch filter based on user demand. - * Hardware may not exactly satisfy that request, thus - * we need to inspect opcodes. Mismatched branches are - * discarded. Therefore, the number of branches returned - * in PERF_SAMPLE_BRANCH_STACK sample may vary. - */ -static void -intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) -{ - u64 from, to; - int br_sel = cpuc->br_sel; - int i, j, type; - bool compress = false; - - /* if sampling all branches, then nothing to filter */ - if ((br_sel & X86_BR_ALL) == X86_BR_ALL) - return; - - for (i = 0; i < cpuc->lbr_stack.nr; i++) { - - from = cpuc->lbr_entries[i].from; - to = cpuc->lbr_entries[i].to; - - type = branch_type(from, to, cpuc->lbr_entries[i].abort); - if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) { - if (cpuc->lbr_entries[i].in_tx) - type |= X86_BR_IN_TX; - else - type |= X86_BR_NO_TX; - } - - /* if type does not correspond, then discard */ - if (type == X86_BR_NONE || (br_sel & type) != type) { - cpuc->lbr_entries[i].from = 0; - compress = true; - } - } - - if (!compress) - return; - - /* remove all entries with from=0 */ - for (i = 0; i < cpuc->lbr_stack.nr; ) { - if (!cpuc->lbr_entries[i].from) { - j = i; - while (++j < cpuc->lbr_stack.nr) - cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j]; - cpuc->lbr_stack.nr--; - if (!cpuc->lbr_entries[i].from) - continue; - } - i++; - } -} - -/* - * Map interface branch filters onto LBR filters - */ -static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { - [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, - [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, - [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, - [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, - [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_REL_JMP - | LBR_IND_JMP | LBR_FAR, - /* - * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches - */ - [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = - LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR, - /* - * NHM/WSM erratum: must include IND_JMP to capture IND_CALL - */ - [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP, - [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, - [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, -}; - -static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { - [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, - [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, - [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, - [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, - [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, - [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL - | LBR_FAR, - [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, - [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, - [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, - [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, -}; - -static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { - [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, - [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, - [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, - [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, - [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, - [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL - | LBR_FAR, - [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, - [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, - [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL - | LBR_RETURN | LBR_CALL_STACK, - [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, - [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, -}; - -/* core */ -void __init intel_pmu_lbr_init_core(void) -{ - x86_pmu.lbr_nr = 4; - x86_pmu.lbr_tos = MSR_LBR_TOS; - x86_pmu.lbr_from = MSR_LBR_CORE_FROM; - x86_pmu.lbr_to = MSR_LBR_CORE_TO; - - /* - * SW branch filter usage: - * - compensate for lack of HW filter - */ - pr_cont("4-deep LBR, "); -} - -/* nehalem/westmere */ -void __init intel_pmu_lbr_init_nhm(void) -{ - x86_pmu.lbr_nr = 16; - x86_pmu.lbr_tos = MSR_LBR_TOS; - x86_pmu.lbr_from = MSR_LBR_NHM_FROM; - x86_pmu.lbr_to = MSR_LBR_NHM_TO; - - x86_pmu.lbr_sel_mask = LBR_SEL_MASK; - x86_pmu.lbr_sel_map = nhm_lbr_sel_map; - - /* - * SW branch filter usage: - * - workaround LBR_SEL errata (see above) - * - support syscall, sysret capture. - * That requires LBR_FAR but that means far - * jmp need to be filtered out - */ - pr_cont("16-deep LBR, "); -} - -/* sandy bridge */ -void __init intel_pmu_lbr_init_snb(void) -{ - x86_pmu.lbr_nr = 16; - x86_pmu.lbr_tos = MSR_LBR_TOS; - x86_pmu.lbr_from = MSR_LBR_NHM_FROM; - x86_pmu.lbr_to = MSR_LBR_NHM_TO; - - x86_pmu.lbr_sel_mask = LBR_SEL_MASK; - x86_pmu.lbr_sel_map = snb_lbr_sel_map; - - /* - * SW branch filter usage: - * - support syscall, sysret capture. - * That requires LBR_FAR but that means far - * jmp need to be filtered out - */ - pr_cont("16-deep LBR, "); -} - -/* haswell */ -void intel_pmu_lbr_init_hsw(void) -{ - x86_pmu.lbr_nr = 16; - x86_pmu.lbr_tos = MSR_LBR_TOS; - x86_pmu.lbr_from = MSR_LBR_NHM_FROM; - x86_pmu.lbr_to = MSR_LBR_NHM_TO; - - x86_pmu.lbr_sel_mask = LBR_SEL_MASK; - x86_pmu.lbr_sel_map = hsw_lbr_sel_map; - - pr_cont("16-deep LBR, "); -} - -/* skylake */ -__init void intel_pmu_lbr_init_skl(void) -{ - x86_pmu.lbr_nr = 32; - x86_pmu.lbr_tos = MSR_LBR_TOS; - x86_pmu.lbr_from = MSR_LBR_NHM_FROM; - x86_pmu.lbr_to = MSR_LBR_NHM_TO; - - x86_pmu.lbr_sel_mask = LBR_SEL_MASK; - x86_pmu.lbr_sel_map = hsw_lbr_sel_map; - - /* - * SW branch filter usage: - * - support syscall, sysret capture. - * That requires LBR_FAR but that means far - * jmp need to be filtered out - */ - pr_cont("32-deep LBR, "); -} - -/* atom */ -void __init intel_pmu_lbr_init_atom(void) -{ - /* - * only models starting at stepping 10 seems - * to have an operational LBR which can freeze - * on PMU interrupt - */ - if (boot_cpu_data.x86_model == 28 - && boot_cpu_data.x86_mask < 10) { - pr_cont("LBR disabled due to erratum"); - return; - } - - x86_pmu.lbr_nr = 8; - x86_pmu.lbr_tos = MSR_LBR_TOS; - x86_pmu.lbr_from = MSR_LBR_CORE_FROM; - x86_pmu.lbr_to = MSR_LBR_CORE_TO; - - /* - * SW branch filter usage: - * - compensate for lack of HW filter - */ - pr_cont("8-deep LBR, "); -} - -/* Knights Landing */ -void intel_pmu_lbr_init_knl(void) -{ - x86_pmu.lbr_nr = 8; - x86_pmu.lbr_tos = MSR_LBR_TOS; - x86_pmu.lbr_from = MSR_LBR_NHM_FROM; - x86_pmu.lbr_to = MSR_LBR_NHM_TO; - - x86_pmu.lbr_sel_mask = LBR_SEL_MASK; - x86_pmu.lbr_sel_map = snb_lbr_sel_map; - - pr_cont("8-deep LBR, "); -} diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c deleted file mode 100644 index c0bbd1033b7c..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ /dev/null @@ -1,1188 +0,0 @@ -/* - * Intel(R) Processor Trace PMU driver for perf - * Copyright (c) 2013-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * Intel PT is specified in the Intel Architecture Instruction Set Extensions - * Programming Reference: - * http://software.intel.com/en-us/intel-isa-extensions - */ - -#undef DEBUG - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/device.h> - -#include <asm/perf_event.h> -#include <asm/insn.h> -#include <asm/io.h> -#include <asm/intel_pt.h> - -#include "perf_event.h" -#include "intel_pt.h" - -static DEFINE_PER_CPU(struct pt, pt_ctx); - -static struct pt_pmu pt_pmu; - -enum cpuid_regs { - CR_EAX = 0, - CR_ECX, - CR_EDX, - CR_EBX -}; - -/* - * Capabilities of Intel PT hardware, such as number of address bits or - * supported output schemes, are cached and exported to userspace as "caps" - * attribute group of pt pmu device - * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store - * relevant bits together with intel_pt traces. - * - * These are necessary for both trace decoding (payloads_lip, contains address - * width encoded in IP-related packets), and event configuration (bitmasks with - * permitted values for certain bit fields). - */ -#define PT_CAP(_n, _l, _r, _m) \ - [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l, \ - .reg = _r, .mask = _m } - -static struct pt_cap_desc { - const char *name; - u32 leaf; - u8 reg; - u32 mask; -} pt_caps[] = { - PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff), - PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)), - PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)), - PT_CAP(mtc, 0, CR_EBX, BIT(3)), - PT_CAP(topa_output, 0, CR_ECX, BIT(0)), - PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)), - PT_CAP(single_range_output, 0, CR_ECX, BIT(2)), - PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)), - PT_CAP(mtc_periods, 1, CR_EAX, 0xffff0000), - PT_CAP(cycle_thresholds, 1, CR_EBX, 0xffff), - PT_CAP(psb_periods, 1, CR_EBX, 0xffff0000), -}; - -static u32 pt_cap_get(enum pt_capabilities cap) -{ - struct pt_cap_desc *cd = &pt_caps[cap]; - u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; - unsigned int shift = __ffs(cd->mask); - - return (c & cd->mask) >> shift; -} - -static ssize_t pt_cap_show(struct device *cdev, - struct device_attribute *attr, - char *buf) -{ - struct dev_ext_attribute *ea = - container_of(attr, struct dev_ext_attribute, attr); - enum pt_capabilities cap = (long)ea->var; - - return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap)); -} - -static struct attribute_group pt_cap_group = { - .name = "caps", -}; - -PMU_FORMAT_ATTR(cyc, "config:1" ); -PMU_FORMAT_ATTR(mtc, "config:9" ); -PMU_FORMAT_ATTR(tsc, "config:10" ); -PMU_FORMAT_ATTR(noretcomp, "config:11" ); -PMU_FORMAT_ATTR(mtc_period, "config:14-17" ); -PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" ); -PMU_FORMAT_ATTR(psb_period, "config:24-27" ); - -static struct attribute *pt_formats_attr[] = { - &format_attr_cyc.attr, - &format_attr_mtc.attr, - &format_attr_tsc.attr, - &format_attr_noretcomp.attr, - &format_attr_mtc_period.attr, - &format_attr_cyc_thresh.attr, - &format_attr_psb_period.attr, - NULL, -}; - -static struct attribute_group pt_format_group = { - .name = "format", - .attrs = pt_formats_attr, -}; - -static const struct attribute_group *pt_attr_groups[] = { - &pt_cap_group, - &pt_format_group, - NULL, -}; - -static int __init pt_pmu_hw_init(void) -{ - struct dev_ext_attribute *de_attrs; - struct attribute **attrs; - size_t size; - int ret; - long i; - - attrs = NULL; - - for (i = 0; i < PT_CPUID_LEAVES; i++) { - cpuid_count(20, i, - &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM], - &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM], - &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM], - &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]); - } - - ret = -ENOMEM; - size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1); - attrs = kzalloc(size, GFP_KERNEL); - if (!attrs) - goto fail; - - size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1); - de_attrs = kzalloc(size, GFP_KERNEL); - if (!de_attrs) - goto fail; - - for (i = 0; i < ARRAY_SIZE(pt_caps); i++) { - struct dev_ext_attribute *de_attr = de_attrs + i; - - de_attr->attr.attr.name = pt_caps[i].name; - - sysfs_attr_init(&de_attr->attr.attr); - - de_attr->attr.attr.mode = S_IRUGO; - de_attr->attr.show = pt_cap_show; - de_attr->var = (void *)i; - - attrs[i] = &de_attr->attr.attr; - } - - pt_cap_group.attrs = attrs; - - return 0; - -fail: - kfree(attrs); - - return ret; -} - -#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC | \ - RTIT_CTL_CYC_THRESH | \ - RTIT_CTL_PSB_FREQ) - -#define RTIT_CTL_MTC (RTIT_CTL_MTC_EN | \ - RTIT_CTL_MTC_RANGE) - -#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | \ - RTIT_CTL_DISRETC | \ - RTIT_CTL_CYC_PSB | \ - RTIT_CTL_MTC) - -static bool pt_event_valid(struct perf_event *event) -{ - u64 config = event->attr.config; - u64 allowed, requested; - - if ((config & PT_CONFIG_MASK) != config) - return false; - - if (config & RTIT_CTL_CYC_PSB) { - if (!pt_cap_get(PT_CAP_psb_cyc)) - return false; - - allowed = pt_cap_get(PT_CAP_psb_periods); - requested = (config & RTIT_CTL_PSB_FREQ) >> - RTIT_CTL_PSB_FREQ_OFFSET; - if (requested && (!(allowed & BIT(requested)))) - return false; - - allowed = pt_cap_get(PT_CAP_cycle_thresholds); - requested = (config & RTIT_CTL_CYC_THRESH) >> - RTIT_CTL_CYC_THRESH_OFFSET; - if (requested && (!(allowed & BIT(requested)))) - return false; - } - - if (config & RTIT_CTL_MTC) { - /* - * In the unlikely case that CPUID lists valid mtc periods, - * but not the mtc capability, drop out here. - * - * Spec says that setting mtc period bits while mtc bit in - * CPUID is 0 will #GP, so better safe than sorry. - */ - if (!pt_cap_get(PT_CAP_mtc)) - return false; - - allowed = pt_cap_get(PT_CAP_mtc_periods); - if (!allowed) - return false; - - requested = (config & RTIT_CTL_MTC_RANGE) >> - RTIT_CTL_MTC_RANGE_OFFSET; - - if (!(allowed & BIT(requested))) - return false; - } - - return true; -} - -/* - * PT configuration helpers - * These all are cpu affine and operate on a local PT - */ - -static void pt_config(struct perf_event *event) -{ - u64 reg; - - if (!event->hw.itrace_started) { - event->hw.itrace_started = 1; - wrmsrl(MSR_IA32_RTIT_STATUS, 0); - } - - reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN; - - if (!event->attr.exclude_kernel) - reg |= RTIT_CTL_OS; - if (!event->attr.exclude_user) - reg |= RTIT_CTL_USR; - - reg |= (event->attr.config & PT_CONFIG_MASK); - - wrmsrl(MSR_IA32_RTIT_CTL, reg); -} - -static void pt_config_start(bool start) -{ - u64 ctl; - - rdmsrl(MSR_IA32_RTIT_CTL, ctl); - if (start) - ctl |= RTIT_CTL_TRACEEN; - else - ctl &= ~RTIT_CTL_TRACEEN; - wrmsrl(MSR_IA32_RTIT_CTL, ctl); - - /* - * A wrmsr that disables trace generation serializes other PT - * registers and causes all data packets to be written to memory, - * but a fence is required for the data to become globally visible. - * - * The below WMB, separating data store and aux_head store matches - * the consumer's RMB that separates aux_head load and data load. - */ - if (!start) - wmb(); -} - -static void pt_config_buffer(void *buf, unsigned int topa_idx, - unsigned int output_off) -{ - u64 reg; - - wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf)); - - reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32); - - wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg); -} - -/* - * Keep ToPA table-related metadata on the same page as the actual table, - * taking up a few words from the top - */ - -#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1) - -/** - * struct topa - page-sized ToPA table with metadata at the top - * @table: actual ToPA table entries, as understood by PT hardware - * @list: linkage to struct pt_buffer's list of tables - * @phys: physical address of this page - * @offset: offset of the first entry in this table in the buffer - * @size: total size of all entries in this table - * @last: index of the last initialized entry in this table - */ -struct topa { - struct topa_entry table[TENTS_PER_PAGE]; - struct list_head list; - u64 phys; - u64 offset; - size_t size; - int last; -}; - -/* make -1 stand for the last table entry */ -#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)]) - -/** - * topa_alloc() - allocate page-sized ToPA table - * @cpu: CPU on which to allocate. - * @gfp: Allocation flags. - * - * Return: On success, return the pointer to ToPA table page. - */ -static struct topa *topa_alloc(int cpu, gfp_t gfp) -{ - int node = cpu_to_node(cpu); - struct topa *topa; - struct page *p; - - p = alloc_pages_node(node, gfp | __GFP_ZERO, 0); - if (!p) - return NULL; - - topa = page_address(p); - topa->last = 0; - topa->phys = page_to_phys(p); - - /* - * In case of singe-entry ToPA, always put the self-referencing END - * link as the 2nd entry in the table - */ - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) { - TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT; - TOPA_ENTRY(topa, 1)->end = 1; - } - - return topa; -} - -/** - * topa_free() - free a page-sized ToPA table - * @topa: Table to deallocate. - */ -static void topa_free(struct topa *topa) -{ - free_page((unsigned long)topa); -} - -/** - * topa_insert_table() - insert a ToPA table into a buffer - * @buf: PT buffer that's being extended. - * @topa: New topa table to be inserted. - * - * If it's the first table in this buffer, set up buffer's pointers - * accordingly; otherwise, add a END=1 link entry to @topa to the current - * "last" table and adjust the last table pointer to @topa. - */ -static void topa_insert_table(struct pt_buffer *buf, struct topa *topa) -{ - struct topa *last = buf->last; - - list_add_tail(&topa->list, &buf->tables); - - if (!buf->first) { - buf->first = buf->last = buf->cur = topa; - return; - } - - topa->offset = last->offset + last->size; - buf->last = topa; - - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) - return; - - BUG_ON(last->last != TENTS_PER_PAGE - 1); - - TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT; - TOPA_ENTRY(last, -1)->end = 1; -} - -/** - * topa_table_full() - check if a ToPA table is filled up - * @topa: ToPA table. - */ -static bool topa_table_full(struct topa *topa) -{ - /* single-entry ToPA is a special case */ - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) - return !!topa->last; - - return topa->last == TENTS_PER_PAGE - 1; -} - -/** - * topa_insert_pages() - create a list of ToPA tables - * @buf: PT buffer being initialized. - * @gfp: Allocation flags. - * - * This initializes a list of ToPA tables with entries from - * the data_pages provided by rb_alloc_aux(). - * - * Return: 0 on success or error code. - */ -static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp) -{ - struct topa *topa = buf->last; - int order = 0; - struct page *p; - - p = virt_to_page(buf->data_pages[buf->nr_pages]); - if (PagePrivate(p)) - order = page_private(p); - - if (topa_table_full(topa)) { - topa = topa_alloc(buf->cpu, gfp); - if (!topa) - return -ENOMEM; - - topa_insert_table(buf, topa); - } - - TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT; - TOPA_ENTRY(topa, -1)->size = order; - if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) { - TOPA_ENTRY(topa, -1)->intr = 1; - TOPA_ENTRY(topa, -1)->stop = 1; - } - - topa->last++; - topa->size += sizes(order); - - buf->nr_pages += 1ul << order; - - return 0; -} - -/** - * pt_topa_dump() - print ToPA tables and their entries - * @buf: PT buffer. - */ -static void pt_topa_dump(struct pt_buffer *buf) -{ - struct topa *topa; - - list_for_each_entry(topa, &buf->tables, list) { - int i; - - pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table, - topa->phys, topa->offset, topa->size); - for (i = 0; i < TENTS_PER_PAGE; i++) { - pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n", - &topa->table[i], - (unsigned long)topa->table[i].base << TOPA_SHIFT, - sizes(topa->table[i].size), - topa->table[i].end ? 'E' : ' ', - topa->table[i].intr ? 'I' : ' ', - topa->table[i].stop ? 'S' : ' ', - *(u64 *)&topa->table[i]); - if ((pt_cap_get(PT_CAP_topa_multiple_entries) && - topa->table[i].stop) || - topa->table[i].end) - break; - } - } -} - -/** - * pt_buffer_advance() - advance to the next output region - * @buf: PT buffer. - * - * Advance the current pointers in the buffer to the next ToPA entry. - */ -static void pt_buffer_advance(struct pt_buffer *buf) -{ - buf->output_off = 0; - buf->cur_idx++; - - if (buf->cur_idx == buf->cur->last) { - if (buf->cur == buf->last) - buf->cur = buf->first; - else - buf->cur = list_entry(buf->cur->list.next, struct topa, - list); - buf->cur_idx = 0; - } -} - -/** - * pt_update_head() - calculate current offsets and sizes - * @pt: Per-cpu pt context. - * - * Update buffer's current write pointer position and data size. - */ -static void pt_update_head(struct pt *pt) -{ - struct pt_buffer *buf = perf_get_aux(&pt->handle); - u64 topa_idx, base, old; - - /* offset of the first region in this table from the beginning of buf */ - base = buf->cur->offset + buf->output_off; - - /* offset of the current output region within this table */ - for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++) - base += sizes(buf->cur->table[topa_idx].size); - - if (buf->snapshot) { - local_set(&buf->data_size, base); - } else { - old = (local64_xchg(&buf->head, base) & - ((buf->nr_pages << PAGE_SHIFT) - 1)); - if (base < old) - base += buf->nr_pages << PAGE_SHIFT; - - local_add(base - old, &buf->data_size); - } -} - -/** - * pt_buffer_region() - obtain current output region's address - * @buf: PT buffer. - */ -static void *pt_buffer_region(struct pt_buffer *buf) -{ - return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT); -} - -/** - * pt_buffer_region_size() - obtain current output region's size - * @buf: PT buffer. - */ -static size_t pt_buffer_region_size(struct pt_buffer *buf) -{ - return sizes(buf->cur->table[buf->cur_idx].size); -} - -/** - * pt_handle_status() - take care of possible status conditions - * @pt: Per-cpu pt context. - */ -static void pt_handle_status(struct pt *pt) -{ - struct pt_buffer *buf = perf_get_aux(&pt->handle); - int advance = 0; - u64 status; - - rdmsrl(MSR_IA32_RTIT_STATUS, status); - - if (status & RTIT_STATUS_ERROR) { - pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n"); - pt_topa_dump(buf); - status &= ~RTIT_STATUS_ERROR; - } - - if (status & RTIT_STATUS_STOPPED) { - status &= ~RTIT_STATUS_STOPPED; - - /* - * On systems that only do single-entry ToPA, hitting STOP - * means we are already losing data; need to let the decoder - * know. - */ - if (!pt_cap_get(PT_CAP_topa_multiple_entries) || - buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) { - local_inc(&buf->lost); - advance++; - } - } - - /* - * Also on single-entry ToPA implementations, interrupt will come - * before the output reaches its output region's boundary. - */ - if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot && - pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) { - void *head = pt_buffer_region(buf); - - /* everything within this margin needs to be zeroed out */ - memset(head + buf->output_off, 0, - pt_buffer_region_size(buf) - - buf->output_off); - advance++; - } - - if (advance) - pt_buffer_advance(buf); - - wrmsrl(MSR_IA32_RTIT_STATUS, status); -} - -/** - * pt_read_offset() - translate registers into buffer pointers - * @buf: PT buffer. - * - * Set buffer's output pointers from MSR values. - */ -static void pt_read_offset(struct pt_buffer *buf) -{ - u64 offset, base_topa; - - rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa); - buf->cur = phys_to_virt(base_topa); - - rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset); - /* offset within current output region */ - buf->output_off = offset >> 32; - /* index of current output region within this table */ - buf->cur_idx = (offset & 0xffffff80) >> 7; -} - -/** - * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry - * @buf: PT buffer. - * @pg: Page offset in the buffer. - * - * When advancing to the next output region (ToPA entry), given a page offset - * into the buffer, we need to find the offset of the first page in the next - * region. - */ -static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg) -{ - struct topa_entry *te = buf->topa_index[pg]; - - /* one region */ - if (buf->first == buf->last && buf->first->last == 1) - return pg; - - do { - pg++; - pg &= buf->nr_pages - 1; - } while (buf->topa_index[pg] == te); - - return pg; -} - -/** - * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer - * @buf: PT buffer. - * @handle: Current output handle. - * - * Place INT and STOP marks to prevent overwriting old data that the consumer - * hasn't yet collected and waking up the consumer after a certain fraction of - * the buffer has filled up. Only needed and sensible for non-snapshot counters. - * - * This obviously relies on buf::head to figure out buffer markers, so it has - * to be called after pt_buffer_reset_offsets() and before the hardware tracing - * is enabled. - */ -static int pt_buffer_reset_markers(struct pt_buffer *buf, - struct perf_output_handle *handle) - -{ - unsigned long head = local64_read(&buf->head); - unsigned long idx, npages, wakeup; - - /* can't stop in the middle of an output region */ - if (buf->output_off + handle->size + 1 < - sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) - return -EINVAL; - - - /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */ - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) - return 0; - - /* clear STOP and INT from current entry */ - buf->topa_index[buf->stop_pos]->stop = 0; - buf->topa_index[buf->intr_pos]->intr = 0; - - /* how many pages till the STOP marker */ - npages = handle->size >> PAGE_SHIFT; - - /* if it's on a page boundary, fill up one more page */ - if (!offset_in_page(head + handle->size + 1)) - npages++; - - idx = (head >> PAGE_SHIFT) + npages; - idx &= buf->nr_pages - 1; - buf->stop_pos = idx; - - wakeup = handle->wakeup >> PAGE_SHIFT; - - /* in the worst case, wake up the consumer one page before hard stop */ - idx = (head >> PAGE_SHIFT) + npages - 1; - if (idx > wakeup) - idx = wakeup; - - idx &= buf->nr_pages - 1; - buf->intr_pos = idx; - - buf->topa_index[buf->stop_pos]->stop = 1; - buf->topa_index[buf->intr_pos]->intr = 1; - - return 0; -} - -/** - * pt_buffer_setup_topa_index() - build topa_index[] table of regions - * @buf: PT buffer. - * - * topa_index[] references output regions indexed by offset into the - * buffer for purposes of quick reverse lookup. - */ -static void pt_buffer_setup_topa_index(struct pt_buffer *buf) -{ - struct topa *cur = buf->first, *prev = buf->last; - struct topa_entry *te_cur = TOPA_ENTRY(cur, 0), - *te_prev = TOPA_ENTRY(prev, prev->last - 1); - int pg = 0, idx = 0; - - while (pg < buf->nr_pages) { - int tidx; - - /* pages within one topa entry */ - for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++) - buf->topa_index[pg] = te_prev; - - te_prev = te_cur; - - if (idx == cur->last - 1) { - /* advance to next topa table */ - idx = 0; - cur = list_entry(cur->list.next, struct topa, list); - } else { - idx++; - } - te_cur = TOPA_ENTRY(cur, idx); - } - -} - -/** - * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head - * @buf: PT buffer. - * @head: Write pointer (aux_head) from AUX buffer. - * - * Find the ToPA table and entry corresponding to given @head and set buffer's - * "current" pointers accordingly. This is done after we have obtained the - * current aux_head position from a successful call to perf_aux_output_begin() - * to make sure the hardware is writing to the right place. - * - * This function modifies buf::{cur,cur_idx,output_off} that will be programmed - * into PT msrs when the tracing is enabled and buf::head and buf::data_size, - * which are used to determine INT and STOP markers' locations by a subsequent - * call to pt_buffer_reset_markers(). - */ -static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head) -{ - int pg; - - if (buf->snapshot) - head &= (buf->nr_pages << PAGE_SHIFT) - 1; - - pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1); - pg = pt_topa_next_entry(buf, pg); - - buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK); - buf->cur_idx = ((unsigned long)buf->topa_index[pg] - - (unsigned long)buf->cur) / sizeof(struct topa_entry); - buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1); - - local64_set(&buf->head, head); - local_set(&buf->data_size, 0); -} - -/** - * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer - * @buf: PT buffer. - */ -static void pt_buffer_fini_topa(struct pt_buffer *buf) -{ - struct topa *topa, *iter; - - list_for_each_entry_safe(topa, iter, &buf->tables, list) { - /* - * right now, this is in free_aux() path only, so - * no need to unlink this table from the list - */ - topa_free(topa); - } -} - -/** - * pt_buffer_init_topa() - initialize ToPA table for pt buffer - * @buf: PT buffer. - * @size: Total size of all regions within this ToPA. - * @gfp: Allocation flags. - */ -static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages, - gfp_t gfp) -{ - struct topa *topa; - int err; - - topa = topa_alloc(buf->cpu, gfp); - if (!topa) - return -ENOMEM; - - topa_insert_table(buf, topa); - - while (buf->nr_pages < nr_pages) { - err = topa_insert_pages(buf, gfp); - if (err) { - pt_buffer_fini_topa(buf); - return -ENOMEM; - } - } - - pt_buffer_setup_topa_index(buf); - - /* link last table to the first one, unless we're double buffering */ - if (pt_cap_get(PT_CAP_topa_multiple_entries)) { - TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT; - TOPA_ENTRY(buf->last, -1)->end = 1; - } - - pt_topa_dump(buf); - return 0; -} - -/** - * pt_buffer_setup_aux() - set up topa tables for a PT buffer - * @cpu: Cpu on which to allocate, -1 means current. - * @pages: Array of pointers to buffer pages passed from perf core. - * @nr_pages: Number of pages in the buffer. - * @snapshot: If this is a snapshot/overwrite counter. - * - * This is a pmu::setup_aux callback that sets up ToPA tables and all the - * bookkeeping for an AUX buffer. - * - * Return: Our private PT buffer structure. - */ -static void * -pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot) -{ - struct pt_buffer *buf; - int node, ret; - - if (!nr_pages) - return NULL; - - if (cpu == -1) - cpu = raw_smp_processor_id(); - node = cpu_to_node(cpu); - - buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]), - GFP_KERNEL, node); - if (!buf) - return NULL; - - buf->cpu = cpu; - buf->snapshot = snapshot; - buf->data_pages = pages; - - INIT_LIST_HEAD(&buf->tables); - - ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL); - if (ret) { - kfree(buf); - return NULL; - } - - return buf; -} - -/** - * pt_buffer_free_aux() - perf AUX deallocation path callback - * @data: PT buffer. - */ -static void pt_buffer_free_aux(void *data) -{ - struct pt_buffer *buf = data; - - pt_buffer_fini_topa(buf); - kfree(buf); -} - -/** - * pt_buffer_is_full() - check if the buffer is full - * @buf: PT buffer. - * @pt: Per-cpu pt handle. - * - * If the user hasn't read data from the output region that aux_head - * points to, the buffer is considered full: the user needs to read at - * least this region and update aux_tail to point past it. - */ -static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt) -{ - if (buf->snapshot) - return false; - - if (local_read(&buf->data_size) >= pt->handle.size) - return true; - - return false; -} - -/** - * intel_pt_interrupt() - PT PMI handler - */ -void intel_pt_interrupt(void) -{ - struct pt *pt = this_cpu_ptr(&pt_ctx); - struct pt_buffer *buf; - struct perf_event *event = pt->handle.event; - - /* - * There may be a dangling PT bit in the interrupt status register - * after PT has been disabled by pt_event_stop(). Make sure we don't - * do anything (particularly, re-enable) for this event here. - */ - if (!ACCESS_ONCE(pt->handle_nmi)) - return; - - pt_config_start(false); - - if (!event) - return; - - buf = perf_get_aux(&pt->handle); - if (!buf) - return; - - pt_read_offset(buf); - - pt_handle_status(pt); - - pt_update_head(pt); - - perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0), - local_xchg(&buf->lost, 0)); - - if (!event->hw.state) { - int ret; - - buf = perf_aux_output_begin(&pt->handle, event); - if (!buf) { - event->hw.state = PERF_HES_STOPPED; - return; - } - - pt_buffer_reset_offsets(buf, pt->handle.head); - /* snapshot counters don't use PMI, so it's safe */ - ret = pt_buffer_reset_markers(buf, &pt->handle); - if (ret) { - perf_aux_output_end(&pt->handle, 0, true); - return; - } - - pt_config_buffer(buf->cur->table, buf->cur_idx, - buf->output_off); - pt_config(event); - } -} - -/* - * PMU callbacks - */ - -static void pt_event_start(struct perf_event *event, int mode) -{ - struct pt *pt = this_cpu_ptr(&pt_ctx); - struct pt_buffer *buf = perf_get_aux(&pt->handle); - - if (!buf || pt_buffer_is_full(buf, pt)) { - event->hw.state = PERF_HES_STOPPED; - return; - } - - ACCESS_ONCE(pt->handle_nmi) = 1; - event->hw.state = 0; - - pt_config_buffer(buf->cur->table, buf->cur_idx, - buf->output_off); - pt_config(event); -} - -static void pt_event_stop(struct perf_event *event, int mode) -{ - struct pt *pt = this_cpu_ptr(&pt_ctx); - - /* - * Protect against the PMI racing with disabling wrmsr, - * see comment in intel_pt_interrupt(). - */ - ACCESS_ONCE(pt->handle_nmi) = 0; - pt_config_start(false); - - if (event->hw.state == PERF_HES_STOPPED) - return; - - event->hw.state = PERF_HES_STOPPED; - - if (mode & PERF_EF_UPDATE) { - struct pt_buffer *buf = perf_get_aux(&pt->handle); - - if (!buf) - return; - - if (WARN_ON_ONCE(pt->handle.event != event)) - return; - - pt_read_offset(buf); - - pt_handle_status(pt); - - pt_update_head(pt); - } -} - -static void pt_event_del(struct perf_event *event, int mode) -{ - struct pt *pt = this_cpu_ptr(&pt_ctx); - struct pt_buffer *buf; - - pt_event_stop(event, PERF_EF_UPDATE); - - buf = perf_get_aux(&pt->handle); - - if (buf) { - if (buf->snapshot) - pt->handle.head = - local_xchg(&buf->data_size, - buf->nr_pages << PAGE_SHIFT); - perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0), - local_xchg(&buf->lost, 0)); - } -} - -static int pt_event_add(struct perf_event *event, int mode) -{ - struct pt_buffer *buf; - struct pt *pt = this_cpu_ptr(&pt_ctx); - struct hw_perf_event *hwc = &event->hw; - int ret = -EBUSY; - - if (pt->handle.event) - goto fail; - - buf = perf_aux_output_begin(&pt->handle, event); - ret = -EINVAL; - if (!buf) - goto fail_stop; - - pt_buffer_reset_offsets(buf, pt->handle.head); - if (!buf->snapshot) { - ret = pt_buffer_reset_markers(buf, &pt->handle); - if (ret) - goto fail_end_stop; - } - - if (mode & PERF_EF_START) { - pt_event_start(event, 0); - ret = -EBUSY; - if (hwc->state == PERF_HES_STOPPED) - goto fail_end_stop; - } else { - hwc->state = PERF_HES_STOPPED; - } - - return 0; - -fail_end_stop: - perf_aux_output_end(&pt->handle, 0, true); -fail_stop: - hwc->state = PERF_HES_STOPPED; -fail: - return ret; -} - -static void pt_event_read(struct perf_event *event) -{ -} - -static void pt_event_destroy(struct perf_event *event) -{ - x86_del_exclusive(x86_lbr_exclusive_pt); -} - -static int pt_event_init(struct perf_event *event) -{ - if (event->attr.type != pt_pmu.pmu.type) - return -ENOENT; - - if (!pt_event_valid(event)) - return -EINVAL; - - if (x86_add_exclusive(x86_lbr_exclusive_pt)) - return -EBUSY; - - event->destroy = pt_event_destroy; - - return 0; -} - -void cpu_emergency_stop_pt(void) -{ - struct pt *pt = this_cpu_ptr(&pt_ctx); - - if (pt->handle.event) - pt_event_stop(pt->handle.event, PERF_EF_UPDATE); -} - -static __init int pt_init(void) -{ - int ret, cpu, prior_warn = 0; - - BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE); - - if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) - return -ENODEV; - - get_online_cpus(); - for_each_online_cpu(cpu) { - u64 ctl; - - ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl); - if (!ret && (ctl & RTIT_CTL_TRACEEN)) - prior_warn++; - } - put_online_cpus(); - - if (prior_warn) { - x86_add_exclusive(x86_lbr_exclusive_pt); - pr_warn("PT is enabled at boot time, doing nothing\n"); - - return -EBUSY; - } - - ret = pt_pmu_hw_init(); - if (ret) - return ret; - - if (!pt_cap_get(PT_CAP_topa_output)) { - pr_warn("ToPA output is not supported on this CPU\n"); - return -ENODEV; - } - - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) - pt_pmu.pmu.capabilities = - PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF; - - pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE; - pt_pmu.pmu.attr_groups = pt_attr_groups; - pt_pmu.pmu.task_ctx_nr = perf_sw_context; - pt_pmu.pmu.event_init = pt_event_init; - pt_pmu.pmu.add = pt_event_add; - pt_pmu.pmu.del = pt_event_del; - pt_pmu.pmu.start = pt_event_start; - pt_pmu.pmu.stop = pt_event_stop; - pt_pmu.pmu.read = pt_event_read; - pt_pmu.pmu.setup_aux = pt_buffer_setup_aux; - pt_pmu.pmu.free_aux = pt_buffer_free_aux; - ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1); - - return ret; -} -arch_initcall(pt_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c deleted file mode 100644 index 24a351ad628d..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ /dev/null @@ -1,783 +0,0 @@ -/* - * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters - * Copyright (C) 2013 Google, Inc., Stephane Eranian - * - * Intel RAPL interface is specified in the IA-32 Manual Vol3b - * section 14.7.1 (September 2013) - * - * RAPL provides more controls than just reporting energy consumption - * however here we only expose the 3 energy consumption free running - * counters (pp0, pkg, dram). - * - * Each of those counters increments in a power unit defined by the - * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules - * but it can vary. - * - * Counter to rapl events mappings: - * - * pp0 counter: consumption of all physical cores (power plane 0) - * event: rapl_energy_cores - * perf code: 0x1 - * - * pkg counter: consumption of the whole processor package - * event: rapl_energy_pkg - * perf code: 0x2 - * - * dram counter: consumption of the dram domain (servers only) - * event: rapl_energy_dram - * perf code: 0x3 - * - * dram counter: consumption of the builtin-gpu domain (client only) - * event: rapl_energy_gpu - * perf code: 0x4 - * - * We manage those counters as free running (read-only). They may be - * use simultaneously by other tools, such as turbostat. - * - * The events only support system-wide mode counting. There is no - * sampling support because it does not make sense and is not - * supported by the RAPL hardware. - * - * Because we want to avoid floating-point operations in the kernel, - * the events are all reported in fixed point arithmetic (32.32). - * Tools must adjust the counts to convert them to Watts using - * the duration of the measurement. Tools may use a function such as - * ldexp(raw_count, -32); - */ -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/perf_event.h> -#include <asm/cpu_device_id.h> -#include "perf_event.h" - -/* - * RAPL energy status counters - */ -#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */ -#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */ -#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */ -#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */ -#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */ -#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ -#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ -#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ - -#define NR_RAPL_DOMAINS 0x4 -static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { - "pp0-core", - "package", - "dram", - "pp1-gpu", -}; - -/* Clients have PP0, PKG */ -#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\ - 1<<RAPL_IDX_PKG_NRG_STAT|\ - 1<<RAPL_IDX_PP1_NRG_STAT) - -/* Servers have PP0, PKG, RAM */ -#define RAPL_IDX_SRV (1<<RAPL_IDX_PP0_NRG_STAT|\ - 1<<RAPL_IDX_PKG_NRG_STAT|\ - 1<<RAPL_IDX_RAM_NRG_STAT) - -/* Servers have PP0, PKG, RAM, PP1 */ -#define RAPL_IDX_HSW (1<<RAPL_IDX_PP0_NRG_STAT|\ - 1<<RAPL_IDX_PKG_NRG_STAT|\ - 1<<RAPL_IDX_RAM_NRG_STAT|\ - 1<<RAPL_IDX_PP1_NRG_STAT) - -/* Knights Landing has PKG, RAM */ -#define RAPL_IDX_KNL (1<<RAPL_IDX_PKG_NRG_STAT|\ - 1<<RAPL_IDX_RAM_NRG_STAT) - -/* - * event code: LSB 8 bits, passed in attr->config - * any other bit is reserved - */ -#define RAPL_EVENT_MASK 0xFFULL - -#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \ -static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, \ - char *page) \ -{ \ - BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ - return sprintf(page, _format "\n"); \ -} \ -static struct kobj_attribute format_attr_##_var = \ - __ATTR(_name, 0444, __rapl_##_var##_show, NULL) - -#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ - -#define RAPL_EVENT_ATTR_STR(_name, v, str) \ -static struct perf_pmu_events_attr event_attr_##v = { \ - .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ - .id = 0, \ - .event_str = str, \ -}; - -struct rapl_pmu { - spinlock_t lock; - int n_active; /* number of active events */ - struct list_head active_list; - struct pmu *pmu; /* pointer to rapl_pmu_class */ - ktime_t timer_interval; /* in ktime_t unit */ - struct hrtimer hrtimer; -}; - -static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */ -static struct pmu rapl_pmu_class; -static cpumask_t rapl_cpu_mask; -static int rapl_cntr_mask; - -static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); -static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); - -static struct x86_pmu_quirk *rapl_quirks; -static inline u64 rapl_read_counter(struct perf_event *event) -{ - u64 raw; - rdmsrl(event->hw.event_base, raw); - return raw; -} - -#define rapl_add_quirk(func_) \ -do { \ - static struct x86_pmu_quirk __quirk __initdata = { \ - .func = func_, \ - }; \ - __quirk.next = rapl_quirks; \ - rapl_quirks = &__quirk; \ -} while (0) - -static inline u64 rapl_scale(u64 v, int cfg) -{ - if (cfg > NR_RAPL_DOMAINS) { - pr_warn("invalid domain %d, failed to scale data\n", cfg); - return v; - } - /* - * scale delta to smallest unit (1/2^32) - * users must then scale back: count * 1/(1e9*2^32) to get Joules - * or use ldexp(count, -32). - * Watts = Joules/Time delta - */ - return v << (32 - rapl_hw_unit[cfg - 1]); -} - -static u64 rapl_event_update(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 prev_raw_count, new_raw_count; - s64 delta, sdelta; - int shift = RAPL_CNTR_WIDTH; - -again: - prev_raw_count = local64_read(&hwc->prev_count); - rdmsrl(event->hw.event_base, new_raw_count); - - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) { - cpu_relax(); - goto again; - } - - /* - * Now we have the new raw value and have updated the prev - * timestamp already. We can now calculate the elapsed delta - * (event-)time and add that to the generic event. - * - * Careful, not all hw sign-extends above the physical width - * of the count. - */ - delta = (new_raw_count << shift) - (prev_raw_count << shift); - delta >>= shift; - - sdelta = rapl_scale(delta, event->hw.config); - - local64_add(sdelta, &event->count); - - return new_raw_count; -} - -static void rapl_start_hrtimer(struct rapl_pmu *pmu) -{ - hrtimer_start(&pmu->hrtimer, pmu->timer_interval, - HRTIMER_MODE_REL_PINNED); -} - -static void rapl_stop_hrtimer(struct rapl_pmu *pmu) -{ - hrtimer_cancel(&pmu->hrtimer); -} - -static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) -{ - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); - struct perf_event *event; - unsigned long flags; - - if (!pmu->n_active) - return HRTIMER_NORESTART; - - spin_lock_irqsave(&pmu->lock, flags); - - list_for_each_entry(event, &pmu->active_list, active_entry) { - rapl_event_update(event); - } - - spin_unlock_irqrestore(&pmu->lock, flags); - - hrtimer_forward_now(hrtimer, pmu->timer_interval); - - return HRTIMER_RESTART; -} - -static void rapl_hrtimer_init(struct rapl_pmu *pmu) -{ - struct hrtimer *hr = &pmu->hrtimer; - - hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hr->function = rapl_hrtimer_handle; -} - -static void __rapl_pmu_event_start(struct rapl_pmu *pmu, - struct perf_event *event) -{ - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) - return; - - event->hw.state = 0; - - list_add_tail(&event->active_entry, &pmu->active_list); - - local64_set(&event->hw.prev_count, rapl_read_counter(event)); - - pmu->n_active++; - if (pmu->n_active == 1) - rapl_start_hrtimer(pmu); -} - -static void rapl_pmu_event_start(struct perf_event *event, int mode) -{ - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); - unsigned long flags; - - spin_lock_irqsave(&pmu->lock, flags); - __rapl_pmu_event_start(pmu, event); - spin_unlock_irqrestore(&pmu->lock, flags); -} - -static void rapl_pmu_event_stop(struct perf_event *event, int mode) -{ - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); - struct hw_perf_event *hwc = &event->hw; - unsigned long flags; - - spin_lock_irqsave(&pmu->lock, flags); - - /* mark event as deactivated and stopped */ - if (!(hwc->state & PERF_HES_STOPPED)) { - WARN_ON_ONCE(pmu->n_active <= 0); - pmu->n_active--; - if (pmu->n_active == 0) - rapl_stop_hrtimer(pmu); - - list_del(&event->active_entry); - - WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); - hwc->state |= PERF_HES_STOPPED; - } - - /* check if update of sw counter is necessary */ - if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - /* - * Drain the remaining delta count out of a event - * that we are disabling: - */ - rapl_event_update(event); - hwc->state |= PERF_HES_UPTODATE; - } - - spin_unlock_irqrestore(&pmu->lock, flags); -} - -static int rapl_pmu_event_add(struct perf_event *event, int mode) -{ - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); - struct hw_perf_event *hwc = &event->hw; - unsigned long flags; - - spin_lock_irqsave(&pmu->lock, flags); - - hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - - if (mode & PERF_EF_START) - __rapl_pmu_event_start(pmu, event); - - spin_unlock_irqrestore(&pmu->lock, flags); - - return 0; -} - -static void rapl_pmu_event_del(struct perf_event *event, int flags) -{ - rapl_pmu_event_stop(event, PERF_EF_UPDATE); -} - -static int rapl_pmu_event_init(struct perf_event *event) -{ - u64 cfg = event->attr.config & RAPL_EVENT_MASK; - int bit, msr, ret = 0; - - /* only look at RAPL events */ - if (event->attr.type != rapl_pmu_class.type) - return -ENOENT; - - /* check only supported bits are set */ - if (event->attr.config & ~RAPL_EVENT_MASK) - return -EINVAL; - - /* - * check event is known (determines counter) - */ - switch (cfg) { - case INTEL_RAPL_PP0: - bit = RAPL_IDX_PP0_NRG_STAT; - msr = MSR_PP0_ENERGY_STATUS; - break; - case INTEL_RAPL_PKG: - bit = RAPL_IDX_PKG_NRG_STAT; - msr = MSR_PKG_ENERGY_STATUS; - break; - case INTEL_RAPL_RAM: - bit = RAPL_IDX_RAM_NRG_STAT; - msr = MSR_DRAM_ENERGY_STATUS; - break; - case INTEL_RAPL_PP1: - bit = RAPL_IDX_PP1_NRG_STAT; - msr = MSR_PP1_ENERGY_STATUS; - break; - default: - return -EINVAL; - } - /* check event supported */ - if (!(rapl_cntr_mask & (1 << bit))) - return -EINVAL; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - /* must be done before validate_group */ - event->hw.event_base = msr; - event->hw.config = cfg; - event->hw.idx = bit; - - return ret; -} - -static void rapl_pmu_event_read(struct perf_event *event) -{ - rapl_event_update(event); -} - -static ssize_t rapl_get_attr_cpumask(struct device *dev, - struct device_attribute *attr, char *buf) -{ - return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask); -} - -static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); - -static struct attribute *rapl_pmu_attrs[] = { - &dev_attr_cpumask.attr, - NULL, -}; - -static struct attribute_group rapl_pmu_attr_group = { - .attrs = rapl_pmu_attrs, -}; - -RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); -RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); -RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); -RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); - -RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); -RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); -RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); -RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); - -/* - * we compute in 0.23 nJ increments regardless of MSR - */ -RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10"); -RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10"); -RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); -RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); - -static struct attribute *rapl_events_srv_attr[] = { - EVENT_PTR(rapl_cores), - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_ram), - - EVENT_PTR(rapl_cores_unit), - EVENT_PTR(rapl_pkg_unit), - EVENT_PTR(rapl_ram_unit), - - EVENT_PTR(rapl_cores_scale), - EVENT_PTR(rapl_pkg_scale), - EVENT_PTR(rapl_ram_scale), - NULL, -}; - -static struct attribute *rapl_events_cln_attr[] = { - EVENT_PTR(rapl_cores), - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_gpu), - - EVENT_PTR(rapl_cores_unit), - EVENT_PTR(rapl_pkg_unit), - EVENT_PTR(rapl_gpu_unit), - - EVENT_PTR(rapl_cores_scale), - EVENT_PTR(rapl_pkg_scale), - EVENT_PTR(rapl_gpu_scale), - NULL, -}; - -static struct attribute *rapl_events_hsw_attr[] = { - EVENT_PTR(rapl_cores), - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_gpu), - EVENT_PTR(rapl_ram), - - EVENT_PTR(rapl_cores_unit), - EVENT_PTR(rapl_pkg_unit), - EVENT_PTR(rapl_gpu_unit), - EVENT_PTR(rapl_ram_unit), - - EVENT_PTR(rapl_cores_scale), - EVENT_PTR(rapl_pkg_scale), - EVENT_PTR(rapl_gpu_scale), - EVENT_PTR(rapl_ram_scale), - NULL, -}; - -static struct attribute *rapl_events_knl_attr[] = { - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_ram), - - EVENT_PTR(rapl_pkg_unit), - EVENT_PTR(rapl_ram_unit), - - EVENT_PTR(rapl_pkg_scale), - EVENT_PTR(rapl_ram_scale), - NULL, -}; - -static struct attribute_group rapl_pmu_events_group = { - .name = "events", - .attrs = NULL, /* patched at runtime */ -}; - -DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7"); -static struct attribute *rapl_formats_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group rapl_pmu_format_group = { - .name = "format", - .attrs = rapl_formats_attr, -}; - -const struct attribute_group *rapl_attr_groups[] = { - &rapl_pmu_attr_group, - &rapl_pmu_format_group, - &rapl_pmu_events_group, - NULL, -}; - -static struct pmu rapl_pmu_class = { - .attr_groups = rapl_attr_groups, - .task_ctx_nr = perf_invalid_context, /* system-wide only */ - .event_init = rapl_pmu_event_init, - .add = rapl_pmu_event_add, /* must have */ - .del = rapl_pmu_event_del, /* must have */ - .start = rapl_pmu_event_start, - .stop = rapl_pmu_event_stop, - .read = rapl_pmu_event_read, -}; - -static void rapl_cpu_exit(int cpu) -{ - struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); - int i, phys_id = topology_physical_package_id(cpu); - int target = -1; - - /* find a new cpu on same package */ - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (phys_id == topology_physical_package_id(i)) { - target = i; - break; - } - } - /* - * clear cpu from cpumask - * if was set in cpumask and still some cpu on package, - * then move to new cpu - */ - if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0) - cpumask_set_cpu(target, &rapl_cpu_mask); - - WARN_ON(cpumask_empty(&rapl_cpu_mask)); - /* - * migrate events and context to new cpu - */ - if (target >= 0) - perf_pmu_migrate_context(pmu->pmu, cpu, target); - - /* cancel overflow polling timer for CPU */ - rapl_stop_hrtimer(pmu); -} - -static void rapl_cpu_init(int cpu) -{ - int i, phys_id = topology_physical_package_id(cpu); - - /* check if phys_is is already covered */ - for_each_cpu(i, &rapl_cpu_mask) { - if (phys_id == topology_physical_package_id(i)) - return; - } - /* was not found, so add it */ - cpumask_set_cpu(cpu, &rapl_cpu_mask); -} - -static __init void rapl_hsw_server_quirk(void) -{ - /* - * DRAM domain on HSW server has fixed energy unit which can be - * different than the unit from power unit MSR. - * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 - * of 2. Datasheet, September 2014, Reference Number: 330784-001 " - */ - rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16; -} - -static int rapl_cpu_prepare(int cpu) -{ - struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); - int phys_id = topology_physical_package_id(cpu); - u64 ms; - - if (pmu) - return 0; - - if (phys_id < 0) - return -1; - - pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); - if (!pmu) - return -1; - spin_lock_init(&pmu->lock); - - INIT_LIST_HEAD(&pmu->active_list); - - pmu->pmu = &rapl_pmu_class; - - /* - * use reference of 200W for scaling the timeout - * to avoid missing counter overflows. - * 200W = 200 Joules/sec - * divide interval by 2 to avoid lockstep (2 * 100) - * if hw unit is 32, then we use 2 ms 1/200/2 - */ - if (rapl_hw_unit[0] < 32) - ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1)); - else - ms = 2; - - pmu->timer_interval = ms_to_ktime(ms); - - rapl_hrtimer_init(pmu); - - /* set RAPL pmu for this cpu for now */ - per_cpu(rapl_pmu, cpu) = pmu; - per_cpu(rapl_pmu_to_free, cpu) = NULL; - - return 0; -} - -static void rapl_cpu_kfree(int cpu) -{ - struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu); - - kfree(pmu); - - per_cpu(rapl_pmu_to_free, cpu) = NULL; -} - -static int rapl_cpu_dying(int cpu) -{ - struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); - - if (!pmu) - return 0; - - per_cpu(rapl_pmu, cpu) = NULL; - - per_cpu(rapl_pmu_to_free, cpu) = pmu; - - return 0; -} - -static int rapl_cpu_notifier(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - rapl_cpu_prepare(cpu); - break; - case CPU_STARTING: - rapl_cpu_init(cpu); - break; - case CPU_UP_CANCELED: - case CPU_DYING: - rapl_cpu_dying(cpu); - break; - case CPU_ONLINE: - case CPU_DEAD: - rapl_cpu_kfree(cpu); - break; - case CPU_DOWN_PREPARE: - rapl_cpu_exit(cpu); - break; - default: - break; - } - - return NOTIFY_OK; -} - -static int rapl_check_hw_unit(void) -{ - u64 msr_rapl_power_unit_bits; - int i; - - /* protect rdmsrl() to handle virtualization */ - if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) - return -1; - for (i = 0; i < NR_RAPL_DOMAINS; i++) - rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; - - return 0; -} - -static const struct x86_cpu_id rapl_cpu_match[] = { - [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, - [1] = {}, -}; - -static int __init rapl_pmu_init(void) -{ - struct rapl_pmu *pmu; - int cpu, ret; - struct x86_pmu_quirk *quirk; - int i; - - /* - * check for Intel processor family 6 - */ - if (!x86_match_cpu(rapl_cpu_match)) - return 0; - - /* check supported CPU */ - switch (boot_cpu_data.x86_model) { - case 42: /* Sandy Bridge */ - case 58: /* Ivy Bridge */ - rapl_cntr_mask = RAPL_IDX_CLN; - rapl_pmu_events_group.attrs = rapl_events_cln_attr; - break; - case 63: /* Haswell-Server */ - rapl_add_quirk(rapl_hsw_server_quirk); - rapl_cntr_mask = RAPL_IDX_SRV; - rapl_pmu_events_group.attrs = rapl_events_srv_attr; - break; - case 60: /* Haswell */ - case 69: /* Haswell-Celeron */ - case 61: /* Broadwell */ - rapl_cntr_mask = RAPL_IDX_HSW; - rapl_pmu_events_group.attrs = rapl_events_hsw_attr; - break; - case 45: /* Sandy Bridge-EP */ - case 62: /* IvyTown */ - rapl_cntr_mask = RAPL_IDX_SRV; - rapl_pmu_events_group.attrs = rapl_events_srv_attr; - break; - case 87: /* Knights Landing */ - rapl_add_quirk(rapl_hsw_server_quirk); - rapl_cntr_mask = RAPL_IDX_KNL; - rapl_pmu_events_group.attrs = rapl_events_knl_attr; - - default: - /* unsupported */ - return 0; - } - ret = rapl_check_hw_unit(); - if (ret) - return ret; - - /* run cpu model quirks */ - for (quirk = rapl_quirks; quirk; quirk = quirk->next) - quirk->func(); - cpu_notifier_register_begin(); - - for_each_online_cpu(cpu) { - ret = rapl_cpu_prepare(cpu); - if (ret) - goto out; - rapl_cpu_init(cpu); - } - - __perf_cpu_notifier(rapl_cpu_notifier); - - ret = perf_pmu_register(&rapl_pmu_class, "power", -1); - if (WARN_ON(ret)) { - pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); - cpu_notifier_register_done(); - return -1; - } - - pmu = __this_cpu_read(rapl_pmu); - - pr_info("RAPL PMU detected," - " API unit is 2^-32 Joules," - " %d fixed counters" - " %llu ms ovfl timer\n", - hweight32(rapl_cntr_mask), - ktime_to_ms(pmu->timer_interval)); - for (i = 0; i < NR_RAPL_DOMAINS; i++) { - if (rapl_cntr_mask & (1 << i)) { - pr_info("hw unit of domain %s 2^-%d Joules\n", - rapl_domain_names[i], rapl_hw_unit[i]); - } - } -out: - cpu_notifier_register_done(); - - return 0; -} -device_initcall(rapl_pmu_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c deleted file mode 100644 index 3bf41d413775..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ /dev/null @@ -1,1401 +0,0 @@ -#include "perf_event_intel_uncore.h" - -static struct intel_uncore_type *empty_uncore[] = { NULL, }; -struct intel_uncore_type **uncore_msr_uncores = empty_uncore; -struct intel_uncore_type **uncore_pci_uncores = empty_uncore; - -static bool pcidrv_registered; -struct pci_driver *uncore_pci_driver; -/* pci bus to socket mapping */ -DEFINE_RAW_SPINLOCK(pci2phy_map_lock); -struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); -struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; - -static DEFINE_RAW_SPINLOCK(uncore_box_lock); -/* mask of cpus that collect uncore events */ -static cpumask_t uncore_cpu_mask; - -/* constraint for the fixed counter */ -static struct event_constraint uncore_constraint_fixed = - EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL); -struct event_constraint uncore_constraint_empty = - EVENT_CONSTRAINT(0, 0, 0); - -int uncore_pcibus_to_physid(struct pci_bus *bus) -{ - struct pci2phy_map *map; - int phys_id = -1; - - raw_spin_lock(&pci2phy_map_lock); - list_for_each_entry(map, &pci2phy_map_head, list) { - if (map->segment == pci_domain_nr(bus)) { - phys_id = map->pbus_to_physid[bus->number]; - break; - } - } - raw_spin_unlock(&pci2phy_map_lock); - - return phys_id; -} - -struct pci2phy_map *__find_pci2phy_map(int segment) -{ - struct pci2phy_map *map, *alloc = NULL; - int i; - - lockdep_assert_held(&pci2phy_map_lock); - -lookup: - list_for_each_entry(map, &pci2phy_map_head, list) { - if (map->segment == segment) - goto end; - } - - if (!alloc) { - raw_spin_unlock(&pci2phy_map_lock); - alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL); - raw_spin_lock(&pci2phy_map_lock); - - if (!alloc) - return NULL; - - goto lookup; - } - - map = alloc; - alloc = NULL; - map->segment = segment; - for (i = 0; i < 256; i++) - map->pbus_to_physid[i] = -1; - list_add_tail(&map->list, &pci2phy_map_head); - -end: - kfree(alloc); - return map; -} - -ssize_t uncore_event_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct uncore_event_desc *event = - container_of(attr, struct uncore_event_desc, attr); - return sprintf(buf, "%s", event->config); -} - -struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) -{ - return container_of(event->pmu, struct intel_uncore_pmu, pmu); -} - -struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) -{ - struct intel_uncore_box *box; - - box = *per_cpu_ptr(pmu->box, cpu); - if (box) - return box; - - raw_spin_lock(&uncore_box_lock); - /* Recheck in lock to handle races. */ - if (*per_cpu_ptr(pmu->box, cpu)) - goto out; - list_for_each_entry(box, &pmu->box_list, list) { - if (box->phys_id == topology_physical_package_id(cpu)) { - atomic_inc(&box->refcnt); - *per_cpu_ptr(pmu->box, cpu) = box; - break; - } - } -out: - raw_spin_unlock(&uncore_box_lock); - - return *per_cpu_ptr(pmu->box, cpu); -} - -struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) -{ - /* - * perf core schedules event on the basis of cpu, uncore events are - * collected by one of the cpus inside a physical package. - */ - return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); -} - -u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) -{ - u64 count; - - rdmsrl(event->hw.event_base, count); - - return count; -} - -/* - * generic get constraint function for shared match/mask registers. - */ -struct event_constraint * -uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct intel_uncore_extra_reg *er; - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - unsigned long flags; - bool ok = false; - - /* - * reg->alloc can be set due to existing state, so for fake box we - * need to ignore this, otherwise we might fail to allocate proper - * fake state for this extra reg constraint. - */ - if (reg1->idx == EXTRA_REG_NONE || - (!uncore_box_is_fake(box) && reg1->alloc)) - return NULL; - - er = &box->shared_regs[reg1->idx]; - raw_spin_lock_irqsave(&er->lock, flags); - if (!atomic_read(&er->ref) || - (er->config1 == reg1->config && er->config2 == reg2->config)) { - atomic_inc(&er->ref); - er->config1 = reg1->config; - er->config2 = reg2->config; - ok = true; - } - raw_spin_unlock_irqrestore(&er->lock, flags); - - if (ok) { - if (!uncore_box_is_fake(box)) - reg1->alloc = 1; - return NULL; - } - - return &uncore_constraint_empty; -} - -void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct intel_uncore_extra_reg *er; - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - - /* - * Only put constraint if extra reg was actually allocated. Also - * takes care of event which do not use an extra shared reg. - * - * Also, if this is a fake box we shouldn't touch any event state - * (reg->alloc) and we don't care about leaving inconsistent box - * state either since it will be thrown out. - */ - if (uncore_box_is_fake(box) || !reg1->alloc) - return; - - er = &box->shared_regs[reg1->idx]; - atomic_dec(&er->ref); - reg1->alloc = 0; -} - -u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx) -{ - struct intel_uncore_extra_reg *er; - unsigned long flags; - u64 config; - - er = &box->shared_regs[idx]; - - raw_spin_lock_irqsave(&er->lock, flags); - config = er->config; - raw_spin_unlock_irqrestore(&er->lock, flags); - - return config; -} - -static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx) -{ - struct hw_perf_event *hwc = &event->hw; - - hwc->idx = idx; - hwc->last_tag = ++box->tags[idx]; - - if (hwc->idx == UNCORE_PMC_IDX_FIXED) { - hwc->event_base = uncore_fixed_ctr(box); - hwc->config_base = uncore_fixed_ctl(box); - return; - } - - hwc->config_base = uncore_event_ctl(box, hwc->idx); - hwc->event_base = uncore_perf_ctr(box, hwc->idx); -} - -void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event) -{ - u64 prev_count, new_count, delta; - int shift; - - if (event->hw.idx >= UNCORE_PMC_IDX_FIXED) - shift = 64 - uncore_fixed_ctr_bits(box); - else - shift = 64 - uncore_perf_ctr_bits(box); - - /* the hrtimer might modify the previous event value */ -again: - prev_count = local64_read(&event->hw.prev_count); - new_count = uncore_read_counter(box, event); - if (local64_xchg(&event->hw.prev_count, new_count) != prev_count) - goto again; - - delta = (new_count << shift) - (prev_count << shift); - delta >>= shift; - - local64_add(delta, &event->count); -} - -/* - * The overflow interrupt is unavailable for SandyBridge-EP, is broken - * for SandyBridge. So we use hrtimer to periodically poll the counter - * to avoid overflow. - */ -static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) -{ - struct intel_uncore_box *box; - struct perf_event *event; - unsigned long flags; - int bit; - - box = container_of(hrtimer, struct intel_uncore_box, hrtimer); - if (!box->n_active || box->cpu != smp_processor_id()) - return HRTIMER_NORESTART; - /* - * disable local interrupt to prevent uncore_pmu_event_start/stop - * to interrupt the update process - */ - local_irq_save(flags); - - /* - * handle boxes with an active event list as opposed to active - * counters - */ - list_for_each_entry(event, &box->active_list, active_entry) { - uncore_perf_event_update(box, event); - } - - for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX) - uncore_perf_event_update(box, box->events[bit]); - - local_irq_restore(flags); - - hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration)); - return HRTIMER_RESTART; -} - -void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) -{ - hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration), - HRTIMER_MODE_REL_PINNED); -} - -void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) -{ - hrtimer_cancel(&box->hrtimer); -} - -static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) -{ - hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - box->hrtimer.function = uncore_pmu_hrtimer; -} - -static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int node) -{ - struct intel_uncore_box *box; - int i, size; - - size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg); - - box = kzalloc_node(size, GFP_KERNEL, node); - if (!box) - return NULL; - - for (i = 0; i < type->num_shared_regs; i++) - raw_spin_lock_init(&box->shared_regs[i].lock); - - uncore_pmu_init_hrtimer(box); - atomic_set(&box->refcnt, 1); - box->cpu = -1; - box->phys_id = -1; - - /* set default hrtimer timeout */ - box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL; - - INIT_LIST_HEAD(&box->active_list); - - return box; -} - -/* - * Using uncore_pmu_event_init pmu event_init callback - * as a detection point for uncore events. - */ -static int uncore_pmu_event_init(struct perf_event *event); - -static bool is_uncore_event(struct perf_event *event) -{ - return event->pmu->event_init == uncore_pmu_event_init; -} - -static int -uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp) -{ - struct perf_event *event; - int n, max_count; - - max_count = box->pmu->type->num_counters; - if (box->pmu->type->fixed_ctl) - max_count++; - - if (box->n_events >= max_count) - return -EINVAL; - - n = box->n_events; - - if (is_uncore_event(leader)) { - box->event_list[n] = leader; - n++; - } - - if (!dogrp) - return n; - - list_for_each_entry(event, &leader->sibling_list, group_entry) { - if (!is_uncore_event(event) || - event->state <= PERF_EVENT_STATE_OFF) - continue; - - if (n >= max_count) - return -EINVAL; - - box->event_list[n] = event; - n++; - } - return n; -} - -static struct event_constraint * -uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct intel_uncore_type *type = box->pmu->type; - struct event_constraint *c; - - if (type->ops->get_constraint) { - c = type->ops->get_constraint(box, event); - if (c) - return c; - } - - if (event->attr.config == UNCORE_FIXED_EVENT) - return &uncore_constraint_fixed; - - if (type->constraints) { - for_each_event_constraint(c, type->constraints) { - if ((event->hw.config & c->cmask) == c->code) - return c; - } - } - - return &type->unconstrainted; -} - -static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - if (box->pmu->type->ops->put_constraint) - box->pmu->type->ops->put_constraint(box, event); -} - -static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n) -{ - unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; - struct event_constraint *c; - int i, wmin, wmax, ret = 0; - struct hw_perf_event *hwc; - - bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX); - - for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) { - c = uncore_get_event_constraint(box, box->event_list[i]); - box->event_constraint[i] = c; - wmin = min(wmin, c->weight); - wmax = max(wmax, c->weight); - } - - /* fastpath, try to reuse previous register */ - for (i = 0; i < n; i++) { - hwc = &box->event_list[i]->hw; - c = box->event_constraint[i]; - - /* never assigned */ - if (hwc->idx == -1) - break; - - /* constraint still honored */ - if (!test_bit(hwc->idx, c->idxmsk)) - break; - - /* not already used */ - if (test_bit(hwc->idx, used_mask)) - break; - - __set_bit(hwc->idx, used_mask); - if (assign) - assign[i] = hwc->idx; - } - /* slow path */ - if (i != n) - ret = perf_assign_events(box->event_constraint, n, - wmin, wmax, n, assign); - - if (!assign || ret) { - for (i = 0; i < n; i++) - uncore_put_event_constraint(box, box->event_list[i]); - } - return ret ? -EINVAL : 0; -} - -static void uncore_pmu_event_start(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - int idx = event->hw.idx; - - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) - return; - - if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX)) - return; - - event->hw.state = 0; - box->events[idx] = event; - box->n_active++; - __set_bit(idx, box->active_mask); - - local64_set(&event->hw.prev_count, uncore_read_counter(box, event)); - uncore_enable_event(box, event); - - if (box->n_active == 1) { - uncore_enable_box(box); - uncore_pmu_start_hrtimer(box); - } -} - -static void uncore_pmu_event_stop(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - struct hw_perf_event *hwc = &event->hw; - - if (__test_and_clear_bit(hwc->idx, box->active_mask)) { - uncore_disable_event(box, event); - box->n_active--; - box->events[hwc->idx] = NULL; - WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); - hwc->state |= PERF_HES_STOPPED; - - if (box->n_active == 0) { - uncore_disable_box(box); - uncore_pmu_cancel_hrtimer(box); - } - } - - if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - /* - * Drain the remaining delta count out of a event - * that we are disabling: - */ - uncore_perf_event_update(box, event); - hwc->state |= PERF_HES_UPTODATE; - } -} - -static int uncore_pmu_event_add(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - struct hw_perf_event *hwc = &event->hw; - int assign[UNCORE_PMC_IDX_MAX]; - int i, n, ret; - - if (!box) - return -ENODEV; - - ret = n = uncore_collect_events(box, event, false); - if (ret < 0) - return ret; - - hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - if (!(flags & PERF_EF_START)) - hwc->state |= PERF_HES_ARCH; - - ret = uncore_assign_events(box, assign, n); - if (ret) - return ret; - - /* save events moving to new counters */ - for (i = 0; i < box->n_events; i++) { - event = box->event_list[i]; - hwc = &event->hw; - - if (hwc->idx == assign[i] && - hwc->last_tag == box->tags[assign[i]]) - continue; - /* - * Ensure we don't accidentally enable a stopped - * counter simply because we rescheduled. - */ - if (hwc->state & PERF_HES_STOPPED) - hwc->state |= PERF_HES_ARCH; - - uncore_pmu_event_stop(event, PERF_EF_UPDATE); - } - - /* reprogram moved events into new counters */ - for (i = 0; i < n; i++) { - event = box->event_list[i]; - hwc = &event->hw; - - if (hwc->idx != assign[i] || - hwc->last_tag != box->tags[assign[i]]) - uncore_assign_hw_event(box, event, assign[i]); - else if (i < box->n_events) - continue; - - if (hwc->state & PERF_HES_ARCH) - continue; - - uncore_pmu_event_start(event, 0); - } - box->n_events = n; - - return 0; -} - -static void uncore_pmu_event_del(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - int i; - - uncore_pmu_event_stop(event, PERF_EF_UPDATE); - - for (i = 0; i < box->n_events; i++) { - if (event == box->event_list[i]) { - uncore_put_event_constraint(box, event); - - while (++i < box->n_events) - box->event_list[i - 1] = box->event_list[i]; - - --box->n_events; - break; - } - } - - event->hw.idx = -1; - event->hw.last_tag = ~0ULL; -} - -void uncore_pmu_event_read(struct perf_event *event) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - uncore_perf_event_update(box, event); -} - -/* - * validation ensures the group can be loaded onto the - * PMU if it was the only group available. - */ -static int uncore_validate_group(struct intel_uncore_pmu *pmu, - struct perf_event *event) -{ - struct perf_event *leader = event->group_leader; - struct intel_uncore_box *fake_box; - int ret = -EINVAL, n; - - fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE); - if (!fake_box) - return -ENOMEM; - - fake_box->pmu = pmu; - /* - * the event is not yet connected with its - * siblings therefore we must first collect - * existing siblings, then add the new event - * before we can simulate the scheduling - */ - n = uncore_collect_events(fake_box, leader, true); - if (n < 0) - goto out; - - fake_box->n_events = n; - n = uncore_collect_events(fake_box, event, false); - if (n < 0) - goto out; - - fake_box->n_events = n; - - ret = uncore_assign_events(fake_box, NULL, n); -out: - kfree(fake_box); - return ret; -} - -static int uncore_pmu_event_init(struct perf_event *event) -{ - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - struct hw_perf_event *hwc = &event->hw; - int ret; - - if (event->attr.type != event->pmu->type) - return -ENOENT; - - pmu = uncore_event_to_pmu(event); - /* no device found for this pmu */ - if (pmu->func_id < 0) - return -ENOENT; - - /* - * Uncore PMU does measure at all privilege level all the time. - * So it doesn't make sense to specify any exclude bits. - */ - if (event->attr.exclude_user || event->attr.exclude_kernel || - event->attr.exclude_hv || event->attr.exclude_idle) - return -EINVAL; - - /* Sampling not supported yet */ - if (hwc->sample_period) - return -EINVAL; - - /* - * Place all uncore events for a particular physical package - * onto a single cpu - */ - if (event->cpu < 0) - return -EINVAL; - box = uncore_pmu_to_box(pmu, event->cpu); - if (!box || box->cpu < 0) - return -EINVAL; - event->cpu = box->cpu; - - event->hw.idx = -1; - event->hw.last_tag = ~0ULL; - event->hw.extra_reg.idx = EXTRA_REG_NONE; - event->hw.branch_reg.idx = EXTRA_REG_NONE; - - if (event->attr.config == UNCORE_FIXED_EVENT) { - /* no fixed counter */ - if (!pmu->type->fixed_ctl) - return -EINVAL; - /* - * if there is only one fixed counter, only the first pmu - * can access the fixed counter - */ - if (pmu->type->single_fixed && pmu->pmu_idx > 0) - return -EINVAL; - - /* fixed counters have event field hardcoded to zero */ - hwc->config = 0ULL; - } else { - hwc->config = event->attr.config & pmu->type->event_mask; - if (pmu->type->ops->hw_config) { - ret = pmu->type->ops->hw_config(box, event); - if (ret) - return ret; - } - } - - if (event->group_leader != event) - ret = uncore_validate_group(pmu, event); - else - ret = 0; - - return ret; -} - -static ssize_t uncore_get_attr_cpumask(struct device *dev, - struct device_attribute *attr, char *buf) -{ - return cpumap_print_to_pagebuf(true, buf, &uncore_cpu_mask); -} - -static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL); - -static struct attribute *uncore_pmu_attrs[] = { - &dev_attr_cpumask.attr, - NULL, -}; - -static struct attribute_group uncore_pmu_attr_group = { - .attrs = uncore_pmu_attrs, -}; - -static int uncore_pmu_register(struct intel_uncore_pmu *pmu) -{ - int ret; - - if (!pmu->type->pmu) { - pmu->pmu = (struct pmu) { - .attr_groups = pmu->type->attr_groups, - .task_ctx_nr = perf_invalid_context, - .event_init = uncore_pmu_event_init, - .add = uncore_pmu_event_add, - .del = uncore_pmu_event_del, - .start = uncore_pmu_event_start, - .stop = uncore_pmu_event_stop, - .read = uncore_pmu_event_read, - }; - } else { - pmu->pmu = *pmu->type->pmu; - pmu->pmu.attr_groups = pmu->type->attr_groups; - } - - if (pmu->type->num_boxes == 1) { - if (strlen(pmu->type->name) > 0) - sprintf(pmu->name, "uncore_%s", pmu->type->name); - else - sprintf(pmu->name, "uncore"); - } else { - sprintf(pmu->name, "uncore_%s_%d", pmu->type->name, - pmu->pmu_idx); - } - - ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); - return ret; -} - -static void __init uncore_type_exit(struct intel_uncore_type *type) -{ - int i; - - for (i = 0; i < type->num_boxes; i++) - free_percpu(type->pmus[i].box); - kfree(type->pmus); - type->pmus = NULL; - kfree(type->events_group); - type->events_group = NULL; -} - -static void __init uncore_types_exit(struct intel_uncore_type **types) -{ - int i; - for (i = 0; types[i]; i++) - uncore_type_exit(types[i]); -} - -static int __init uncore_type_init(struct intel_uncore_type *type) -{ - struct intel_uncore_pmu *pmus; - struct attribute_group *attr_group; - struct attribute **attrs; - int i, j; - - pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL); - if (!pmus) - return -ENOMEM; - - type->pmus = pmus; - - type->unconstrainted = (struct event_constraint) - __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1, - 0, type->num_counters, 0, 0); - - for (i = 0; i < type->num_boxes; i++) { - pmus[i].func_id = -1; - pmus[i].pmu_idx = i; - pmus[i].type = type; - INIT_LIST_HEAD(&pmus[i].box_list); - pmus[i].box = alloc_percpu(struct intel_uncore_box *); - if (!pmus[i].box) - goto fail; - } - - if (type->event_descs) { - i = 0; - while (type->event_descs[i].attr.attr.name) - i++; - - attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) + - sizeof(*attr_group), GFP_KERNEL); - if (!attr_group) - goto fail; - - attrs = (struct attribute **)(attr_group + 1); - attr_group->name = "events"; - attr_group->attrs = attrs; - - for (j = 0; j < i; j++) - attrs[j] = &type->event_descs[j].attr.attr; - - type->events_group = attr_group; - } - - type->pmu_group = &uncore_pmu_attr_group; - return 0; -fail: - uncore_type_exit(type); - return -ENOMEM; -} - -static int __init uncore_types_init(struct intel_uncore_type **types) -{ - int i, ret; - - for (i = 0; types[i]; i++) { - ret = uncore_type_init(types[i]); - if (ret) - goto fail; - } - return 0; -fail: - while (--i >= 0) - uncore_type_exit(types[i]); - return ret; -} - -/* - * add a pci uncore device - */ -static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) -{ - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - struct intel_uncore_type *type; - int phys_id; - bool first_box = false; - - phys_id = uncore_pcibus_to_physid(pdev->bus); - if (phys_id < 0) - return -ENODEV; - - if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { - int idx = UNCORE_PCI_DEV_IDX(id->driver_data); - uncore_extra_pci_dev[phys_id][idx] = pdev; - pci_set_drvdata(pdev, NULL); - return 0; - } - - type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; - box = uncore_alloc_box(type, NUMA_NO_NODE); - if (!box) - return -ENOMEM; - - /* - * for performance monitoring unit with multiple boxes, - * each box has a different function id. - */ - pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; - /* Knights Landing uses a common PCI device ID for multiple instances of - * an uncore PMU device type. There is only one entry per device type in - * the knl_uncore_pci_ids table inspite of multiple devices present for - * some device types. Hence PCI device idx would be 0 for all devices. - * So increment pmu pointer to point to an unused array element. - */ - if (boot_cpu_data.x86_model == 87) - while (pmu->func_id >= 0) - pmu++; - if (pmu->func_id < 0) - pmu->func_id = pdev->devfn; - else - WARN_ON_ONCE(pmu->func_id != pdev->devfn); - - box->phys_id = phys_id; - box->pci_dev = pdev; - box->pmu = pmu; - uncore_box_init(box); - pci_set_drvdata(pdev, box); - - raw_spin_lock(&uncore_box_lock); - if (list_empty(&pmu->box_list)) - first_box = true; - list_add_tail(&box->list, &pmu->box_list); - raw_spin_unlock(&uncore_box_lock); - - if (first_box) - uncore_pmu_register(pmu); - return 0; -} - -static void uncore_pci_remove(struct pci_dev *pdev) -{ - struct intel_uncore_box *box = pci_get_drvdata(pdev); - struct intel_uncore_pmu *pmu; - int i, cpu, phys_id; - bool last_box = false; - - phys_id = uncore_pcibus_to_physid(pdev->bus); - box = pci_get_drvdata(pdev); - if (!box) { - for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { - if (uncore_extra_pci_dev[phys_id][i] == pdev) { - uncore_extra_pci_dev[phys_id][i] = NULL; - break; - } - } - WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX); - return; - } - - pmu = box->pmu; - if (WARN_ON_ONCE(phys_id != box->phys_id)) - return; - - pci_set_drvdata(pdev, NULL); - - raw_spin_lock(&uncore_box_lock); - list_del(&box->list); - if (list_empty(&pmu->box_list)) - last_box = true; - raw_spin_unlock(&uncore_box_lock); - - for_each_possible_cpu(cpu) { - if (*per_cpu_ptr(pmu->box, cpu) == box) { - *per_cpu_ptr(pmu->box, cpu) = NULL; - atomic_dec(&box->refcnt); - } - } - - WARN_ON_ONCE(atomic_read(&box->refcnt) != 1); - kfree(box); - - if (last_box) - perf_pmu_unregister(&pmu->pmu); -} - -static int __init uncore_pci_init(void) -{ - int ret; - - switch (boot_cpu_data.x86_model) { - case 45: /* Sandy Bridge-EP */ - ret = snbep_uncore_pci_init(); - break; - case 62: /* Ivy Bridge-EP */ - ret = ivbep_uncore_pci_init(); - break; - case 63: /* Haswell-EP */ - ret = hswep_uncore_pci_init(); - break; - case 79: /* BDX-EP */ - case 86: /* BDX-DE */ - ret = bdx_uncore_pci_init(); - break; - case 42: /* Sandy Bridge */ - ret = snb_uncore_pci_init(); - break; - case 58: /* Ivy Bridge */ - ret = ivb_uncore_pci_init(); - break; - case 60: /* Haswell */ - case 69: /* Haswell Celeron */ - ret = hsw_uncore_pci_init(); - break; - case 61: /* Broadwell */ - ret = bdw_uncore_pci_init(); - break; - case 87: /* Knights Landing */ - ret = knl_uncore_pci_init(); - break; - case 94: /* SkyLake */ - ret = skl_uncore_pci_init(); - break; - default: - return 0; - } - - if (ret) - return ret; - - ret = uncore_types_init(uncore_pci_uncores); - if (ret) - return ret; - - uncore_pci_driver->probe = uncore_pci_probe; - uncore_pci_driver->remove = uncore_pci_remove; - - ret = pci_register_driver(uncore_pci_driver); - if (ret == 0) - pcidrv_registered = true; - else - uncore_types_exit(uncore_pci_uncores); - - return ret; -} - -static void __init uncore_pci_exit(void) -{ - if (pcidrv_registered) { - pcidrv_registered = false; - pci_unregister_driver(uncore_pci_driver); - uncore_types_exit(uncore_pci_uncores); - } -} - -/* CPU hot plug/unplug are serialized by cpu_add_remove_lock mutex */ -static LIST_HEAD(boxes_to_free); - -static void uncore_kfree_boxes(void) -{ - struct intel_uncore_box *box; - - while (!list_empty(&boxes_to_free)) { - box = list_entry(boxes_to_free.next, - struct intel_uncore_box, list); - list_del(&box->list); - kfree(box); - } -} - -static void uncore_cpu_dying(int cpu) -{ - struct intel_uncore_type *type; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, j; - - for (i = 0; uncore_msr_uncores[i]; i++) { - type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - box = *per_cpu_ptr(pmu->box, cpu); - *per_cpu_ptr(pmu->box, cpu) = NULL; - if (box && atomic_dec_and_test(&box->refcnt)) - list_add(&box->list, &boxes_to_free); - } - } -} - -static int uncore_cpu_starting(int cpu) -{ - struct intel_uncore_type *type; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box, *exist; - int i, j, k, phys_id; - - phys_id = topology_physical_package_id(cpu); - - for (i = 0; uncore_msr_uncores[i]; i++) { - type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - box = *per_cpu_ptr(pmu->box, cpu); - /* called by uncore_cpu_init? */ - if (box && box->phys_id >= 0) { - uncore_box_init(box); - continue; - } - - for_each_online_cpu(k) { - exist = *per_cpu_ptr(pmu->box, k); - if (exist && exist->phys_id == phys_id) { - atomic_inc(&exist->refcnt); - *per_cpu_ptr(pmu->box, cpu) = exist; - if (box) { - list_add(&box->list, - &boxes_to_free); - box = NULL; - } - break; - } - } - - if (box) { - box->phys_id = phys_id; - uncore_box_init(box); - } - } - } - return 0; -} - -static int uncore_cpu_prepare(int cpu, int phys_id) -{ - struct intel_uncore_type *type; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, j; - - for (i = 0; uncore_msr_uncores[i]; i++) { - type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - if (pmu->func_id < 0) - pmu->func_id = j; - - box = uncore_alloc_box(type, cpu_to_node(cpu)); - if (!box) - return -ENOMEM; - - box->pmu = pmu; - box->phys_id = phys_id; - *per_cpu_ptr(pmu->box, cpu) = box; - } - } - return 0; -} - -static void -uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu) -{ - struct intel_uncore_type *type; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, j; - - for (i = 0; uncores[i]; i++) { - type = uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - if (old_cpu < 0) - box = uncore_pmu_to_box(pmu, new_cpu); - else - box = uncore_pmu_to_box(pmu, old_cpu); - if (!box) - continue; - - if (old_cpu < 0) { - WARN_ON_ONCE(box->cpu != -1); - box->cpu = new_cpu; - continue; - } - - WARN_ON_ONCE(box->cpu != old_cpu); - if (new_cpu >= 0) { - uncore_pmu_cancel_hrtimer(box); - perf_pmu_migrate_context(&pmu->pmu, - old_cpu, new_cpu); - box->cpu = new_cpu; - } else { - box->cpu = -1; - } - } - } -} - -static void uncore_event_exit_cpu(int cpu) -{ - int i, phys_id, target; - - /* if exiting cpu is used for collecting uncore events */ - if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) - return; - - /* find a new cpu to collect uncore events */ - phys_id = topology_physical_package_id(cpu); - target = -1; - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (phys_id == topology_physical_package_id(i)) { - target = i; - break; - } - } - - /* migrate uncore events to the new cpu */ - if (target >= 0) - cpumask_set_cpu(target, &uncore_cpu_mask); - - uncore_change_context(uncore_msr_uncores, cpu, target); - uncore_change_context(uncore_pci_uncores, cpu, target); -} - -static void uncore_event_init_cpu(int cpu) -{ - int i, phys_id; - - phys_id = topology_physical_package_id(cpu); - for_each_cpu(i, &uncore_cpu_mask) { - if (phys_id == topology_physical_package_id(i)) - return; - } - - cpumask_set_cpu(cpu, &uncore_cpu_mask); - - uncore_change_context(uncore_msr_uncores, -1, cpu); - uncore_change_context(uncore_pci_uncores, -1, cpu); -} - -static int uncore_cpu_notifier(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - /* allocate/free data structure for uncore box */ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - uncore_cpu_prepare(cpu, -1); - break; - case CPU_STARTING: - uncore_cpu_starting(cpu); - break; - case CPU_UP_CANCELED: - case CPU_DYING: - uncore_cpu_dying(cpu); - break; - case CPU_ONLINE: - case CPU_DEAD: - uncore_kfree_boxes(); - break; - default: - break; - } - - /* select the cpu that collects uncore events */ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_FAILED: - case CPU_STARTING: - uncore_event_init_cpu(cpu); - break; - case CPU_DOWN_PREPARE: - uncore_event_exit_cpu(cpu); - break; - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block uncore_cpu_nb = { - .notifier_call = uncore_cpu_notifier, - /* - * to migrate uncore events, our notifier should be executed - * before perf core's notifier. - */ - .priority = CPU_PRI_PERF + 1, -}; - -static void __init uncore_cpu_setup(void *dummy) -{ - uncore_cpu_starting(smp_processor_id()); -} - -static int __init uncore_cpu_init(void) -{ - int ret; - - switch (boot_cpu_data.x86_model) { - case 26: /* Nehalem */ - case 30: - case 37: /* Westmere */ - case 44: - nhm_uncore_cpu_init(); - break; - case 42: /* Sandy Bridge */ - case 58: /* Ivy Bridge */ - case 60: /* Haswell */ - case 69: /* Haswell */ - case 70: /* Haswell */ - case 61: /* Broadwell */ - case 71: /* Broadwell */ - snb_uncore_cpu_init(); - break; - case 45: /* Sandy Bridge-EP */ - snbep_uncore_cpu_init(); - break; - case 46: /* Nehalem-EX */ - case 47: /* Westmere-EX aka. Xeon E7 */ - nhmex_uncore_cpu_init(); - break; - case 62: /* Ivy Bridge-EP */ - ivbep_uncore_cpu_init(); - break; - case 63: /* Haswell-EP */ - hswep_uncore_cpu_init(); - break; - case 79: /* BDX-EP */ - case 86: /* BDX-DE */ - bdx_uncore_cpu_init(); - break; - case 87: /* Knights Landing */ - knl_uncore_cpu_init(); - break; - default: - return 0; - } - - ret = uncore_types_init(uncore_msr_uncores); - if (ret) - return ret; - - return 0; -} - -static int __init uncore_pmus_register(void) -{ - struct intel_uncore_pmu *pmu; - struct intel_uncore_type *type; - int i, j; - - for (i = 0; uncore_msr_uncores[i]; i++) { - type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - uncore_pmu_register(pmu); - } - } - - return 0; -} - -static void __init uncore_cpumask_init(void) -{ - int cpu; - - /* - * ony invoke once from msr or pci init code - */ - if (!cpumask_empty(&uncore_cpu_mask)) - return; - - cpu_notifier_register_begin(); - - for_each_online_cpu(cpu) { - int i, phys_id = topology_physical_package_id(cpu); - - for_each_cpu(i, &uncore_cpu_mask) { - if (phys_id == topology_physical_package_id(i)) { - phys_id = -1; - break; - } - } - if (phys_id < 0) - continue; - - uncore_cpu_prepare(cpu, phys_id); - uncore_event_init_cpu(cpu); - } - on_each_cpu(uncore_cpu_setup, NULL, 1); - - __register_cpu_notifier(&uncore_cpu_nb); - - cpu_notifier_register_done(); -} - - -static int __init intel_uncore_init(void) -{ - int ret; - - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) - return -ENODEV; - - if (cpu_has_hypervisor) - return -ENODEV; - - ret = uncore_pci_init(); - if (ret) - goto fail; - ret = uncore_cpu_init(); - if (ret) { - uncore_pci_exit(); - goto fail; - } - uncore_cpumask_init(); - - uncore_pmus_register(); - return 0; -fail: - return ret; -} -device_initcall(intel_uncore_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h deleted file mode 100644 index a7086b862156..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ /dev/null @@ -1,357 +0,0 @@ -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/pci.h> -#include <linux/perf_event.h> -#include "perf_event.h" - -#define UNCORE_PMU_NAME_LEN 32 -#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC) -#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC) - -#define UNCORE_FIXED_EVENT 0xff -#define UNCORE_PMC_IDX_MAX_GENERIC 8 -#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC -#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1) - -#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx) -#define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff) -#define UNCORE_PCI_DEV_IDX(data) (data & 0xff) -#define UNCORE_EXTRA_PCI_DEV 0xff -#define UNCORE_EXTRA_PCI_DEV_MAX 3 - -/* support up to 8 sockets */ -#define UNCORE_SOCKET_MAX 8 - -#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) - -struct intel_uncore_ops; -struct intel_uncore_pmu; -struct intel_uncore_box; -struct uncore_event_desc; - -struct intel_uncore_type { - const char *name; - int num_counters; - int num_boxes; - int perf_ctr_bits; - int fixed_ctr_bits; - unsigned perf_ctr; - unsigned event_ctl; - unsigned event_mask; - unsigned fixed_ctr; - unsigned fixed_ctl; - unsigned box_ctl; - unsigned msr_offset; - unsigned num_shared_regs:8; - unsigned single_fixed:1; - unsigned pair_ctr_ctl:1; - unsigned *msr_offsets; - struct event_constraint unconstrainted; - struct event_constraint *constraints; - struct intel_uncore_pmu *pmus; - struct intel_uncore_ops *ops; - struct uncore_event_desc *event_descs; - const struct attribute_group *attr_groups[4]; - struct pmu *pmu; /* for custom pmu ops */ -}; - -#define pmu_group attr_groups[0] -#define format_group attr_groups[1] -#define events_group attr_groups[2] - -struct intel_uncore_ops { - void (*init_box)(struct intel_uncore_box *); - void (*disable_box)(struct intel_uncore_box *); - void (*enable_box)(struct intel_uncore_box *); - void (*disable_event)(struct intel_uncore_box *, struct perf_event *); - void (*enable_event)(struct intel_uncore_box *, struct perf_event *); - u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *); - int (*hw_config)(struct intel_uncore_box *, struct perf_event *); - struct event_constraint *(*get_constraint)(struct intel_uncore_box *, - struct perf_event *); - void (*put_constraint)(struct intel_uncore_box *, struct perf_event *); -}; - -struct intel_uncore_pmu { - struct pmu pmu; - char name[UNCORE_PMU_NAME_LEN]; - int pmu_idx; - int func_id; - struct intel_uncore_type *type; - struct intel_uncore_box ** __percpu box; - struct list_head box_list; -}; - -struct intel_uncore_extra_reg { - raw_spinlock_t lock; - u64 config, config1, config2; - atomic_t ref; -}; - -struct intel_uncore_box { - int phys_id; - int n_active; /* number of active events */ - int n_events; - int cpu; /* cpu to collect events */ - unsigned long flags; - atomic_t refcnt; - struct perf_event *events[UNCORE_PMC_IDX_MAX]; - struct perf_event *event_list[UNCORE_PMC_IDX_MAX]; - struct event_constraint *event_constraint[UNCORE_PMC_IDX_MAX]; - unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; - u64 tags[UNCORE_PMC_IDX_MAX]; - struct pci_dev *pci_dev; - struct intel_uncore_pmu *pmu; - u64 hrtimer_duration; /* hrtimer timeout for this box */ - struct hrtimer hrtimer; - struct list_head list; - struct list_head active_list; - void *io_addr; - struct intel_uncore_extra_reg shared_regs[0]; -}; - -#define UNCORE_BOX_FLAG_INITIATED 0 - -struct uncore_event_desc { - struct kobj_attribute attr; - const char *config; -}; - -struct pci2phy_map { - struct list_head list; - int segment; - int pbus_to_physid[256]; -}; - -int uncore_pcibus_to_physid(struct pci_bus *bus); -struct pci2phy_map *__find_pci2phy_map(int segment); - -ssize_t uncore_event_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf); - -#define INTEL_UNCORE_EVENT_DESC(_name, _config) \ -{ \ - .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \ - .config = _config, \ -} - -#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \ -static ssize_t __uncore_##_var##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, \ - char *page) \ -{ \ - BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ - return sprintf(page, _format "\n"); \ -} \ -static struct kobj_attribute format_attr_##_var = \ - __ATTR(_name, 0444, __uncore_##_var##_show, NULL) - -static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box) -{ - return box->pmu->type->box_ctl; -} - -static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box) -{ - return box->pmu->type->fixed_ctl; -} - -static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box) -{ - return box->pmu->type->fixed_ctr; -} - -static inline -unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx) -{ - return idx * 4 + box->pmu->type->event_ctl; -} - -static inline -unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx) -{ - return idx * 8 + box->pmu->type->perf_ctr; -} - -static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box) -{ - struct intel_uncore_pmu *pmu = box->pmu; - return pmu->type->msr_offsets ? - pmu->type->msr_offsets[pmu->pmu_idx] : - pmu->type->msr_offset * pmu->pmu_idx; -} - -static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box) -{ - if (!box->pmu->type->box_ctl) - return 0; - return box->pmu->type->box_ctl + uncore_msr_box_offset(box); -} - -static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box) -{ - if (!box->pmu->type->fixed_ctl) - return 0; - return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box); -} - -static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box) -{ - return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box); -} - -static inline -unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx) -{ - return box->pmu->type->event_ctl + - (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + - uncore_msr_box_offset(box); -} - -static inline -unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx) -{ - return box->pmu->type->perf_ctr + - (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + - uncore_msr_box_offset(box); -} - -static inline -unsigned uncore_fixed_ctl(struct intel_uncore_box *box) -{ - if (box->pci_dev) - return uncore_pci_fixed_ctl(box); - else - return uncore_msr_fixed_ctl(box); -} - -static inline -unsigned uncore_fixed_ctr(struct intel_uncore_box *box) -{ - if (box->pci_dev) - return uncore_pci_fixed_ctr(box); - else - return uncore_msr_fixed_ctr(box); -} - -static inline -unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx) -{ - if (box->pci_dev) - return uncore_pci_event_ctl(box, idx); - else - return uncore_msr_event_ctl(box, idx); -} - -static inline -unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx) -{ - if (box->pci_dev) - return uncore_pci_perf_ctr(box, idx); - else - return uncore_msr_perf_ctr(box, idx); -} - -static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box) -{ - return box->pmu->type->perf_ctr_bits; -} - -static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box) -{ - return box->pmu->type->fixed_ctr_bits; -} - -static inline int uncore_num_counters(struct intel_uncore_box *box) -{ - return box->pmu->type->num_counters; -} - -static inline void uncore_disable_box(struct intel_uncore_box *box) -{ - if (box->pmu->type->ops->disable_box) - box->pmu->type->ops->disable_box(box); -} - -static inline void uncore_enable_box(struct intel_uncore_box *box) -{ - if (box->pmu->type->ops->enable_box) - box->pmu->type->ops->enable_box(box); -} - -static inline void uncore_disable_event(struct intel_uncore_box *box, - struct perf_event *event) -{ - box->pmu->type->ops->disable_event(box, event); -} - -static inline void uncore_enable_event(struct intel_uncore_box *box, - struct perf_event *event) -{ - box->pmu->type->ops->enable_event(box, event); -} - -static inline u64 uncore_read_counter(struct intel_uncore_box *box, - struct perf_event *event) -{ - return box->pmu->type->ops->read_counter(box, event); -} - -static inline void uncore_box_init(struct intel_uncore_box *box) -{ - if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { - if (box->pmu->type->ops->init_box) - box->pmu->type->ops->init_box(box); - } -} - -static inline bool uncore_box_is_fake(struct intel_uncore_box *box) -{ - return (box->phys_id < 0); -} - -struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event); -struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu); -struct intel_uncore_box *uncore_event_to_box(struct perf_event *event); -u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event); -void uncore_pmu_start_hrtimer(struct intel_uncore_box *box); -void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box); -void uncore_pmu_event_read(struct perf_event *event); -void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event); -struct event_constraint * -uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event); -void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event); -u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx); - -extern struct intel_uncore_type **uncore_msr_uncores; -extern struct intel_uncore_type **uncore_pci_uncores; -extern struct pci_driver *uncore_pci_driver; -extern raw_spinlock_t pci2phy_map_lock; -extern struct list_head pci2phy_map_head; -extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; -extern struct event_constraint uncore_constraint_empty; - -/* perf_event_intel_uncore_snb.c */ -int snb_uncore_pci_init(void); -int ivb_uncore_pci_init(void); -int hsw_uncore_pci_init(void); -int bdw_uncore_pci_init(void); -int skl_uncore_pci_init(void); -void snb_uncore_cpu_init(void); -void nhm_uncore_cpu_init(void); -int snb_pci2phy_map_init(int devid); - -/* perf_event_intel_uncore_snbep.c */ -int snbep_uncore_pci_init(void); -void snbep_uncore_cpu_init(void); -int ivbep_uncore_pci_init(void); -void ivbep_uncore_cpu_init(void); -int hswep_uncore_pci_init(void); -void hswep_uncore_cpu_init(void); -int bdx_uncore_pci_init(void); -void bdx_uncore_cpu_init(void); -int knl_uncore_pci_init(void); -void knl_uncore_cpu_init(void); - -/* perf_event_intel_uncore_nhmex.c */ -void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c deleted file mode 100644 index 2749965afed0..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c +++ /dev/null @@ -1,1221 +0,0 @@ -/* Nehalem-EX/Westmere-EX uncore support */ -#include "perf_event_intel_uncore.h" - -/* NHM-EX event control */ -#define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff -#define NHMEX_PMON_CTL_UMASK_MASK 0x0000ff00 -#define NHMEX_PMON_CTL_EN_BIT0 (1 << 0) -#define NHMEX_PMON_CTL_EDGE_DET (1 << 18) -#define NHMEX_PMON_CTL_PMI_EN (1 << 20) -#define NHMEX_PMON_CTL_EN_BIT22 (1 << 22) -#define NHMEX_PMON_CTL_INVERT (1 << 23) -#define NHMEX_PMON_CTL_TRESH_MASK 0xff000000 -#define NHMEX_PMON_RAW_EVENT_MASK (NHMEX_PMON_CTL_EV_SEL_MASK | \ - NHMEX_PMON_CTL_UMASK_MASK | \ - NHMEX_PMON_CTL_EDGE_DET | \ - NHMEX_PMON_CTL_INVERT | \ - NHMEX_PMON_CTL_TRESH_MASK) - -/* NHM-EX Ubox */ -#define NHMEX_U_MSR_PMON_GLOBAL_CTL 0xc00 -#define NHMEX_U_MSR_PMON_CTR 0xc11 -#define NHMEX_U_MSR_PMON_EV_SEL 0xc10 - -#define NHMEX_U_PMON_GLOBAL_EN (1 << 0) -#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL 0x0000001e -#define NHMEX_U_PMON_GLOBAL_EN_ALL (1 << 28) -#define NHMEX_U_PMON_GLOBAL_RST_ALL (1 << 29) -#define NHMEX_U_PMON_GLOBAL_FRZ_ALL (1 << 31) - -#define NHMEX_U_PMON_RAW_EVENT_MASK \ - (NHMEX_PMON_CTL_EV_SEL_MASK | \ - NHMEX_PMON_CTL_EDGE_DET) - -/* NHM-EX Cbox */ -#define NHMEX_C0_MSR_PMON_GLOBAL_CTL 0xd00 -#define NHMEX_C0_MSR_PMON_CTR0 0xd11 -#define NHMEX_C0_MSR_PMON_EV_SEL0 0xd10 -#define NHMEX_C_MSR_OFFSET 0x20 - -/* NHM-EX Bbox */ -#define NHMEX_B0_MSR_PMON_GLOBAL_CTL 0xc20 -#define NHMEX_B0_MSR_PMON_CTR0 0xc31 -#define NHMEX_B0_MSR_PMON_CTL0 0xc30 -#define NHMEX_B_MSR_OFFSET 0x40 -#define NHMEX_B0_MSR_MATCH 0xe45 -#define NHMEX_B0_MSR_MASK 0xe46 -#define NHMEX_B1_MSR_MATCH 0xe4d -#define NHMEX_B1_MSR_MASK 0xe4e - -#define NHMEX_B_PMON_CTL_EN (1 << 0) -#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT 1 -#define NHMEX_B_PMON_CTL_EV_SEL_MASK \ - (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT) -#define NHMEX_B_PMON_CTR_SHIFT 6 -#define NHMEX_B_PMON_CTR_MASK \ - (0x3 << NHMEX_B_PMON_CTR_SHIFT) -#define NHMEX_B_PMON_RAW_EVENT_MASK \ - (NHMEX_B_PMON_CTL_EV_SEL_MASK | \ - NHMEX_B_PMON_CTR_MASK) - -/* NHM-EX Sbox */ -#define NHMEX_S0_MSR_PMON_GLOBAL_CTL 0xc40 -#define NHMEX_S0_MSR_PMON_CTR0 0xc51 -#define NHMEX_S0_MSR_PMON_CTL0 0xc50 -#define NHMEX_S_MSR_OFFSET 0x80 -#define NHMEX_S0_MSR_MM_CFG 0xe48 -#define NHMEX_S0_MSR_MATCH 0xe49 -#define NHMEX_S0_MSR_MASK 0xe4a -#define NHMEX_S1_MSR_MM_CFG 0xe58 -#define NHMEX_S1_MSR_MATCH 0xe59 -#define NHMEX_S1_MSR_MASK 0xe5a - -#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63) -#define NHMEX_S_EVENT_TO_R_PROG_EV 0 - -/* NHM-EX Mbox */ -#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0 -#define NHMEX_M0_MSR_PMU_DSP 0xca5 -#define NHMEX_M0_MSR_PMU_ISS 0xca6 -#define NHMEX_M0_MSR_PMU_MAP 0xca7 -#define NHMEX_M0_MSR_PMU_MSC_THR 0xca8 -#define NHMEX_M0_MSR_PMU_PGT 0xca9 -#define NHMEX_M0_MSR_PMU_PLD 0xcaa -#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC 0xcab -#define NHMEX_M0_MSR_PMU_CTL0 0xcb0 -#define NHMEX_M0_MSR_PMU_CNT0 0xcb1 -#define NHMEX_M_MSR_OFFSET 0x40 -#define NHMEX_M0_MSR_PMU_MM_CFG 0xe54 -#define NHMEX_M1_MSR_PMU_MM_CFG 0xe5c - -#define NHMEX_M_PMON_MM_CFG_EN (1ULL << 63) -#define NHMEX_M_PMON_ADDR_MATCH_MASK 0x3ffffffffULL -#define NHMEX_M_PMON_ADDR_MASK_MASK 0x7ffffffULL -#define NHMEX_M_PMON_ADDR_MASK_SHIFT 34 - -#define NHMEX_M_PMON_CTL_EN (1 << 0) -#define NHMEX_M_PMON_CTL_PMI_EN (1 << 1) -#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT 2 -#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK \ - (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT) -#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT 4 -#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK \ - (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT) -#define NHMEX_M_PMON_CTL_WRAP_MODE (1 << 6) -#define NHMEX_M_PMON_CTL_FLAG_MODE (1 << 7) -#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT 9 -#define NHMEX_M_PMON_CTL_INC_SEL_MASK \ - (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) -#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT 19 -#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK \ - (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) -#define NHMEX_M_PMON_RAW_EVENT_MASK \ - (NHMEX_M_PMON_CTL_COUNT_MODE_MASK | \ - NHMEX_M_PMON_CTL_STORAGE_MODE_MASK | \ - NHMEX_M_PMON_CTL_WRAP_MODE | \ - NHMEX_M_PMON_CTL_FLAG_MODE | \ - NHMEX_M_PMON_CTL_INC_SEL_MASK | \ - NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK) - -#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23)) -#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n))) - -#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24)) -#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n))) - -/* - * use the 9~13 bits to select event If the 7th bit is not set, - * otherwise use the 19~21 bits to select event. - */ -#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) -#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \ - NHMEX_M_PMON_CTL_FLAG_MODE) -#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \ - NHMEX_M_PMON_CTL_FLAG_MODE) -#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \ - NHMEX_M_PMON_CTL_FLAG_MODE) -#define MBOX_INC_SEL_EXTAR_REG(c, r) \ - EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \ - MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r) -#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \ - EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \ - MBOX_SET_FLAG_SEL_MASK, \ - (u64)-1, NHMEX_M_##r) - -/* NHM-EX Rbox */ -#define NHMEX_R_MSR_GLOBAL_CTL 0xe00 -#define NHMEX_R_MSR_PMON_CTL0 0xe10 -#define NHMEX_R_MSR_PMON_CNT0 0xe11 -#define NHMEX_R_MSR_OFFSET 0x20 - -#define NHMEX_R_MSR_PORTN_QLX_CFG(n) \ - ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4)) -#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n) (0xe04 + (n)) -#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n) (0xe24 + (n)) -#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n) \ - (((n) < 4 ? 0 : 0x10) + (n) * 4) -#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) \ - (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) -#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n) \ - (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1) -#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n) \ - (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2) -#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) \ - (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) -#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n) \ - (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1) -#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n) \ - (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2) - -#define NHMEX_R_PMON_CTL_EN (1 << 0) -#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT 1 -#define NHMEX_R_PMON_CTL_EV_SEL_MASK \ - (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT) -#define NHMEX_R_PMON_CTL_PMI_EN (1 << 6) -#define NHMEX_R_PMON_RAW_EVENT_MASK NHMEX_R_PMON_CTL_EV_SEL_MASK - -/* NHM-EX Wbox */ -#define NHMEX_W_MSR_GLOBAL_CTL 0xc80 -#define NHMEX_W_MSR_PMON_CNT0 0xc90 -#define NHMEX_W_MSR_PMON_EVT_SEL0 0xc91 -#define NHMEX_W_MSR_PMON_FIXED_CTR 0x394 -#define NHMEX_W_MSR_PMON_FIXED_CTL 0x395 - -#define NHMEX_W_PMON_GLOBAL_FIXED_EN (1ULL << 31) - -#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \ - ((1ULL << (n)) - 1))) - -DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); -DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5"); -DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); -DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); -DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); -DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); -DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7"); -DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63"); -DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63"); - -static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box) -{ - wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL); -} - -static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box) -{ - unsigned msr = uncore_msr_box_ctl(box); - u64 config; - - if (msr) { - rdmsrl(msr, config); - config &= ~((1ULL << uncore_num_counters(box)) - 1); - /* WBox has a fixed counter */ - if (uncore_msr_fixed_ctl(box)) - config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN; - wrmsrl(msr, config); - } -} - -static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box) -{ - unsigned msr = uncore_msr_box_ctl(box); - u64 config; - - if (msr) { - rdmsrl(msr, config); - config |= (1ULL << uncore_num_counters(box)) - 1; - /* WBox has a fixed counter */ - if (uncore_msr_fixed_ctl(box)) - config |= NHMEX_W_PMON_GLOBAL_FIXED_EN; - wrmsrl(msr, config); - } -} - -static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - wrmsrl(event->hw.config_base, 0); -} - -static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (hwc->idx >= UNCORE_PMC_IDX_FIXED) - wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0); - else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0) - wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); - else - wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); -} - -#define NHMEX_UNCORE_OPS_COMMON_INIT() \ - .init_box = nhmex_uncore_msr_init_box, \ - .disable_box = nhmex_uncore_msr_disable_box, \ - .enable_box = nhmex_uncore_msr_enable_box, \ - .disable_event = nhmex_uncore_msr_disable_event, \ - .read_counter = uncore_msr_read_counter - -static struct intel_uncore_ops nhmex_uncore_ops = { - NHMEX_UNCORE_OPS_COMMON_INIT(), - .enable_event = nhmex_uncore_msr_enable_event, -}; - -static struct attribute *nhmex_uncore_ubox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_edge.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_ubox_format_group = { - .name = "format", - .attrs = nhmex_uncore_ubox_formats_attr, -}; - -static struct intel_uncore_type nhmex_uncore_ubox = { - .name = "ubox", - .num_counters = 1, - .num_boxes = 1, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_U_MSR_PMON_EV_SEL, - .perf_ctr = NHMEX_U_MSR_PMON_CTR, - .event_mask = NHMEX_U_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_U_MSR_PMON_GLOBAL_CTL, - .ops = &nhmex_uncore_ops, - .format_group = &nhmex_uncore_ubox_format_group -}; - -static struct attribute *nhmex_uncore_cbox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_cbox_format_group = { - .name = "format", - .attrs = nhmex_uncore_cbox_formats_attr, -}; - -/* msr offset for each instance of cbox */ -static unsigned nhmex_cbox_msr_offsets[] = { - 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0, -}; - -static struct intel_uncore_type nhmex_uncore_cbox = { - .name = "cbox", - .num_counters = 6, - .num_boxes = 10, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0, - .perf_ctr = NHMEX_C0_MSR_PMON_CTR0, - .event_mask = NHMEX_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL, - .msr_offsets = nhmex_cbox_msr_offsets, - .pair_ctr_ctl = 1, - .ops = &nhmex_uncore_ops, - .format_group = &nhmex_uncore_cbox_format_group -}; - -static struct uncore_event_desc nhmex_uncore_wbox_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"), - { /* end: all zeroes */ }, -}; - -static struct intel_uncore_type nhmex_uncore_wbox = { - .name = "wbox", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_W_MSR_PMON_CNT0, - .perf_ctr = NHMEX_W_MSR_PMON_EVT_SEL0, - .fixed_ctr = NHMEX_W_MSR_PMON_FIXED_CTR, - .fixed_ctl = NHMEX_W_MSR_PMON_FIXED_CTL, - .event_mask = NHMEX_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_W_MSR_GLOBAL_CTL, - .pair_ctr_ctl = 1, - .event_descs = nhmex_uncore_wbox_events, - .ops = &nhmex_uncore_ops, - .format_group = &nhmex_uncore_cbox_format_group -}; - -static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - int ctr, ev_sel; - - ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >> - NHMEX_B_PMON_CTR_SHIFT; - ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >> - NHMEX_B_PMON_CTL_EV_SEL_SHIFT; - - /* events that do not use the match/mask registers */ - if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) || - (ctr == 2 && ev_sel != 0x4) || ctr == 3) - return 0; - - if (box->pmu->pmu_idx == 0) - reg1->reg = NHMEX_B0_MSR_MATCH; - else - reg1->reg = NHMEX_B1_MSR_MATCH; - reg1->idx = 0; - reg1->config = event->attr.config1; - reg2->config = event->attr.config2; - return 0; -} - -static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - - if (reg1->idx != EXTRA_REG_NONE) { - wrmsrl(reg1->reg, reg1->config); - wrmsrl(reg1->reg + 1, reg2->config); - } - wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | - (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK)); -} - -/* - * The Bbox has 4 counters, but each counter monitors different events. - * Use bits 6-7 in the event config to select counter. - */ -static struct event_constraint nhmex_uncore_bbox_constraints[] = { - EVENT_CONSTRAINT(0 , 1, 0xc0), - EVENT_CONSTRAINT(0x40, 2, 0xc0), - EVENT_CONSTRAINT(0x80, 4, 0xc0), - EVENT_CONSTRAINT(0xc0, 8, 0xc0), - EVENT_CONSTRAINT_END, -}; - -static struct attribute *nhmex_uncore_bbox_formats_attr[] = { - &format_attr_event5.attr, - &format_attr_counter.attr, - &format_attr_match.attr, - &format_attr_mask.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_bbox_format_group = { - .name = "format", - .attrs = nhmex_uncore_bbox_formats_attr, -}; - -static struct intel_uncore_ops nhmex_uncore_bbox_ops = { - NHMEX_UNCORE_OPS_COMMON_INIT(), - .enable_event = nhmex_bbox_msr_enable_event, - .hw_config = nhmex_bbox_hw_config, - .get_constraint = uncore_get_constraint, - .put_constraint = uncore_put_constraint, -}; - -static struct intel_uncore_type nhmex_uncore_bbox = { - .name = "bbox", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_B0_MSR_PMON_CTL0, - .perf_ctr = NHMEX_B0_MSR_PMON_CTR0, - .event_mask = NHMEX_B_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_B0_MSR_PMON_GLOBAL_CTL, - .msr_offset = NHMEX_B_MSR_OFFSET, - .pair_ctr_ctl = 1, - .num_shared_regs = 1, - .constraints = nhmex_uncore_bbox_constraints, - .ops = &nhmex_uncore_bbox_ops, - .format_group = &nhmex_uncore_bbox_format_group -}; - -static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - - /* only TO_R_PROG_EV event uses the match/mask register */ - if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) != - NHMEX_S_EVENT_TO_R_PROG_EV) - return 0; - - if (box->pmu->pmu_idx == 0) - reg1->reg = NHMEX_S0_MSR_MM_CFG; - else - reg1->reg = NHMEX_S1_MSR_MM_CFG; - reg1->idx = 0; - reg1->config = event->attr.config1; - reg2->config = event->attr.config2; - return 0; -} - -static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - - if (reg1->idx != EXTRA_REG_NONE) { - wrmsrl(reg1->reg, 0); - wrmsrl(reg1->reg + 1, reg1->config); - wrmsrl(reg1->reg + 2, reg2->config); - wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN); - } - wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); -} - -static struct attribute *nhmex_uncore_sbox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - &format_attr_match.attr, - &format_attr_mask.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_sbox_format_group = { - .name = "format", - .attrs = nhmex_uncore_sbox_formats_attr, -}; - -static struct intel_uncore_ops nhmex_uncore_sbox_ops = { - NHMEX_UNCORE_OPS_COMMON_INIT(), - .enable_event = nhmex_sbox_msr_enable_event, - .hw_config = nhmex_sbox_hw_config, - .get_constraint = uncore_get_constraint, - .put_constraint = uncore_put_constraint, -}; - -static struct intel_uncore_type nhmex_uncore_sbox = { - .name = "sbox", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_S0_MSR_PMON_CTL0, - .perf_ctr = NHMEX_S0_MSR_PMON_CTR0, - .event_mask = NHMEX_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_S0_MSR_PMON_GLOBAL_CTL, - .msr_offset = NHMEX_S_MSR_OFFSET, - .pair_ctr_ctl = 1, - .num_shared_regs = 1, - .ops = &nhmex_uncore_sbox_ops, - .format_group = &nhmex_uncore_sbox_format_group -}; - -enum { - EXTRA_REG_NHMEX_M_FILTER, - EXTRA_REG_NHMEX_M_DSP, - EXTRA_REG_NHMEX_M_ISS, - EXTRA_REG_NHMEX_M_MAP, - EXTRA_REG_NHMEX_M_MSC_THR, - EXTRA_REG_NHMEX_M_PGT, - EXTRA_REG_NHMEX_M_PLD, - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC, -}; - -static struct extra_reg nhmex_uncore_mbox_extra_regs[] = { - MBOX_INC_SEL_EXTAR_REG(0x0, DSP), - MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR), - MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR), - MBOX_INC_SEL_EXTAR_REG(0x9, ISS), - /* event 0xa uses two extra registers */ - MBOX_INC_SEL_EXTAR_REG(0xa, ISS), - MBOX_INC_SEL_EXTAR_REG(0xa, PLD), - MBOX_INC_SEL_EXTAR_REG(0xb, PLD), - /* events 0xd ~ 0x10 use the same extra register */ - MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC), - MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC), - MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC), - MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC), - MBOX_INC_SEL_EXTAR_REG(0x16, PGT), - MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP), - MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS), - MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT), - MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP), - EVENT_EXTRA_END -}; - -/* Nehalem-EX or Westmere-EX ? */ -static bool uncore_nhmex; - -static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config) -{ - struct intel_uncore_extra_reg *er; - unsigned long flags; - bool ret = false; - u64 mask; - - if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { - er = &box->shared_regs[idx]; - raw_spin_lock_irqsave(&er->lock, flags); - if (!atomic_read(&er->ref) || er->config == config) { - atomic_inc(&er->ref); - er->config = config; - ret = true; - } - raw_spin_unlock_irqrestore(&er->lock, flags); - - return ret; - } - /* - * The ZDP_CTL_FVC MSR has 4 fields which are used to control - * events 0xd ~ 0x10. Besides these 4 fields, there are additional - * fields which are shared. - */ - idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - if (WARN_ON_ONCE(idx >= 4)) - return false; - - /* mask of the shared fields */ - if (uncore_nhmex) - mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK; - else - mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK; - er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; - - raw_spin_lock_irqsave(&er->lock, flags); - /* add mask of the non-shared field if it's in use */ - if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) { - if (uncore_nhmex) - mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - else - mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - } - - if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) { - atomic_add(1 << (idx * 8), &er->ref); - if (uncore_nhmex) - mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK | - NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - else - mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK | - WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - er->config &= ~mask; - er->config |= (config & mask); - ret = true; - } - raw_spin_unlock_irqrestore(&er->lock, flags); - - return ret; -} - -static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx) -{ - struct intel_uncore_extra_reg *er; - - if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { - er = &box->shared_regs[idx]; - atomic_dec(&er->ref); - return; - } - - idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; - atomic_sub(1 << (idx * 8), &er->ref); -} - -static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8); - u64 config = reg1->config; - - /* get the non-shared control bits and shift them */ - idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - if (uncore_nhmex) - config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - else - config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - if (new_idx > orig_idx) { - idx = new_idx - orig_idx; - config <<= 3 * idx; - } else { - idx = orig_idx - new_idx; - config >>= 3 * idx; - } - - /* add the shared control bits back */ - if (uncore_nhmex) - config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; - else - config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; - config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; - if (modify) { - /* adjust the main event selector */ - if (new_idx > orig_idx) - hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT; - else - hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT; - reg1->config = config; - reg1->idx = ~0xff | new_idx; - } - return config; -} - -static struct event_constraint * -nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - int i, idx[2], alloc = 0; - u64 config1 = reg1->config; - - idx[0] = __BITS_VALUE(reg1->idx, 0, 8); - idx[1] = __BITS_VALUE(reg1->idx, 1, 8); -again: - for (i = 0; i < 2; i++) { - if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i))) - idx[i] = 0xff; - - if (idx[i] == 0xff) - continue; - - if (!nhmex_mbox_get_shared_reg(box, idx[i], - __BITS_VALUE(config1, i, 32))) - goto fail; - alloc |= (0x1 << i); - } - - /* for the match/mask registers */ - if (reg2->idx != EXTRA_REG_NONE && - (uncore_box_is_fake(box) || !reg2->alloc) && - !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config)) - goto fail; - - /* - * If it's a fake box -- as per validate_{group,event}() we - * shouldn't touch event state and we can avoid doing so - * since both will only call get_event_constraints() once - * on each event, this avoids the need for reg->alloc. - */ - if (!uncore_box_is_fake(box)) { - if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) - nhmex_mbox_alter_er(event, idx[0], true); - reg1->alloc |= alloc; - if (reg2->idx != EXTRA_REG_NONE) - reg2->alloc = 1; - } - return NULL; -fail: - if (idx[0] != 0xff && !(alloc & 0x1) && - idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { - /* - * events 0xd ~ 0x10 are functional identical, but are - * controlled by different fields in the ZDP_CTL_FVC - * register. If we failed to take one field, try the - * rest 3 choices. - */ - BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff); - idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - idx[0] = (idx[0] + 1) % 4; - idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) { - config1 = nhmex_mbox_alter_er(event, idx[0], false); - goto again; - } - } - - if (alloc & 0x1) - nhmex_mbox_put_shared_reg(box, idx[0]); - if (alloc & 0x2) - nhmex_mbox_put_shared_reg(box, idx[1]); - return &uncore_constraint_empty; -} - -static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - - if (uncore_box_is_fake(box)) - return; - - if (reg1->alloc & 0x1) - nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8)); - if (reg1->alloc & 0x2) - nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8)); - reg1->alloc = 0; - - if (reg2->alloc) { - nhmex_mbox_put_shared_reg(box, reg2->idx); - reg2->alloc = 0; - } -} - -static int nhmex_mbox_extra_reg_idx(struct extra_reg *er) -{ - if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) - return er->idx; - return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd; -} - -static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct intel_uncore_type *type = box->pmu->type; - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - struct extra_reg *er; - unsigned msr; - int reg_idx = 0; - /* - * The mbox events may require 2 extra MSRs at the most. But only - * the lower 32 bits in these MSRs are significant, so we can use - * config1 to pass two MSRs' config. - */ - for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) { - if (er->event != (event->hw.config & er->config_mask)) - continue; - if (event->attr.config1 & ~er->valid_mask) - return -EINVAL; - - msr = er->msr + type->msr_offset * box->pmu->pmu_idx; - if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff)) - return -EINVAL; - - /* always use the 32~63 bits to pass the PLD config */ - if (er->idx == EXTRA_REG_NHMEX_M_PLD) - reg_idx = 1; - else if (WARN_ON_ONCE(reg_idx > 0)) - return -EINVAL; - - reg1->idx &= ~(0xff << (reg_idx * 8)); - reg1->reg &= ~(0xffff << (reg_idx * 16)); - reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8); - reg1->reg |= msr << (reg_idx * 16); - reg1->config = event->attr.config1; - reg_idx++; - } - /* - * The mbox only provides ability to perform address matching - * for the PLD events. - */ - if (reg_idx == 2) { - reg2->idx = EXTRA_REG_NHMEX_M_FILTER; - if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN) - reg2->config = event->attr.config2; - else - reg2->config = ~0ULL; - if (box->pmu->pmu_idx == 0) - reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG; - else - reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG; - } - return 0; -} - -static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx) -{ - struct intel_uncore_extra_reg *er; - unsigned long flags; - u64 config; - - if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) - return box->shared_regs[idx].config; - - er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; - raw_spin_lock_irqsave(&er->lock, flags); - config = er->config; - raw_spin_unlock_irqrestore(&er->lock, flags); - return config; -} - -static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - int idx; - - idx = __BITS_VALUE(reg1->idx, 0, 8); - if (idx != 0xff) - wrmsrl(__BITS_VALUE(reg1->reg, 0, 16), - nhmex_mbox_shared_reg_config(box, idx)); - idx = __BITS_VALUE(reg1->idx, 1, 8); - if (idx != 0xff) - wrmsrl(__BITS_VALUE(reg1->reg, 1, 16), - nhmex_mbox_shared_reg_config(box, idx)); - - if (reg2->idx != EXTRA_REG_NONE) { - wrmsrl(reg2->reg, 0); - if (reg2->config != ~0ULL) { - wrmsrl(reg2->reg + 1, - reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK); - wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK & - (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT)); - wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN); - } - } - - wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); -} - -DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3"); -DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5"); -DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6"); -DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7"); -DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13"); -DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21"); -DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en, filter_cfg_en, "config2:63"); -DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33"); -DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61"); -DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63"); - -static struct attribute *nhmex_uncore_mbox_formats_attr[] = { - &format_attr_count_mode.attr, - &format_attr_storage_mode.attr, - &format_attr_wrap_mode.attr, - &format_attr_flag_mode.attr, - &format_attr_inc_sel.attr, - &format_attr_set_flag_sel.attr, - &format_attr_filter_cfg_en.attr, - &format_attr_filter_match.attr, - &format_attr_filter_mask.attr, - &format_attr_dsp.attr, - &format_attr_thr.attr, - &format_attr_fvc.attr, - &format_attr_pgt.attr, - &format_attr_map.attr, - &format_attr_iss.attr, - &format_attr_pld.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_mbox_format_group = { - .name = "format", - .attrs = nhmex_uncore_mbox_formats_attr, -}; - -static struct uncore_event_desc nhmex_uncore_mbox_events[] = { - INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"), - INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"), - { /* end: all zeroes */ }, -}; - -static struct uncore_event_desc wsmex_uncore_mbox_events[] = { - INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"), - INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"), - { /* end: all zeroes */ }, -}; - -static struct intel_uncore_ops nhmex_uncore_mbox_ops = { - NHMEX_UNCORE_OPS_COMMON_INIT(), - .enable_event = nhmex_mbox_msr_enable_event, - .hw_config = nhmex_mbox_hw_config, - .get_constraint = nhmex_mbox_get_constraint, - .put_constraint = nhmex_mbox_put_constraint, -}; - -static struct intel_uncore_type nhmex_uncore_mbox = { - .name = "mbox", - .num_counters = 6, - .num_boxes = 2, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_M0_MSR_PMU_CTL0, - .perf_ctr = NHMEX_M0_MSR_PMU_CNT0, - .event_mask = NHMEX_M_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_M0_MSR_GLOBAL_CTL, - .msr_offset = NHMEX_M_MSR_OFFSET, - .pair_ctr_ctl = 1, - .num_shared_regs = 8, - .event_descs = nhmex_uncore_mbox_events, - .ops = &nhmex_uncore_mbox_ops, - .format_group = &nhmex_uncore_mbox_format_group, -}; - -static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - - /* adjust the main event selector and extra register index */ - if (reg1->idx % 2) { - reg1->idx--; - hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; - } else { - reg1->idx++; - hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; - } - - /* adjust extra register config */ - switch (reg1->idx % 6) { - case 2: - /* shift the 8~15 bits to the 0~7 bits */ - reg1->config >>= 8; - break; - case 3: - /* shift the 0~7 bits to the 8~15 bits */ - reg1->config <<= 8; - break; - } -} - -/* - * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7. - * An event set consists of 6 events, the 3rd and 4th events in - * an event set use the same extra register. So an event set uses - * 5 extra registers. - */ -static struct event_constraint * -nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - struct intel_uncore_extra_reg *er; - unsigned long flags; - int idx, er_idx; - u64 config1; - bool ok = false; - - if (!uncore_box_is_fake(box) && reg1->alloc) - return NULL; - - idx = reg1->idx % 6; - config1 = reg1->config; -again: - er_idx = idx; - /* the 3rd and 4th events use the same extra register */ - if (er_idx > 2) - er_idx--; - er_idx += (reg1->idx / 6) * 5; - - er = &box->shared_regs[er_idx]; - raw_spin_lock_irqsave(&er->lock, flags); - if (idx < 2) { - if (!atomic_read(&er->ref) || er->config == reg1->config) { - atomic_inc(&er->ref); - er->config = reg1->config; - ok = true; - } - } else if (idx == 2 || idx == 3) { - /* - * these two events use different fields in a extra register, - * the 0~7 bits and the 8~15 bits respectively. - */ - u64 mask = 0xff << ((idx - 2) * 8); - if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) || - !((er->config ^ config1) & mask)) { - atomic_add(1 << ((idx - 2) * 8), &er->ref); - er->config &= ~mask; - er->config |= config1 & mask; - ok = true; - } - } else { - if (!atomic_read(&er->ref) || - (er->config == (hwc->config >> 32) && - er->config1 == reg1->config && - er->config2 == reg2->config)) { - atomic_inc(&er->ref); - er->config = (hwc->config >> 32); - er->config1 = reg1->config; - er->config2 = reg2->config; - ok = true; - } - } - raw_spin_unlock_irqrestore(&er->lock, flags); - - if (!ok) { - /* - * The Rbox events are always in pairs. The paired - * events are functional identical, but use different - * extra registers. If we failed to take an extra - * register, try the alternative. - */ - idx ^= 1; - if (idx != reg1->idx % 6) { - if (idx == 2) - config1 >>= 8; - else if (idx == 3) - config1 <<= 8; - goto again; - } - } else { - if (!uncore_box_is_fake(box)) { - if (idx != reg1->idx % 6) - nhmex_rbox_alter_er(box, event); - reg1->alloc = 1; - } - return NULL; - } - return &uncore_constraint_empty; -} - -static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct intel_uncore_extra_reg *er; - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - int idx, er_idx; - - if (uncore_box_is_fake(box) || !reg1->alloc) - return; - - idx = reg1->idx % 6; - er_idx = idx; - if (er_idx > 2) - er_idx--; - er_idx += (reg1->idx / 6) * 5; - - er = &box->shared_regs[er_idx]; - if (idx == 2 || idx == 3) - atomic_sub(1 << ((idx - 2) * 8), &er->ref); - else - atomic_dec(&er->ref); - - reg1->alloc = 0; -} - -static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - int idx; - - idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >> - NHMEX_R_PMON_CTL_EV_SEL_SHIFT; - if (idx >= 0x18) - return -EINVAL; - - reg1->idx = idx; - reg1->config = event->attr.config1; - - switch (idx % 6) { - case 4: - case 5: - hwc->config |= event->attr.config & (~0ULL << 32); - reg2->config = event->attr.config2; - break; - } - return 0; -} - -static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - int idx, port; - - idx = reg1->idx; - port = idx / 6 + box->pmu->pmu_idx * 4; - - switch (idx % 6) { - case 0: - wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config); - break; - case 1: - wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config); - break; - case 2: - case 3: - wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port), - uncore_shared_reg_config(box, 2 + (idx / 6) * 5)); - break; - case 4: - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port), - hwc->config >> 32); - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config); - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config); - break; - case 5: - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port), - hwc->config >> 32); - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config); - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config); - break; - } - - wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | - (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK)); -} - -DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63"); -DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63"); -DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63"); -DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15"); -DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31"); - -static struct attribute *nhmex_uncore_rbox_formats_attr[] = { - &format_attr_event5.attr, - &format_attr_xbr_mm_cfg.attr, - &format_attr_xbr_match.attr, - &format_attr_xbr_mask.attr, - &format_attr_qlx_cfg.attr, - &format_attr_iperf_cfg.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_rbox_format_group = { - .name = "format", - .attrs = nhmex_uncore_rbox_formats_attr, -}; - -static struct uncore_event_desc nhmex_uncore_rbox_events[] = { - INTEL_UNCORE_EVENT_DESC(qpi0_flit_send, "event=0x0,iperf_cfg=0x80000000"), - INTEL_UNCORE_EVENT_DESC(qpi1_filt_send, "event=0x6,iperf_cfg=0x80000000"), - INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt, "event=0x0,iperf_cfg=0x40000000"), - INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt, "event=0x6,iperf_cfg=0x40000000"), - INTEL_UNCORE_EVENT_DESC(qpi0_date_response, "event=0x0,iperf_cfg=0xc4"), - INTEL_UNCORE_EVENT_DESC(qpi1_date_response, "event=0x6,iperf_cfg=0xc4"), - { /* end: all zeroes */ }, -}; - -static struct intel_uncore_ops nhmex_uncore_rbox_ops = { - NHMEX_UNCORE_OPS_COMMON_INIT(), - .enable_event = nhmex_rbox_msr_enable_event, - .hw_config = nhmex_rbox_hw_config, - .get_constraint = nhmex_rbox_get_constraint, - .put_constraint = nhmex_rbox_put_constraint, -}; - -static struct intel_uncore_type nhmex_uncore_rbox = { - .name = "rbox", - .num_counters = 8, - .num_boxes = 2, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_R_MSR_PMON_CTL0, - .perf_ctr = NHMEX_R_MSR_PMON_CNT0, - .event_mask = NHMEX_R_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_R_MSR_GLOBAL_CTL, - .msr_offset = NHMEX_R_MSR_OFFSET, - .pair_ctr_ctl = 1, - .num_shared_regs = 20, - .event_descs = nhmex_uncore_rbox_events, - .ops = &nhmex_uncore_rbox_ops, - .format_group = &nhmex_uncore_rbox_format_group -}; - -static struct intel_uncore_type *nhmex_msr_uncores[] = { - &nhmex_uncore_ubox, - &nhmex_uncore_cbox, - &nhmex_uncore_bbox, - &nhmex_uncore_sbox, - &nhmex_uncore_mbox, - &nhmex_uncore_rbox, - &nhmex_uncore_wbox, - NULL, -}; - -void nhmex_uncore_cpu_init(void) -{ - if (boot_cpu_data.x86_model == 46) - uncore_nhmex = true; - else - nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events; - if (nhmex_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - nhmex_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; - uncore_msr_uncores = nhmex_msr_uncores; -} -/* end of Nehalem-EX uncore support */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c deleted file mode 100644 index 2bd030ddd0db..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c +++ /dev/null @@ -1,717 +0,0 @@ -/* Nehalem/SandBridge/Haswell uncore support */ -#include "perf_event_intel_uncore.h" - -/* Uncore IMC PCI IDs */ -#define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100 -#define PCI_DEVICE_ID_INTEL_IVB_IMC 0x0154 -#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150 -#define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 -#define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 -#define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604 -#define PCI_DEVICE_ID_INTEL_SKL_IMC 0x191f - -/* SNB event control */ -#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff -#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00 -#define SNB_UNC_CTL_EDGE_DET (1 << 18) -#define SNB_UNC_CTL_EN (1 << 22) -#define SNB_UNC_CTL_INVERT (1 << 23) -#define SNB_UNC_CTL_CMASK_MASK 0x1f000000 -#define NHM_UNC_CTL_CMASK_MASK 0xff000000 -#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0) - -#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ - SNB_UNC_CTL_UMASK_MASK | \ - SNB_UNC_CTL_EDGE_DET | \ - SNB_UNC_CTL_INVERT | \ - SNB_UNC_CTL_CMASK_MASK) - -#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ - SNB_UNC_CTL_UMASK_MASK | \ - SNB_UNC_CTL_EDGE_DET | \ - SNB_UNC_CTL_INVERT | \ - NHM_UNC_CTL_CMASK_MASK) - -/* SNB global control register */ -#define SNB_UNC_PERF_GLOBAL_CTL 0x391 -#define SNB_UNC_FIXED_CTR_CTRL 0x394 -#define SNB_UNC_FIXED_CTR 0x395 - -/* SNB uncore global control */ -#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1) -#define SNB_UNC_GLOBAL_CTL_EN (1 << 29) - -/* SNB Cbo register */ -#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700 -#define SNB_UNC_CBO_0_PER_CTR0 0x706 -#define SNB_UNC_CBO_MSR_OFFSET 0x10 - -/* SNB ARB register */ -#define SNB_UNC_ARB_PER_CTR0 0x3b0 -#define SNB_UNC_ARB_PERFEVTSEL0 0x3b2 -#define SNB_UNC_ARB_MSR_OFFSET 0x10 - -/* NHM global control register */ -#define NHM_UNC_PERF_GLOBAL_CTL 0x391 -#define NHM_UNC_FIXED_CTR 0x394 -#define NHM_UNC_FIXED_CTR_CTRL 0x395 - -/* NHM uncore global control */ -#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1) -#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32) - -/* NHM uncore register */ -#define NHM_UNC_PERFEVTSEL0 0x3c0 -#define NHM_UNC_UNCORE_PMC0 0x3b0 - -DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); -DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); -DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); -DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); -DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); -DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); - -/* Sandy Bridge uncore support */ -static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (hwc->idx < UNCORE_PMC_IDX_FIXED) - wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); - else - wrmsrl(hwc->config_base, SNB_UNC_CTL_EN); -} - -static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - wrmsrl(event->hw.config_base, 0); -} - -static void snb_uncore_msr_init_box(struct intel_uncore_box *box) -{ - if (box->pmu->pmu_idx == 0) { - wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, - SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); - } -} - -static struct uncore_event_desc snb_uncore_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), - { /* end: all zeroes */ }, -}; - -static struct attribute *snb_uncore_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_cmask5.attr, - NULL, -}; - -static struct attribute_group snb_uncore_format_group = { - .name = "format", - .attrs = snb_uncore_formats_attr, -}; - -static struct intel_uncore_ops snb_uncore_msr_ops = { - .init_box = snb_uncore_msr_init_box, - .disable_event = snb_uncore_msr_disable_event, - .enable_event = snb_uncore_msr_enable_event, - .read_counter = uncore_msr_read_counter, -}; - -static struct event_constraint snb_uncore_arb_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x80, 0x1), - UNCORE_EVENT_CONSTRAINT(0x83, 0x1), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type snb_uncore_cbox = { - .name = "cbox", - .num_counters = 2, - .num_boxes = 4, - .perf_ctr_bits = 44, - .fixed_ctr_bits = 48, - .perf_ctr = SNB_UNC_CBO_0_PER_CTR0, - .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, - .fixed_ctr = SNB_UNC_FIXED_CTR, - .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL, - .single_fixed = 1, - .event_mask = SNB_UNC_RAW_EVENT_MASK, - .msr_offset = SNB_UNC_CBO_MSR_OFFSET, - .ops = &snb_uncore_msr_ops, - .format_group = &snb_uncore_format_group, - .event_descs = snb_uncore_events, -}; - -static struct intel_uncore_type snb_uncore_arb = { - .name = "arb", - .num_counters = 2, - .num_boxes = 1, - .perf_ctr_bits = 44, - .perf_ctr = SNB_UNC_ARB_PER_CTR0, - .event_ctl = SNB_UNC_ARB_PERFEVTSEL0, - .event_mask = SNB_UNC_RAW_EVENT_MASK, - .msr_offset = SNB_UNC_ARB_MSR_OFFSET, - .constraints = snb_uncore_arb_constraints, - .ops = &snb_uncore_msr_ops, - .format_group = &snb_uncore_format_group, -}; - -static struct intel_uncore_type *snb_msr_uncores[] = { - &snb_uncore_cbox, - &snb_uncore_arb, - NULL, -}; - -void snb_uncore_cpu_init(void) -{ - uncore_msr_uncores = snb_msr_uncores; - if (snb_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; -} - -enum { - SNB_PCI_UNCORE_IMC, -}; - -static struct uncore_event_desc snb_uncore_imc_events[] = { - INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"), - INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"), - - INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"), - INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"), - - { /* end: all zeroes */ }, -}; - -#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff -#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48 - -/* page size multiple covering all config regs */ -#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000 - -#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1 -#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050 -#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2 -#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054 -#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE - -static struct attribute *snb_uncore_imc_formats_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group snb_uncore_imc_format_group = { - .name = "format", - .attrs = snb_uncore_imc_formats_attr, -}; - -static void snb_uncore_imc_init_box(struct intel_uncore_box *box) -{ - struct pci_dev *pdev = box->pci_dev; - int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET; - resource_size_t addr; - u32 pci_dword; - - pci_read_config_dword(pdev, where, &pci_dword); - addr = pci_dword; - -#ifdef CONFIG_PHYS_ADDR_T_64BIT - pci_read_config_dword(pdev, where + 4, &pci_dword); - addr |= ((resource_size_t)pci_dword << 32); -#endif - - addr &= ~(PAGE_SIZE - 1); - - box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); - box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; -} - -static void snb_uncore_imc_enable_box(struct intel_uncore_box *box) -{} - -static void snb_uncore_imc_disable_box(struct intel_uncore_box *box) -{} - -static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{} - -static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event) -{} - -static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - return (u64)*(unsigned int *)(box->io_addr + hwc->event_base); -} - -/* - * custom event_init() function because we define our own fixed, free - * running counters, so we do not want to conflict with generic uncore - * logic. Also simplifies processing - */ -static int snb_uncore_imc_event_init(struct perf_event *event) -{ - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - struct hw_perf_event *hwc = &event->hw; - u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK; - int idx, base; - - if (event->attr.type != event->pmu->type) - return -ENOENT; - - pmu = uncore_event_to_pmu(event); - /* no device found for this pmu */ - if (pmu->func_id < 0) - return -ENOENT; - - /* Sampling not supported yet */ - if (hwc->sample_period) - return -EINVAL; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - /* - * Place all uncore events for a particular physical package - * onto a single cpu - */ - if (event->cpu < 0) - return -EINVAL; - - /* check only supported bits are set */ - if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK) - return -EINVAL; - - box = uncore_pmu_to_box(pmu, event->cpu); - if (!box || box->cpu < 0) - return -EINVAL; - - event->cpu = box->cpu; - - event->hw.idx = -1; - event->hw.last_tag = ~0ULL; - event->hw.extra_reg.idx = EXTRA_REG_NONE; - event->hw.branch_reg.idx = EXTRA_REG_NONE; - /* - * check event is known (whitelist, determines counter) - */ - switch (cfg) { - case SNB_UNCORE_PCI_IMC_DATA_READS: - base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE; - idx = UNCORE_PMC_IDX_FIXED; - break; - case SNB_UNCORE_PCI_IMC_DATA_WRITES: - base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE; - idx = UNCORE_PMC_IDX_FIXED + 1; - break; - default: - return -EINVAL; - } - - /* must be done before validate_group */ - event->hw.event_base = base; - event->hw.config = cfg; - event->hw.idx = idx; - - /* no group validation needed, we have free running counters */ - - return 0; -} - -static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - return 0; -} - -static void snb_uncore_imc_event_start(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - u64 count; - - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) - return; - - event->hw.state = 0; - box->n_active++; - - list_add_tail(&event->active_entry, &box->active_list); - - count = snb_uncore_imc_read_counter(box, event); - local64_set(&event->hw.prev_count, count); - - if (box->n_active == 1) - uncore_pmu_start_hrtimer(box); -} - -static void snb_uncore_imc_event_stop(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - struct hw_perf_event *hwc = &event->hw; - - if (!(hwc->state & PERF_HES_STOPPED)) { - box->n_active--; - - WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); - hwc->state |= PERF_HES_STOPPED; - - list_del(&event->active_entry); - - if (box->n_active == 0) - uncore_pmu_cancel_hrtimer(box); - } - - if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - /* - * Drain the remaining delta count out of a event - * that we are disabling: - */ - uncore_perf_event_update(box, event); - hwc->state |= PERF_HES_UPTODATE; - } -} - -static int snb_uncore_imc_event_add(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - struct hw_perf_event *hwc = &event->hw; - - if (!box) - return -ENODEV; - - hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - if (!(flags & PERF_EF_START)) - hwc->state |= PERF_HES_ARCH; - - snb_uncore_imc_event_start(event, 0); - - box->n_events++; - - return 0; -} - -static void snb_uncore_imc_event_del(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - int i; - - snb_uncore_imc_event_stop(event, PERF_EF_UPDATE); - - for (i = 0; i < box->n_events; i++) { - if (event == box->event_list[i]) { - --box->n_events; - break; - } - } -} - -int snb_pci2phy_map_init(int devid) -{ - struct pci_dev *dev = NULL; - struct pci2phy_map *map; - int bus, segment; - - dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev); - if (!dev) - return -ENOTTY; - - bus = dev->bus->number; - segment = pci_domain_nr(dev->bus); - - raw_spin_lock(&pci2phy_map_lock); - map = __find_pci2phy_map(segment); - if (!map) { - raw_spin_unlock(&pci2phy_map_lock); - pci_dev_put(dev); - return -ENOMEM; - } - map->pbus_to_physid[bus] = 0; - raw_spin_unlock(&pci2phy_map_lock); - - pci_dev_put(dev); - - return 0; -} - -static struct pmu snb_uncore_imc_pmu = { - .task_ctx_nr = perf_invalid_context, - .event_init = snb_uncore_imc_event_init, - .add = snb_uncore_imc_event_add, - .del = snb_uncore_imc_event_del, - .start = snb_uncore_imc_event_start, - .stop = snb_uncore_imc_event_stop, - .read = uncore_pmu_event_read, -}; - -static struct intel_uncore_ops snb_uncore_imc_ops = { - .init_box = snb_uncore_imc_init_box, - .enable_box = snb_uncore_imc_enable_box, - .disable_box = snb_uncore_imc_disable_box, - .disable_event = snb_uncore_imc_disable_event, - .enable_event = snb_uncore_imc_enable_event, - .hw_config = snb_uncore_imc_hw_config, - .read_counter = snb_uncore_imc_read_counter, -}; - -static struct intel_uncore_type snb_uncore_imc = { - .name = "imc", - .num_counters = 2, - .num_boxes = 1, - .fixed_ctr_bits = 32, - .fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE, - .event_descs = snb_uncore_imc_events, - .format_group = &snb_uncore_imc_format_group, - .perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE, - .event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK, - .ops = &snb_uncore_imc_ops, - .pmu = &snb_uncore_imc_pmu, -}; - -static struct intel_uncore_type *snb_pci_uncores[] = { - [SNB_PCI_UNCORE_IMC] = &snb_uncore_imc, - NULL, -}; - -static const struct pci_device_id snb_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* end: all zeroes */ }, -}; - -static const struct pci_device_id ivb_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_E3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* end: all zeroes */ }, -}; - -static const struct pci_device_id hsw_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* end: all zeroes */ }, -}; - -static const struct pci_device_id bdw_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* end: all zeroes */ }, -}; - -static const struct pci_device_id skl_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* end: all zeroes */ }, -}; - -static struct pci_driver snb_uncore_pci_driver = { - .name = "snb_uncore", - .id_table = snb_uncore_pci_ids, -}; - -static struct pci_driver ivb_uncore_pci_driver = { - .name = "ivb_uncore", - .id_table = ivb_uncore_pci_ids, -}; - -static struct pci_driver hsw_uncore_pci_driver = { - .name = "hsw_uncore", - .id_table = hsw_uncore_pci_ids, -}; - -static struct pci_driver bdw_uncore_pci_driver = { - .name = "bdw_uncore", - .id_table = bdw_uncore_pci_ids, -}; - -static struct pci_driver skl_uncore_pci_driver = { - .name = "skl_uncore", - .id_table = skl_uncore_pci_ids, -}; - -struct imc_uncore_pci_dev { - __u32 pci_id; - struct pci_driver *driver; -}; -#define IMC_DEV(a, d) \ - { .pci_id = PCI_DEVICE_ID_INTEL_##a, .driver = (d) } - -static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { - IMC_DEV(SNB_IMC, &snb_uncore_pci_driver), - IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver), /* 3rd Gen Core processor */ - IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */ - IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */ - IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */ - IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver), /* 5th Gen Core U */ - IMC_DEV(SKL_IMC, &skl_uncore_pci_driver), /* 6th Gen Core */ - { /* end marker */ } -}; - - -#define for_each_imc_pci_id(x, t) \ - for (x = (t); (x)->pci_id; x++) - -static struct pci_driver *imc_uncore_find_dev(void) -{ - const struct imc_uncore_pci_dev *p; - int ret; - - for_each_imc_pci_id(p, desktop_imc_pci_ids) { - ret = snb_pci2phy_map_init(p->pci_id); - if (ret == 0) - return p->driver; - } - return NULL; -} - -static int imc_uncore_pci_init(void) -{ - struct pci_driver *imc_drv = imc_uncore_find_dev(); - - if (!imc_drv) - return -ENODEV; - - uncore_pci_uncores = snb_pci_uncores; - uncore_pci_driver = imc_drv; - - return 0; -} - -int snb_uncore_pci_init(void) -{ - return imc_uncore_pci_init(); -} - -int ivb_uncore_pci_init(void) -{ - return imc_uncore_pci_init(); -} -int hsw_uncore_pci_init(void) -{ - return imc_uncore_pci_init(); -} - -int bdw_uncore_pci_init(void) -{ - return imc_uncore_pci_init(); -} - -int skl_uncore_pci_init(void) -{ - return imc_uncore_pci_init(); -} - -/* end of Sandy Bridge uncore support */ - -/* Nehalem uncore support */ -static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box) -{ - wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0); -} - -static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box) -{ - wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC); -} - -static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (hwc->idx < UNCORE_PMC_IDX_FIXED) - wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); - else - wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN); -} - -static struct attribute *nhm_uncore_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_cmask8.attr, - NULL, -}; - -static struct attribute_group nhm_uncore_format_group = { - .name = "format", - .attrs = nhm_uncore_formats_attr, -}; - -static struct uncore_event_desc nhm_uncore_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), - INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"), - INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"), - INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"), - INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"), - INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"), - INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"), - INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"), - INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"), - { /* end: all zeroes */ }, -}; - -static struct intel_uncore_ops nhm_uncore_msr_ops = { - .disable_box = nhm_uncore_msr_disable_box, - .enable_box = nhm_uncore_msr_enable_box, - .disable_event = snb_uncore_msr_disable_event, - .enable_event = nhm_uncore_msr_enable_event, - .read_counter = uncore_msr_read_counter, -}; - -static struct intel_uncore_type nhm_uncore = { - .name = "", - .num_counters = 8, - .num_boxes = 1, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .event_ctl = NHM_UNC_PERFEVTSEL0, - .perf_ctr = NHM_UNC_UNCORE_PMC0, - .fixed_ctr = NHM_UNC_FIXED_CTR, - .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL, - .event_mask = NHM_UNC_RAW_EVENT_MASK, - .event_descs = nhm_uncore_events, - .ops = &nhm_uncore_msr_ops, - .format_group = &nhm_uncore_format_group, -}; - -static struct intel_uncore_type *nhm_msr_uncores[] = { - &nhm_uncore, - NULL, -}; - -void nhm_uncore_cpu_init(void) -{ - uncore_msr_uncores = nhm_msr_uncores; -} - -/* end of Nehalem uncore support */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c deleted file mode 100644 index 33acb884ccf1..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ /dev/null @@ -1,3126 +0,0 @@ -/* SandyBridge-EP/IvyTown uncore support */ -#include "perf_event_intel_uncore.h" - - -/* SNB-EP Box level control */ -#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0) -#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1) -#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8) -#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16) -#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ - SNBEP_PMON_BOX_CTL_RST_CTRS | \ - SNBEP_PMON_BOX_CTL_FRZ_EN) -/* SNB-EP event control */ -#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff -#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00 -#define SNBEP_PMON_CTL_RST (1 << 17) -#define SNBEP_PMON_CTL_EDGE_DET (1 << 18) -#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21) -#define SNBEP_PMON_CTL_EN (1 << 22) -#define SNBEP_PMON_CTL_INVERT (1 << 23) -#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000 -#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_UMASK_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_PMON_CTL_INVERT | \ - SNBEP_PMON_CTL_TRESH_MASK) - -/* SNB-EP Ubox event control */ -#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000 -#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \ - (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_UMASK_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_PMON_CTL_INVERT | \ - SNBEP_U_MSR_PMON_CTL_TRESH_MASK) - -#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19) -#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ - SNBEP_CBO_PMON_CTL_TID_EN) - -/* SNB-EP PCU event control */ -#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000 -#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000 -#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30) -#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31) -#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ - (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_PMON_CTL_EV_SEL_EXT | \ - SNBEP_PMON_CTL_INVERT | \ - SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) - -#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK \ - (SNBEP_PMON_RAW_EVENT_MASK | \ - SNBEP_PMON_CTL_EV_SEL_EXT) - -/* SNB-EP pci control register */ -#define SNBEP_PCI_PMON_BOX_CTL 0xf4 -#define SNBEP_PCI_PMON_CTL0 0xd8 -/* SNB-EP pci counter register */ -#define SNBEP_PCI_PMON_CTR0 0xa0 - -/* SNB-EP home agent register */ -#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40 -#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44 -#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48 -/* SNB-EP memory controller register */ -#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0 -#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0 -/* SNB-EP QPI register */ -#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228 -#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c -#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238 -#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c - -/* SNB-EP Ubox register */ -#define SNBEP_U_MSR_PMON_CTR0 0xc16 -#define SNBEP_U_MSR_PMON_CTL0 0xc10 - -#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08 -#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09 - -/* SNB-EP Cbo register */ -#define SNBEP_C0_MSR_PMON_CTR0 0xd16 -#define SNBEP_C0_MSR_PMON_CTL0 0xd10 -#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04 -#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14 -#define SNBEP_CBO_MSR_OFFSET 0x20 - -#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID 0x1f -#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID 0x3fc00 -#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE 0x7c0000 -#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC 0xff800000 - -#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) { \ - .event = (e), \ - .msr = SNBEP_C0_MSR_PMON_BOX_FILTER, \ - .config_mask = (m), \ - .idx = (i) \ -} - -/* SNB-EP PCU register */ -#define SNBEP_PCU_MSR_PMON_CTR0 0xc36 -#define SNBEP_PCU_MSR_PMON_CTL0 0xc30 -#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24 -#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34 -#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff -#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc -#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd - -/* IVBEP event control */ -#define IVBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ - SNBEP_PMON_BOX_CTL_RST_CTRS) -#define IVBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_UMASK_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_PMON_CTL_TRESH_MASK) -/* IVBEP Ubox */ -#define IVBEP_U_MSR_PMON_GLOBAL_CTL 0xc00 -#define IVBEP_U_PMON_GLOBAL_FRZ_ALL (1 << 31) -#define IVBEP_U_PMON_GLOBAL_UNFRZ_ALL (1 << 29) - -#define IVBEP_U_MSR_PMON_RAW_EVENT_MASK \ - (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_UMASK_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_U_MSR_PMON_CTL_TRESH_MASK) -/* IVBEP Cbo */ -#define IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK (IVBEP_PMON_RAW_EVENT_MASK | \ - SNBEP_CBO_PMON_CTL_TID_EN) - -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_TID (0x1fULL << 0) -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 5) -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE (0x3fULL << 17) -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32) -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52) -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61) -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62) -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63) - -/* IVBEP home agent */ -#define IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST (1 << 16) -#define IVBEP_HA_PCI_PMON_RAW_EVENT_MASK \ - (IVBEP_PMON_RAW_EVENT_MASK | \ - IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST) -/* IVBEP PCU */ -#define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ - (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_EV_SEL_EXT | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) -/* IVBEP QPI */ -#define IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK \ - (IVBEP_PMON_RAW_EVENT_MASK | \ - SNBEP_PMON_CTL_EV_SEL_EXT) - -#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \ - ((1ULL << (n)) - 1))) - -/* Haswell-EP Ubox */ -#define HSWEP_U_MSR_PMON_CTR0 0x709 -#define HSWEP_U_MSR_PMON_CTL0 0x705 -#define HSWEP_U_MSR_PMON_FILTER 0x707 - -#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTL 0x703 -#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTR 0x704 - -#define HSWEP_U_MSR_PMON_BOX_FILTER_TID (0x1 << 0) -#define HSWEP_U_MSR_PMON_BOX_FILTER_CID (0x1fULL << 1) -#define HSWEP_U_MSR_PMON_BOX_FILTER_MASK \ - (HSWEP_U_MSR_PMON_BOX_FILTER_TID | \ - HSWEP_U_MSR_PMON_BOX_FILTER_CID) - -/* Haswell-EP CBo */ -#define HSWEP_C0_MSR_PMON_CTR0 0xe08 -#define HSWEP_C0_MSR_PMON_CTL0 0xe01 -#define HSWEP_C0_MSR_PMON_BOX_CTL 0xe00 -#define HSWEP_C0_MSR_PMON_BOX_FILTER0 0xe05 -#define HSWEP_CBO_MSR_OFFSET 0x10 - - -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_TID (0x3fULL << 0) -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 6) -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE (0x7fULL << 17) -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32) -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52) -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61) -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62) -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63) - - -/* Haswell-EP Sbox */ -#define HSWEP_S0_MSR_PMON_CTR0 0x726 -#define HSWEP_S0_MSR_PMON_CTL0 0x721 -#define HSWEP_S0_MSR_PMON_BOX_CTL 0x720 -#define HSWEP_SBOX_MSR_OFFSET 0xa -#define HSWEP_S_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ - SNBEP_CBO_PMON_CTL_TID_EN) - -/* Haswell-EP PCU */ -#define HSWEP_PCU_MSR_PMON_CTR0 0x717 -#define HSWEP_PCU_MSR_PMON_CTL0 0x711 -#define HSWEP_PCU_MSR_PMON_BOX_CTL 0x710 -#define HSWEP_PCU_MSR_PMON_BOX_FILTER 0x715 - -/* KNL Ubox */ -#define KNL_U_MSR_PMON_RAW_EVENT_MASK \ - (SNBEP_U_MSR_PMON_RAW_EVENT_MASK | \ - SNBEP_CBO_PMON_CTL_TID_EN) -/* KNL CHA */ -#define KNL_CHA_MSR_OFFSET 0xc -#define KNL_CHA_MSR_PMON_CTL_QOR (1 << 16) -#define KNL_CHA_MSR_PMON_RAW_EVENT_MASK \ - (SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK | \ - KNL_CHA_MSR_PMON_CTL_QOR) -#define KNL_CHA_MSR_PMON_BOX_FILTER_TID 0x1ff -#define KNL_CHA_MSR_PMON_BOX_FILTER_STATE (7 << 18) -#define KNL_CHA_MSR_PMON_BOX_FILTER_OP (0xfffffe2aULL << 32) - -/* KNL EDC/MC UCLK */ -#define KNL_UCLK_MSR_PMON_CTR0_LOW 0x400 -#define KNL_UCLK_MSR_PMON_CTL0 0x420 -#define KNL_UCLK_MSR_PMON_BOX_CTL 0x430 -#define KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW 0x44c -#define KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL 0x454 -#define KNL_PMON_FIXED_CTL_EN 0x1 - -/* KNL EDC */ -#define KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW 0xa00 -#define KNL_EDC0_ECLK_MSR_PMON_CTL0 0xa20 -#define KNL_EDC0_ECLK_MSR_PMON_BOX_CTL 0xa30 -#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW 0xa3c -#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL 0xa44 - -/* KNL MC */ -#define KNL_MC0_CH0_MSR_PMON_CTR0_LOW 0xb00 -#define KNL_MC0_CH0_MSR_PMON_CTL0 0xb20 -#define KNL_MC0_CH0_MSR_PMON_BOX_CTL 0xb30 -#define KNL_MC0_CH0_MSR_PMON_FIXED_LOW 0xb3c -#define KNL_MC0_CH0_MSR_PMON_FIXED_CTL 0xb44 - -/* KNL IRP */ -#define KNL_IRP_PCI_PMON_BOX_CTL 0xf0 -#define KNL_IRP_PCI_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ - KNL_CHA_MSR_PMON_CTL_QOR) -/* KNL PCU */ -#define KNL_PCU_PMON_CTL_EV_SEL_MASK 0x0000007f -#define KNL_PCU_PMON_CTL_USE_OCC_CTR (1 << 7) -#define KNL_PCU_MSR_PMON_CTL_TRESH_MASK 0x3f000000 -#define KNL_PCU_MSR_PMON_RAW_EVENT_MASK \ - (KNL_PCU_PMON_CTL_EV_SEL_MASK | \ - KNL_PCU_PMON_CTL_USE_OCC_CTR | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_CBO_PMON_CTL_TID_EN | \ - SNBEP_PMON_CTL_EV_SEL_EXT | \ - SNBEP_PMON_CTL_INVERT | \ - KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) - -DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); -DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6"); -DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); -DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7"); -DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); -DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16"); -DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); -DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); -DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); -DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); -DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29"); -DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28"); -DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); -DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); -DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); -DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31"); -DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); -DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0"); -DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5"); -DEFINE_UNCORE_FORMAT_ATTR(filter_tid4, filter_tid, "config1:0-8"); -DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5"); -DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8"); -DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8"); -DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12"); -DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); -DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47"); -DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); -DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22"); -DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23"); -DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20"); -DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33"); -DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35"); -DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37"); -DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31"); -DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60"); -DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60"); -DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62"); -DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61"); -DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63"); -DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7"); -DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15"); -DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23"); -DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31"); -DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51"); -DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35"); -DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31"); -DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17"); -DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12"); -DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8"); -DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4"); -DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63"); -DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51"); -DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35"); -DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31"); -DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17"); -DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12"); -DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8"); -DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4"); -DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63"); - -static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) -{ - struct pci_dev *pdev = box->pci_dev; - int box_ctl = uncore_pci_box_ctl(box); - u32 config = 0; - - if (!pci_read_config_dword(pdev, box_ctl, &config)) { - config |= SNBEP_PMON_BOX_CTL_FRZ; - pci_write_config_dword(pdev, box_ctl, config); - } -} - -static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box) -{ - struct pci_dev *pdev = box->pci_dev; - int box_ctl = uncore_pci_box_ctl(box); - u32 config = 0; - - if (!pci_read_config_dword(pdev, box_ctl, &config)) { - config &= ~SNBEP_PMON_BOX_CTL_FRZ; - pci_write_config_dword(pdev, box_ctl, config); - } -} - -static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - - pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); -} - -static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - - pci_write_config_dword(pdev, hwc->config_base, hwc->config); -} - -static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - u64 count = 0; - - pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); - pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); - - return count; -} - -static void snbep_uncore_pci_init_box(struct intel_uncore_box *box) -{ - struct pci_dev *pdev = box->pci_dev; - int box_ctl = uncore_pci_box_ctl(box); - - pci_write_config_dword(pdev, box_ctl, SNBEP_PMON_BOX_CTL_INT); -} - -static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box) -{ - u64 config; - unsigned msr; - - msr = uncore_msr_box_ctl(box); - if (msr) { - rdmsrl(msr, config); - config |= SNBEP_PMON_BOX_CTL_FRZ; - wrmsrl(msr, config); - } -} - -static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box) -{ - u64 config; - unsigned msr; - - msr = uncore_msr_box_ctl(box); - if (msr) { - rdmsrl(msr, config); - config &= ~SNBEP_PMON_BOX_CTL_FRZ; - wrmsrl(msr, config); - } -} - -static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - - if (reg1->idx != EXTRA_REG_NONE) - wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0)); - - wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); -} - -static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box, - struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - wrmsrl(hwc->config_base, hwc->config); -} - -static void snbep_uncore_msr_init_box(struct intel_uncore_box *box) -{ - unsigned msr = uncore_msr_box_ctl(box); - - if (msr) - wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT); -} - -static struct attribute *snbep_uncore_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - NULL, -}; - -static struct attribute *snbep_uncore_ubox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh5.attr, - NULL, -}; - -static struct attribute *snbep_uncore_cbox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_tid_en.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - &format_attr_filter_tid.attr, - &format_attr_filter_nid.attr, - &format_attr_filter_state.attr, - &format_attr_filter_opc.attr, - NULL, -}; - -static struct attribute *snbep_uncore_pcu_formats_attr[] = { - &format_attr_event_ext.attr, - &format_attr_occ_sel.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh5.attr, - &format_attr_occ_invert.attr, - &format_attr_occ_edge.attr, - &format_attr_filter_band0.attr, - &format_attr_filter_band1.attr, - &format_attr_filter_band2.attr, - &format_attr_filter_band3.attr, - NULL, -}; - -static struct attribute *snbep_uncore_qpi_formats_attr[] = { - &format_attr_event_ext.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - &format_attr_match_rds.attr, - &format_attr_match_rnid30.attr, - &format_attr_match_rnid4.attr, - &format_attr_match_dnid.attr, - &format_attr_match_mc.attr, - &format_attr_match_opc.attr, - &format_attr_match_vnw.attr, - &format_attr_match0.attr, - &format_attr_match1.attr, - &format_attr_mask_rds.attr, - &format_attr_mask_rnid30.attr, - &format_attr_mask_rnid4.attr, - &format_attr_mask_dnid.attr, - &format_attr_mask_mc.attr, - &format_attr_mask_opc.attr, - &format_attr_mask_vnw.attr, - &format_attr_mask0.attr, - &format_attr_mask1.attr, - NULL, -}; - -static struct uncore_event_desc snbep_uncore_imc_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), - INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), - INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), - INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"), - { /* end: all zeroes */ }, -}; - -static struct uncore_event_desc snbep_uncore_qpi_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"), - INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"), - INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x102,umask=0x08"), - INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x103,umask=0x04"), - { /* end: all zeroes */ }, -}; - -static struct attribute_group snbep_uncore_format_group = { - .name = "format", - .attrs = snbep_uncore_formats_attr, -}; - -static struct attribute_group snbep_uncore_ubox_format_group = { - .name = "format", - .attrs = snbep_uncore_ubox_formats_attr, -}; - -static struct attribute_group snbep_uncore_cbox_format_group = { - .name = "format", - .attrs = snbep_uncore_cbox_formats_attr, -}; - -static struct attribute_group snbep_uncore_pcu_format_group = { - .name = "format", - .attrs = snbep_uncore_pcu_formats_attr, -}; - -static struct attribute_group snbep_uncore_qpi_format_group = { - .name = "format", - .attrs = snbep_uncore_qpi_formats_attr, -}; - -#define __SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ - .disable_box = snbep_uncore_msr_disable_box, \ - .enable_box = snbep_uncore_msr_enable_box, \ - .disable_event = snbep_uncore_msr_disable_event, \ - .enable_event = snbep_uncore_msr_enable_event, \ - .read_counter = uncore_msr_read_counter - -#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ - __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), \ - .init_box = snbep_uncore_msr_init_box \ - -static struct intel_uncore_ops snbep_uncore_msr_ops = { - SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), -}; - -#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT() \ - .init_box = snbep_uncore_pci_init_box, \ - .disable_box = snbep_uncore_pci_disable_box, \ - .enable_box = snbep_uncore_pci_enable_box, \ - .disable_event = snbep_uncore_pci_disable_event, \ - .read_counter = snbep_uncore_pci_read_counter - -static struct intel_uncore_ops snbep_uncore_pci_ops = { - SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), - .enable_event = snbep_uncore_pci_enable_event, \ -}; - -static struct event_constraint snbep_uncore_cbox_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x01, 0x1), - UNCORE_EVENT_CONSTRAINT(0x02, 0x3), - UNCORE_EVENT_CONSTRAINT(0x04, 0x3), - UNCORE_EVENT_CONSTRAINT(0x05, 0x3), - UNCORE_EVENT_CONSTRAINT(0x07, 0x3), - UNCORE_EVENT_CONSTRAINT(0x09, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x1), - UNCORE_EVENT_CONSTRAINT(0x12, 0x3), - UNCORE_EVENT_CONSTRAINT(0x13, 0x3), - UNCORE_EVENT_CONSTRAINT(0x1b, 0xc), - UNCORE_EVENT_CONSTRAINT(0x1c, 0xc), - UNCORE_EVENT_CONSTRAINT(0x1d, 0xc), - UNCORE_EVENT_CONSTRAINT(0x1e, 0xc), - EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff), - UNCORE_EVENT_CONSTRAINT(0x21, 0x3), - UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - UNCORE_EVENT_CONSTRAINT(0x31, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - UNCORE_EVENT_CONSTRAINT(0x35, 0x3), - UNCORE_EVENT_CONSTRAINT(0x36, 0x1), - UNCORE_EVENT_CONSTRAINT(0x37, 0x3), - UNCORE_EVENT_CONSTRAINT(0x38, 0x3), - UNCORE_EVENT_CONSTRAINT(0x39, 0x3), - UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), - EVENT_CONSTRAINT_END -}; - -static struct event_constraint snbep_uncore_r2pcie_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), - UNCORE_EVENT_CONSTRAINT(0x12, 0x1), - UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - UNCORE_EVENT_CONSTRAINT(0x24, 0x3), - UNCORE_EVENT_CONSTRAINT(0x25, 0x3), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - EVENT_CONSTRAINT_END -}; - -static struct event_constraint snbep_uncore_r3qpi_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), - UNCORE_EVENT_CONSTRAINT(0x12, 0x3), - UNCORE_EVENT_CONSTRAINT(0x13, 0x1), - UNCORE_EVENT_CONSTRAINT(0x20, 0x3), - UNCORE_EVENT_CONSTRAINT(0x21, 0x3), - UNCORE_EVENT_CONSTRAINT(0x22, 0x3), - UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - UNCORE_EVENT_CONSTRAINT(0x24, 0x3), - UNCORE_EVENT_CONSTRAINT(0x25, 0x3), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x28, 0x3), - UNCORE_EVENT_CONSTRAINT(0x29, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2a, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2b, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), - UNCORE_EVENT_CONSTRAINT(0x30, 0x3), - UNCORE_EVENT_CONSTRAINT(0x31, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - UNCORE_EVENT_CONSTRAINT(0x36, 0x3), - UNCORE_EVENT_CONSTRAINT(0x37, 0x3), - UNCORE_EVENT_CONSTRAINT(0x38, 0x3), - UNCORE_EVENT_CONSTRAINT(0x39, 0x3), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type snbep_uncore_ubox = { - .name = "ubox", - .num_counters = 2, - .num_boxes = 1, - .perf_ctr_bits = 44, - .fixed_ctr_bits = 48, - .perf_ctr = SNBEP_U_MSR_PMON_CTR0, - .event_ctl = SNBEP_U_MSR_PMON_CTL0, - .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, - .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, - .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, - .ops = &snbep_uncore_msr_ops, - .format_group = &snbep_uncore_ubox_format_group, -}; - -static struct extra_reg snbep_uncore_cbox_extra_regs[] = { - SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, - SNBEP_CBO_PMON_CTL_TID_EN, 0x1), - SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6), - SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6), - SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6), - SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6), - SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa), - SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa), - SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa), - SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa), - SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2), - EVENT_EXTRA_END -}; - -static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct intel_uncore_extra_reg *er = &box->shared_regs[0]; - int i; - - if (uncore_box_is_fake(box)) - return; - - for (i = 0; i < 5; i++) { - if (reg1->alloc & (0x1 << i)) - atomic_sub(1 << (i * 6), &er->ref); - } - reg1->alloc = 0; -} - -static struct event_constraint * -__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event, - u64 (*cbox_filter_mask)(int fields)) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct intel_uncore_extra_reg *er = &box->shared_regs[0]; - int i, alloc = 0; - unsigned long flags; - u64 mask; - - if (reg1->idx == EXTRA_REG_NONE) - return NULL; - - raw_spin_lock_irqsave(&er->lock, flags); - for (i = 0; i < 5; i++) { - if (!(reg1->idx & (0x1 << i))) - continue; - if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i))) - continue; - - mask = cbox_filter_mask(0x1 << i); - if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) || - !((reg1->config ^ er->config) & mask)) { - atomic_add(1 << (i * 6), &er->ref); - er->config &= ~mask; - er->config |= reg1->config & mask; - alloc |= (0x1 << i); - } else { - break; - } - } - raw_spin_unlock_irqrestore(&er->lock, flags); - if (i < 5) - goto fail; - - if (!uncore_box_is_fake(box)) - reg1->alloc |= alloc; - - return NULL; -fail: - for (; i >= 0; i--) { - if (alloc & (0x1 << i)) - atomic_sub(1 << (i * 6), &er->ref); - } - return &uncore_constraint_empty; -} - -static u64 snbep_cbox_filter_mask(int fields) -{ - u64 mask = 0; - - if (fields & 0x1) - mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID; - if (fields & 0x2) - mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID; - if (fields & 0x4) - mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE; - if (fields & 0x8) - mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC; - - return mask; -} - -static struct event_constraint * -snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask); -} - -static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct extra_reg *er; - int idx = 0; - - for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) { - if (er->event != (event->hw.config & er->config_mask)) - continue; - idx |= er->idx; - } - - if (idx) { - reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + - SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; - reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx); - reg1->idx = idx; - } - return 0; -} - -static struct intel_uncore_ops snbep_uncore_cbox_ops = { - SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), - .hw_config = snbep_cbox_hw_config, - .get_constraint = snbep_cbox_get_constraint, - .put_constraint = snbep_cbox_put_constraint, -}; - -static struct intel_uncore_type snbep_uncore_cbox = { - .name = "cbox", - .num_counters = 4, - .num_boxes = 8, - .perf_ctr_bits = 44, - .event_ctl = SNBEP_C0_MSR_PMON_CTL0, - .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, - .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, - .msr_offset = SNBEP_CBO_MSR_OFFSET, - .num_shared_regs = 1, - .constraints = snbep_uncore_cbox_constraints, - .ops = &snbep_uncore_cbox_ops, - .format_group = &snbep_uncore_cbox_format_group, -}; - -static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - u64 config = reg1->config; - - if (new_idx > reg1->idx) - config <<= 8 * (new_idx - reg1->idx); - else - config >>= 8 * (reg1->idx - new_idx); - - if (modify) { - hwc->config += new_idx - reg1->idx; - reg1->config = config; - reg1->idx = new_idx; - } - return config; -} - -static struct event_constraint * -snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct intel_uncore_extra_reg *er = &box->shared_regs[0]; - unsigned long flags; - int idx = reg1->idx; - u64 mask, config1 = reg1->config; - bool ok = false; - - if (reg1->idx == EXTRA_REG_NONE || - (!uncore_box_is_fake(box) && reg1->alloc)) - return NULL; -again: - mask = 0xffULL << (idx * 8); - raw_spin_lock_irqsave(&er->lock, flags); - if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) || - !((config1 ^ er->config) & mask)) { - atomic_add(1 << (idx * 8), &er->ref); - er->config &= ~mask; - er->config |= config1 & mask; - ok = true; - } - raw_spin_unlock_irqrestore(&er->lock, flags); - - if (!ok) { - idx = (idx + 1) % 4; - if (idx != reg1->idx) { - config1 = snbep_pcu_alter_er(event, idx, false); - goto again; - } - return &uncore_constraint_empty; - } - - if (!uncore_box_is_fake(box)) { - if (idx != reg1->idx) - snbep_pcu_alter_er(event, idx, true); - reg1->alloc = 1; - } - return NULL; -} - -static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct intel_uncore_extra_reg *er = &box->shared_regs[0]; - - if (uncore_box_is_fake(box) || !reg1->alloc) - return; - - atomic_sub(1 << (reg1->idx * 8), &er->ref); - reg1->alloc = 0; -} - -static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK; - - if (ev_sel >= 0xb && ev_sel <= 0xe) { - reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER; - reg1->idx = ev_sel - 0xb; - reg1->config = event->attr.config1 & (0xff << (reg1->idx * 8)); - } - return 0; -} - -static struct intel_uncore_ops snbep_uncore_pcu_ops = { - SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), - .hw_config = snbep_pcu_hw_config, - .get_constraint = snbep_pcu_get_constraint, - .put_constraint = snbep_pcu_put_constraint, -}; - -static struct intel_uncore_type snbep_uncore_pcu = { - .name = "pcu", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, - .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, - .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &snbep_uncore_pcu_ops, - .format_group = &snbep_uncore_pcu_format_group, -}; - -static struct intel_uncore_type *snbep_msr_uncores[] = { - &snbep_uncore_ubox, - &snbep_uncore_cbox, - &snbep_uncore_pcu, - NULL, -}; - -void snbep_uncore_cpu_init(void) -{ - if (snbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - snbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; - uncore_msr_uncores = snbep_msr_uncores; -} - -enum { - SNBEP_PCI_QPI_PORT0_FILTER, - SNBEP_PCI_QPI_PORT1_FILTER, - HSWEP_PCI_PCU_3, -}; - -static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - - if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) { - reg1->idx = 0; - reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0; - reg1->config = event->attr.config1; - reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0; - reg2->config = event->attr.config2; - } - return 0; -} - -static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - - if (reg1->idx != EXTRA_REG_NONE) { - int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER; - struct pci_dev *filter_pdev = uncore_extra_pci_dev[box->phys_id][idx]; - if (filter_pdev) { - pci_write_config_dword(filter_pdev, reg1->reg, - (u32)reg1->config); - pci_write_config_dword(filter_pdev, reg1->reg + 4, - (u32)(reg1->config >> 32)); - pci_write_config_dword(filter_pdev, reg2->reg, - (u32)reg2->config); - pci_write_config_dword(filter_pdev, reg2->reg + 4, - (u32)(reg2->config >> 32)); - } - } - - pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); -} - -static struct intel_uncore_ops snbep_uncore_qpi_ops = { - SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), - .enable_event = snbep_qpi_enable_event, - .hw_config = snbep_qpi_hw_config, - .get_constraint = uncore_get_constraint, - .put_constraint = uncore_put_constraint, -}; - -#define SNBEP_UNCORE_PCI_COMMON_INIT() \ - .perf_ctr = SNBEP_PCI_PMON_CTR0, \ - .event_ctl = SNBEP_PCI_PMON_CTL0, \ - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \ - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ - .ops = &snbep_uncore_pci_ops, \ - .format_group = &snbep_uncore_format_group - -static struct intel_uncore_type snbep_uncore_ha = { - .name = "ha", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type snbep_uncore_imc = { - .name = "imc", - .num_counters = 4, - .num_boxes = 4, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, - .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, - .event_descs = snbep_uncore_imc_events, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type snbep_uncore_qpi = { - .name = "qpi", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCI_PMON_CTR0, - .event_ctl = SNBEP_PCI_PMON_CTL0, - .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &snbep_uncore_qpi_ops, - .event_descs = snbep_uncore_qpi_events, - .format_group = &snbep_uncore_qpi_format_group, -}; - - -static struct intel_uncore_type snbep_uncore_r2pcie = { - .name = "r2pcie", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 44, - .constraints = snbep_uncore_r2pcie_constraints, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type snbep_uncore_r3qpi = { - .name = "r3qpi", - .num_counters = 3, - .num_boxes = 2, - .perf_ctr_bits = 44, - .constraints = snbep_uncore_r3qpi_constraints, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -enum { - SNBEP_PCI_UNCORE_HA, - SNBEP_PCI_UNCORE_IMC, - SNBEP_PCI_UNCORE_QPI, - SNBEP_PCI_UNCORE_R2PCIE, - SNBEP_PCI_UNCORE_R3QPI, -}; - -static struct intel_uncore_type *snbep_pci_uncores[] = { - [SNBEP_PCI_UNCORE_HA] = &snbep_uncore_ha, - [SNBEP_PCI_UNCORE_IMC] = &snbep_uncore_imc, - [SNBEP_PCI_UNCORE_QPI] = &snbep_uncore_qpi, - [SNBEP_PCI_UNCORE_R2PCIE] = &snbep_uncore_r2pcie, - [SNBEP_PCI_UNCORE_R3QPI] = &snbep_uncore_r3qpi, - NULL, -}; - -static const struct pci_device_id snbep_uncore_pci_ids[] = { - { /* Home Agent */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0), - }, - { /* MC Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0), - }, - { /* MC Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1), - }, - { /* MC Channel 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2), - }, - { /* MC Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3), - }, - { /* QPI Port 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0), - }, - { /* QPI Port 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1), - }, - { /* R2PCIe */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0), - }, - { /* R3QPI Link 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0), - }, - { /* R3QPI Link 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1), - }, - { /* QPI Port 0 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - SNBEP_PCI_QPI_PORT0_FILTER), - }, - { /* QPI Port 0 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - SNBEP_PCI_QPI_PORT1_FILTER), - }, - { /* end: all zeroes */ } -}; - -static struct pci_driver snbep_uncore_pci_driver = { - .name = "snbep_uncore", - .id_table = snbep_uncore_pci_ids, -}; - -/* - * build pci bus to socket mapping - */ -static int snbep_pci2phy_map_init(int devid) -{ - struct pci_dev *ubox_dev = NULL; - int i, bus, nodeid, segment; - struct pci2phy_map *map; - int err = 0; - u32 config = 0; - - while (1) { - /* find the UBOX device */ - ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev); - if (!ubox_dev) - break; - bus = ubox_dev->bus->number; - /* get the Node ID of the local register */ - err = pci_read_config_dword(ubox_dev, 0x40, &config); - if (err) - break; - nodeid = config; - /* get the Node ID mapping */ - err = pci_read_config_dword(ubox_dev, 0x54, &config); - if (err) - break; - - segment = pci_domain_nr(ubox_dev->bus); - raw_spin_lock(&pci2phy_map_lock); - map = __find_pci2phy_map(segment); - if (!map) { - raw_spin_unlock(&pci2phy_map_lock); - err = -ENOMEM; - break; - } - - /* - * every three bits in the Node ID mapping register maps - * to a particular node. - */ - for (i = 0; i < 8; i++) { - if (nodeid == ((config >> (3 * i)) & 0x7)) { - map->pbus_to_physid[bus] = i; - break; - } - } - raw_spin_unlock(&pci2phy_map_lock); - } - - if (!err) { - /* - * For PCI bus with no UBOX device, find the next bus - * that has UBOX device and use its mapping. - */ - raw_spin_lock(&pci2phy_map_lock); - list_for_each_entry(map, &pci2phy_map_head, list) { - i = -1; - for (bus = 255; bus >= 0; bus--) { - if (map->pbus_to_physid[bus] >= 0) - i = map->pbus_to_physid[bus]; - else - map->pbus_to_physid[bus] = i; - } - } - raw_spin_unlock(&pci2phy_map_lock); - } - - pci_dev_put(ubox_dev); - - return err ? pcibios_err_to_errno(err) : 0; -} - -int snbep_uncore_pci_init(void) -{ - int ret = snbep_pci2phy_map_init(0x3ce0); - if (ret) - return ret; - uncore_pci_uncores = snbep_pci_uncores; - uncore_pci_driver = &snbep_uncore_pci_driver; - return 0; -} -/* end of Sandy Bridge-EP uncore support */ - -/* IvyTown uncore support */ -static void ivbep_uncore_msr_init_box(struct intel_uncore_box *box) -{ - unsigned msr = uncore_msr_box_ctl(box); - if (msr) - wrmsrl(msr, IVBEP_PMON_BOX_CTL_INT); -} - -static void ivbep_uncore_pci_init_box(struct intel_uncore_box *box) -{ - struct pci_dev *pdev = box->pci_dev; - - pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT); -} - -#define IVBEP_UNCORE_MSR_OPS_COMMON_INIT() \ - .init_box = ivbep_uncore_msr_init_box, \ - .disable_box = snbep_uncore_msr_disable_box, \ - .enable_box = snbep_uncore_msr_enable_box, \ - .disable_event = snbep_uncore_msr_disable_event, \ - .enable_event = snbep_uncore_msr_enable_event, \ - .read_counter = uncore_msr_read_counter - -static struct intel_uncore_ops ivbep_uncore_msr_ops = { - IVBEP_UNCORE_MSR_OPS_COMMON_INIT(), -}; - -static struct intel_uncore_ops ivbep_uncore_pci_ops = { - .init_box = ivbep_uncore_pci_init_box, - .disable_box = snbep_uncore_pci_disable_box, - .enable_box = snbep_uncore_pci_enable_box, - .disable_event = snbep_uncore_pci_disable_event, - .enable_event = snbep_uncore_pci_enable_event, - .read_counter = snbep_uncore_pci_read_counter, -}; - -#define IVBEP_UNCORE_PCI_COMMON_INIT() \ - .perf_ctr = SNBEP_PCI_PMON_CTR0, \ - .event_ctl = SNBEP_PCI_PMON_CTL0, \ - .event_mask = IVBEP_PMON_RAW_EVENT_MASK, \ - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ - .ops = &ivbep_uncore_pci_ops, \ - .format_group = &ivbep_uncore_format_group - -static struct attribute *ivbep_uncore_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - NULL, -}; - -static struct attribute *ivbep_uncore_ubox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh5.attr, - NULL, -}; - -static struct attribute *ivbep_uncore_cbox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_tid_en.attr, - &format_attr_thresh8.attr, - &format_attr_filter_tid.attr, - &format_attr_filter_link.attr, - &format_attr_filter_state2.attr, - &format_attr_filter_nid2.attr, - &format_attr_filter_opc2.attr, - &format_attr_filter_nc.attr, - &format_attr_filter_c6.attr, - &format_attr_filter_isoc.attr, - NULL, -}; - -static struct attribute *ivbep_uncore_pcu_formats_attr[] = { - &format_attr_event_ext.attr, - &format_attr_occ_sel.attr, - &format_attr_edge.attr, - &format_attr_thresh5.attr, - &format_attr_occ_invert.attr, - &format_attr_occ_edge.attr, - &format_attr_filter_band0.attr, - &format_attr_filter_band1.attr, - &format_attr_filter_band2.attr, - &format_attr_filter_band3.attr, - NULL, -}; - -static struct attribute *ivbep_uncore_qpi_formats_attr[] = { - &format_attr_event_ext.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_thresh8.attr, - &format_attr_match_rds.attr, - &format_attr_match_rnid30.attr, - &format_attr_match_rnid4.attr, - &format_attr_match_dnid.attr, - &format_attr_match_mc.attr, - &format_attr_match_opc.attr, - &format_attr_match_vnw.attr, - &format_attr_match0.attr, - &format_attr_match1.attr, - &format_attr_mask_rds.attr, - &format_attr_mask_rnid30.attr, - &format_attr_mask_rnid4.attr, - &format_attr_mask_dnid.attr, - &format_attr_mask_mc.attr, - &format_attr_mask_opc.attr, - &format_attr_mask_vnw.attr, - &format_attr_mask0.attr, - &format_attr_mask1.attr, - NULL, -}; - -static struct attribute_group ivbep_uncore_format_group = { - .name = "format", - .attrs = ivbep_uncore_formats_attr, -}; - -static struct attribute_group ivbep_uncore_ubox_format_group = { - .name = "format", - .attrs = ivbep_uncore_ubox_formats_attr, -}; - -static struct attribute_group ivbep_uncore_cbox_format_group = { - .name = "format", - .attrs = ivbep_uncore_cbox_formats_attr, -}; - -static struct attribute_group ivbep_uncore_pcu_format_group = { - .name = "format", - .attrs = ivbep_uncore_pcu_formats_attr, -}; - -static struct attribute_group ivbep_uncore_qpi_format_group = { - .name = "format", - .attrs = ivbep_uncore_qpi_formats_attr, -}; - -static struct intel_uncore_type ivbep_uncore_ubox = { - .name = "ubox", - .num_counters = 2, - .num_boxes = 1, - .perf_ctr_bits = 44, - .fixed_ctr_bits = 48, - .perf_ctr = SNBEP_U_MSR_PMON_CTR0, - .event_ctl = SNBEP_U_MSR_PMON_CTL0, - .event_mask = IVBEP_U_MSR_PMON_RAW_EVENT_MASK, - .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, - .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, - .ops = &ivbep_uncore_msr_ops, - .format_group = &ivbep_uncore_ubox_format_group, -}; - -static struct extra_reg ivbep_uncore_cbox_extra_regs[] = { - SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, - SNBEP_CBO_PMON_CTL_TID_EN, 0x1), - SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc), - SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc), - SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc), - SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc), - SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc), - SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8), - EVENT_EXTRA_END -}; - -static u64 ivbep_cbox_filter_mask(int fields) -{ - u64 mask = 0; - - if (fields & 0x1) - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_TID; - if (fields & 0x2) - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK; - if (fields & 0x4) - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE; - if (fields & 0x8) - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NID; - if (fields & 0x10) { - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC; - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NC; - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_C6; - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC; - } - - return mask; -} - -static struct event_constraint * -ivbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - return __snbep_cbox_get_constraint(box, event, ivbep_cbox_filter_mask); -} - -static int ivbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct extra_reg *er; - int idx = 0; - - for (er = ivbep_uncore_cbox_extra_regs; er->msr; er++) { - if (er->event != (event->hw.config & er->config_mask)) - continue; - idx |= er->idx; - } - - if (idx) { - reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + - SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; - reg1->config = event->attr.config1 & ivbep_cbox_filter_mask(idx); - reg1->idx = idx; - } - return 0; -} - -static void ivbep_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - - if (reg1->idx != EXTRA_REG_NONE) { - u64 filter = uncore_shared_reg_config(box, 0); - wrmsrl(reg1->reg, filter & 0xffffffff); - wrmsrl(reg1->reg + 6, filter >> 32); - } - - wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); -} - -static struct intel_uncore_ops ivbep_uncore_cbox_ops = { - .init_box = ivbep_uncore_msr_init_box, - .disable_box = snbep_uncore_msr_disable_box, - .enable_box = snbep_uncore_msr_enable_box, - .disable_event = snbep_uncore_msr_disable_event, - .enable_event = ivbep_cbox_enable_event, - .read_counter = uncore_msr_read_counter, - .hw_config = ivbep_cbox_hw_config, - .get_constraint = ivbep_cbox_get_constraint, - .put_constraint = snbep_cbox_put_constraint, -}; - -static struct intel_uncore_type ivbep_uncore_cbox = { - .name = "cbox", - .num_counters = 4, - .num_boxes = 15, - .perf_ctr_bits = 44, - .event_ctl = SNBEP_C0_MSR_PMON_CTL0, - .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, - .event_mask = IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, - .msr_offset = SNBEP_CBO_MSR_OFFSET, - .num_shared_regs = 1, - .constraints = snbep_uncore_cbox_constraints, - .ops = &ivbep_uncore_cbox_ops, - .format_group = &ivbep_uncore_cbox_format_group, -}; - -static struct intel_uncore_ops ivbep_uncore_pcu_ops = { - IVBEP_UNCORE_MSR_OPS_COMMON_INIT(), - .hw_config = snbep_pcu_hw_config, - .get_constraint = snbep_pcu_get_constraint, - .put_constraint = snbep_pcu_put_constraint, -}; - -static struct intel_uncore_type ivbep_uncore_pcu = { - .name = "pcu", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, - .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, - .event_mask = IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &ivbep_uncore_pcu_ops, - .format_group = &ivbep_uncore_pcu_format_group, -}; - -static struct intel_uncore_type *ivbep_msr_uncores[] = { - &ivbep_uncore_ubox, - &ivbep_uncore_cbox, - &ivbep_uncore_pcu, - NULL, -}; - -void ivbep_uncore_cpu_init(void) -{ - if (ivbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - ivbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; - uncore_msr_uncores = ivbep_msr_uncores; -} - -static struct intel_uncore_type ivbep_uncore_ha = { - .name = "ha", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - IVBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type ivbep_uncore_imc = { - .name = "imc", - .num_counters = 4, - .num_boxes = 8, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, - .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, - .event_descs = snbep_uncore_imc_events, - IVBEP_UNCORE_PCI_COMMON_INIT(), -}; - -/* registers in IRP boxes are not properly aligned */ -static unsigned ivbep_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4}; -static unsigned ivbep_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0}; - -static void ivbep_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - - pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx], - hwc->config | SNBEP_PMON_CTL_EN); -} - -static void ivbep_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - - pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx], hwc->config); -} - -static u64 ivbep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - u64 count = 0; - - pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx], (u32 *)&count); - pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1); - - return count; -} - -static struct intel_uncore_ops ivbep_uncore_irp_ops = { - .init_box = ivbep_uncore_pci_init_box, - .disable_box = snbep_uncore_pci_disable_box, - .enable_box = snbep_uncore_pci_enable_box, - .disable_event = ivbep_uncore_irp_disable_event, - .enable_event = ivbep_uncore_irp_enable_event, - .read_counter = ivbep_uncore_irp_read_counter, -}; - -static struct intel_uncore_type ivbep_uncore_irp = { - .name = "irp", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .event_mask = IVBEP_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .ops = &ivbep_uncore_irp_ops, - .format_group = &ivbep_uncore_format_group, -}; - -static struct intel_uncore_ops ivbep_uncore_qpi_ops = { - .init_box = ivbep_uncore_pci_init_box, - .disable_box = snbep_uncore_pci_disable_box, - .enable_box = snbep_uncore_pci_enable_box, - .disable_event = snbep_uncore_pci_disable_event, - .enable_event = snbep_qpi_enable_event, - .read_counter = snbep_uncore_pci_read_counter, - .hw_config = snbep_qpi_hw_config, - .get_constraint = uncore_get_constraint, - .put_constraint = uncore_put_constraint, -}; - -static struct intel_uncore_type ivbep_uncore_qpi = { - .name = "qpi", - .num_counters = 4, - .num_boxes = 3, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCI_PMON_CTR0, - .event_ctl = SNBEP_PCI_PMON_CTL0, - .event_mask = IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &ivbep_uncore_qpi_ops, - .format_group = &ivbep_uncore_qpi_format_group, -}; - -static struct intel_uncore_type ivbep_uncore_r2pcie = { - .name = "r2pcie", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 44, - .constraints = snbep_uncore_r2pcie_constraints, - IVBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type ivbep_uncore_r3qpi = { - .name = "r3qpi", - .num_counters = 3, - .num_boxes = 2, - .perf_ctr_bits = 44, - .constraints = snbep_uncore_r3qpi_constraints, - IVBEP_UNCORE_PCI_COMMON_INIT(), -}; - -enum { - IVBEP_PCI_UNCORE_HA, - IVBEP_PCI_UNCORE_IMC, - IVBEP_PCI_UNCORE_IRP, - IVBEP_PCI_UNCORE_QPI, - IVBEP_PCI_UNCORE_R2PCIE, - IVBEP_PCI_UNCORE_R3QPI, -}; - -static struct intel_uncore_type *ivbep_pci_uncores[] = { - [IVBEP_PCI_UNCORE_HA] = &ivbep_uncore_ha, - [IVBEP_PCI_UNCORE_IMC] = &ivbep_uncore_imc, - [IVBEP_PCI_UNCORE_IRP] = &ivbep_uncore_irp, - [IVBEP_PCI_UNCORE_QPI] = &ivbep_uncore_qpi, - [IVBEP_PCI_UNCORE_R2PCIE] = &ivbep_uncore_r2pcie, - [IVBEP_PCI_UNCORE_R3QPI] = &ivbep_uncore_r3qpi, - NULL, -}; - -static const struct pci_device_id ivbep_uncore_pci_ids[] = { - { /* Home Agent 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 0), - }, - { /* Home Agent 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 1), - }, - { /* MC0 Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 0), - }, - { /* MC0 Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 1), - }, - { /* MC0 Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 2), - }, - { /* MC0 Channel 4 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 3), - }, - { /* MC1 Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 4), - }, - { /* MC1 Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 5), - }, - { /* MC1 Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 6), - }, - { /* MC1 Channel 4 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 7), - }, - { /* IRP */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IRP, 0), - }, - { /* QPI0 Port 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 0), - }, - { /* QPI0 Port 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 1), - }, - { /* QPI1 Port 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 2), - }, - { /* R2PCIe */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R2PCIE, 0), - }, - { /* R3QPI0 Link 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 0), - }, - { /* R3QPI0 Link 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 1), - }, - { /* R3QPI1 Link 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 2), - }, - { /* QPI Port 0 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - SNBEP_PCI_QPI_PORT0_FILTER), - }, - { /* QPI Port 0 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - SNBEP_PCI_QPI_PORT1_FILTER), - }, - { /* end: all zeroes */ } -}; - -static struct pci_driver ivbep_uncore_pci_driver = { - .name = "ivbep_uncore", - .id_table = ivbep_uncore_pci_ids, -}; - -int ivbep_uncore_pci_init(void) -{ - int ret = snbep_pci2phy_map_init(0x0e1e); - if (ret) - return ret; - uncore_pci_uncores = ivbep_pci_uncores; - uncore_pci_driver = &ivbep_uncore_pci_driver; - return 0; -} -/* end of IvyTown uncore support */ - -/* KNL uncore support */ -static struct attribute *knl_uncore_ubox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_tid_en.attr, - &format_attr_inv.attr, - &format_attr_thresh5.attr, - NULL, -}; - -static struct attribute_group knl_uncore_ubox_format_group = { - .name = "format", - .attrs = knl_uncore_ubox_formats_attr, -}; - -static struct intel_uncore_type knl_uncore_ubox = { - .name = "ubox", - .num_counters = 2, - .num_boxes = 1, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .perf_ctr = HSWEP_U_MSR_PMON_CTR0, - .event_ctl = HSWEP_U_MSR_PMON_CTL0, - .event_mask = KNL_U_MSR_PMON_RAW_EVENT_MASK, - .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, - .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, - .ops = &snbep_uncore_msr_ops, - .format_group = &knl_uncore_ubox_format_group, -}; - -static struct attribute *knl_uncore_cha_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_qor.attr, - &format_attr_edge.attr, - &format_attr_tid_en.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - &format_attr_filter_tid4.attr, - &format_attr_filter_link3.attr, - &format_attr_filter_state4.attr, - &format_attr_filter_local.attr, - &format_attr_filter_all_op.attr, - &format_attr_filter_nnm.attr, - &format_attr_filter_opc3.attr, - &format_attr_filter_nc.attr, - &format_attr_filter_isoc.attr, - NULL, -}; - -static struct attribute_group knl_uncore_cha_format_group = { - .name = "format", - .attrs = knl_uncore_cha_formats_attr, -}; - -static struct event_constraint knl_uncore_cha_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x11, 0x1), - UNCORE_EVENT_CONSTRAINT(0x1f, 0x1), - UNCORE_EVENT_CONSTRAINT(0x36, 0x1), - EVENT_CONSTRAINT_END -}; - -static struct extra_reg knl_uncore_cha_extra_regs[] = { - SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, - SNBEP_CBO_PMON_CTL_TID_EN, 0x1), - SNBEP_CBO_EVENT_EXTRA_REG(0x3d, 0xff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x4), - EVENT_EXTRA_END -}; - -static u64 knl_cha_filter_mask(int fields) -{ - u64 mask = 0; - - if (fields & 0x1) - mask |= KNL_CHA_MSR_PMON_BOX_FILTER_TID; - if (fields & 0x2) - mask |= KNL_CHA_MSR_PMON_BOX_FILTER_STATE; - if (fields & 0x4) - mask |= KNL_CHA_MSR_PMON_BOX_FILTER_OP; - return mask; -} - -static struct event_constraint * -knl_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - return __snbep_cbox_get_constraint(box, event, knl_cha_filter_mask); -} - -static int knl_cha_hw_config(struct intel_uncore_box *box, - struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct extra_reg *er; - int idx = 0; - - for (er = knl_uncore_cha_extra_regs; er->msr; er++) { - if (er->event != (event->hw.config & er->config_mask)) - continue; - idx |= er->idx; - } - - if (idx) { - reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 + - KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx; - reg1->config = event->attr.config1 & knl_cha_filter_mask(idx); - reg1->idx = idx; - } - return 0; -} - -static void hswep_cbox_enable_event(struct intel_uncore_box *box, - struct perf_event *event); - -static struct intel_uncore_ops knl_uncore_cha_ops = { - .init_box = snbep_uncore_msr_init_box, - .disable_box = snbep_uncore_msr_disable_box, - .enable_box = snbep_uncore_msr_enable_box, - .disable_event = snbep_uncore_msr_disable_event, - .enable_event = hswep_cbox_enable_event, - .read_counter = uncore_msr_read_counter, - .hw_config = knl_cha_hw_config, - .get_constraint = knl_cha_get_constraint, - .put_constraint = snbep_cbox_put_constraint, -}; - -static struct intel_uncore_type knl_uncore_cha = { - .name = "cha", - .num_counters = 4, - .num_boxes = 38, - .perf_ctr_bits = 48, - .event_ctl = HSWEP_C0_MSR_PMON_CTL0, - .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, - .event_mask = KNL_CHA_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, - .msr_offset = KNL_CHA_MSR_OFFSET, - .num_shared_regs = 1, - .constraints = knl_uncore_cha_constraints, - .ops = &knl_uncore_cha_ops, - .format_group = &knl_uncore_cha_format_group, -}; - -static struct attribute *knl_uncore_pcu_formats_attr[] = { - &format_attr_event2.attr, - &format_attr_use_occ_ctr.attr, - &format_attr_occ_sel.attr, - &format_attr_edge.attr, - &format_attr_tid_en.attr, - &format_attr_inv.attr, - &format_attr_thresh6.attr, - &format_attr_occ_invert.attr, - &format_attr_occ_edge_det.attr, - NULL, -}; - -static struct attribute_group knl_uncore_pcu_format_group = { - .name = "format", - .attrs = knl_uncore_pcu_formats_attr, -}; - -static struct intel_uncore_type knl_uncore_pcu = { - .name = "pcu", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0, - .event_ctl = HSWEP_PCU_MSR_PMON_CTL0, - .event_mask = KNL_PCU_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL, - .ops = &snbep_uncore_msr_ops, - .format_group = &knl_uncore_pcu_format_group, -}; - -static struct intel_uncore_type *knl_msr_uncores[] = { - &knl_uncore_ubox, - &knl_uncore_cha, - &knl_uncore_pcu, - NULL, -}; - -void knl_uncore_cpu_init(void) -{ - uncore_msr_uncores = knl_msr_uncores; -} - -static void knl_uncore_imc_enable_box(struct intel_uncore_box *box) -{ - struct pci_dev *pdev = box->pci_dev; - int box_ctl = uncore_pci_box_ctl(box); - - pci_write_config_dword(pdev, box_ctl, 0); -} - -static void knl_uncore_imc_enable_event(struct intel_uncore_box *box, - struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - - if ((event->attr.config & SNBEP_PMON_CTL_EV_SEL_MASK) - == UNCORE_FIXED_EVENT) - pci_write_config_dword(pdev, hwc->config_base, - hwc->config | KNL_PMON_FIXED_CTL_EN); - else - pci_write_config_dword(pdev, hwc->config_base, - hwc->config | SNBEP_PMON_CTL_EN); -} - -static struct intel_uncore_ops knl_uncore_imc_ops = { - .init_box = snbep_uncore_pci_init_box, - .disable_box = snbep_uncore_pci_disable_box, - .enable_box = knl_uncore_imc_enable_box, - .read_counter = snbep_uncore_pci_read_counter, - .enable_event = knl_uncore_imc_enable_event, - .disable_event = snbep_uncore_pci_disable_event, -}; - -static struct intel_uncore_type knl_uncore_imc_uclk = { - .name = "imc_uclk", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW, - .event_ctl = KNL_UCLK_MSR_PMON_CTL0, - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, - .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW, - .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL, - .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL, - .ops = &knl_uncore_imc_ops, - .format_group = &snbep_uncore_format_group, -}; - -static struct intel_uncore_type knl_uncore_imc_dclk = { - .name = "imc", - .num_counters = 4, - .num_boxes = 6, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .perf_ctr = KNL_MC0_CH0_MSR_PMON_CTR0_LOW, - .event_ctl = KNL_MC0_CH0_MSR_PMON_CTL0, - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, - .fixed_ctr = KNL_MC0_CH0_MSR_PMON_FIXED_LOW, - .fixed_ctl = KNL_MC0_CH0_MSR_PMON_FIXED_CTL, - .box_ctl = KNL_MC0_CH0_MSR_PMON_BOX_CTL, - .ops = &knl_uncore_imc_ops, - .format_group = &snbep_uncore_format_group, -}; - -static struct intel_uncore_type knl_uncore_edc_uclk = { - .name = "edc_uclk", - .num_counters = 4, - .num_boxes = 8, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW, - .event_ctl = KNL_UCLK_MSR_PMON_CTL0, - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, - .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW, - .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL, - .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL, - .ops = &knl_uncore_imc_ops, - .format_group = &snbep_uncore_format_group, -}; - -static struct intel_uncore_type knl_uncore_edc_eclk = { - .name = "edc_eclk", - .num_counters = 4, - .num_boxes = 8, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .perf_ctr = KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW, - .event_ctl = KNL_EDC0_ECLK_MSR_PMON_CTL0, - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, - .fixed_ctr = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW, - .fixed_ctl = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL, - .box_ctl = KNL_EDC0_ECLK_MSR_PMON_BOX_CTL, - .ops = &knl_uncore_imc_ops, - .format_group = &snbep_uncore_format_group, -}; - -static struct event_constraint knl_uncore_m2pcie_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type knl_uncore_m2pcie = { - .name = "m2pcie", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .constraints = knl_uncore_m2pcie_constraints, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct attribute *knl_uncore_irp_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_qor.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - NULL, -}; - -static struct attribute_group knl_uncore_irp_format_group = { - .name = "format", - .attrs = knl_uncore_irp_formats_attr, -}; - -static struct intel_uncore_type knl_uncore_irp = { - .name = "irp", - .num_counters = 2, - .num_boxes = 1, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCI_PMON_CTR0, - .event_ctl = SNBEP_PCI_PMON_CTL0, - .event_mask = KNL_IRP_PCI_PMON_RAW_EVENT_MASK, - .box_ctl = KNL_IRP_PCI_PMON_BOX_CTL, - .ops = &snbep_uncore_pci_ops, - .format_group = &knl_uncore_irp_format_group, -}; - -enum { - KNL_PCI_UNCORE_MC_UCLK, - KNL_PCI_UNCORE_MC_DCLK, - KNL_PCI_UNCORE_EDC_UCLK, - KNL_PCI_UNCORE_EDC_ECLK, - KNL_PCI_UNCORE_M2PCIE, - KNL_PCI_UNCORE_IRP, -}; - -static struct intel_uncore_type *knl_pci_uncores[] = { - [KNL_PCI_UNCORE_MC_UCLK] = &knl_uncore_imc_uclk, - [KNL_PCI_UNCORE_MC_DCLK] = &knl_uncore_imc_dclk, - [KNL_PCI_UNCORE_EDC_UCLK] = &knl_uncore_edc_uclk, - [KNL_PCI_UNCORE_EDC_ECLK] = &knl_uncore_edc_eclk, - [KNL_PCI_UNCORE_M2PCIE] = &knl_uncore_m2pcie, - [KNL_PCI_UNCORE_IRP] = &knl_uncore_irp, - NULL, -}; - -/* - * KNL uses a common PCI device ID for multiple instances of an Uncore PMU - * device type. prior to KNL, each instance of a PMU device type had a unique - * device ID. - * - * PCI Device ID Uncore PMU Devices - * ---------------------------------- - * 0x7841 MC0 UClk, MC1 UClk - * 0x7843 MC0 DClk CH 0, MC0 DClk CH 1, MC0 DClk CH 2, - * MC1 DClk CH 0, MC1 DClk CH 1, MC1 DClk CH 2 - * 0x7833 EDC0 UClk, EDC1 UClk, EDC2 UClk, EDC3 UClk, - * EDC4 UClk, EDC5 UClk, EDC6 UClk, EDC7 UClk - * 0x7835 EDC0 EClk, EDC1 EClk, EDC2 EClk, EDC3 EClk, - * EDC4 EClk, EDC5 EClk, EDC6 EClk, EDC7 EClk - * 0x7817 M2PCIe - * 0x7814 IRP -*/ - -static const struct pci_device_id knl_uncore_pci_ids[] = { - { /* MC UClk */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841), - .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_UCLK, 0), - }, - { /* MC DClk Channel */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843), - .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_DCLK, 0), - }, - { /* EDC UClk */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833), - .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_UCLK, 0), - }, - { /* EDC EClk */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835), - .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_ECLK, 0), - }, - { /* M2PCIe */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7817), - .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_M2PCIE, 0), - }, - { /* IRP */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7814), - .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_IRP, 0), - }, - { /* end: all zeroes */ } -}; - -static struct pci_driver knl_uncore_pci_driver = { - .name = "knl_uncore", - .id_table = knl_uncore_pci_ids, -}; - -int knl_uncore_pci_init(void) -{ - int ret; - - /* All KNL PCI based PMON units are on the same PCI bus except IRP */ - ret = snb_pci2phy_map_init(0x7814); /* IRP */ - if (ret) - return ret; - ret = snb_pci2phy_map_init(0x7817); /* M2PCIe */ - if (ret) - return ret; - uncore_pci_uncores = knl_pci_uncores; - uncore_pci_driver = &knl_uncore_pci_driver; - return 0; -} - -/* end of KNL uncore support */ - -/* Haswell-EP uncore support */ -static struct attribute *hswep_uncore_ubox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh5.attr, - &format_attr_filter_tid2.attr, - &format_attr_filter_cid.attr, - NULL, -}; - -static struct attribute_group hswep_uncore_ubox_format_group = { - .name = "format", - .attrs = hswep_uncore_ubox_formats_attr, -}; - -static int hswep_ubox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - reg1->reg = HSWEP_U_MSR_PMON_FILTER; - reg1->config = event->attr.config1 & HSWEP_U_MSR_PMON_BOX_FILTER_MASK; - reg1->idx = 0; - return 0; -} - -static struct intel_uncore_ops hswep_uncore_ubox_ops = { - SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), - .hw_config = hswep_ubox_hw_config, - .get_constraint = uncore_get_constraint, - .put_constraint = uncore_put_constraint, -}; - -static struct intel_uncore_type hswep_uncore_ubox = { - .name = "ubox", - .num_counters = 2, - .num_boxes = 1, - .perf_ctr_bits = 44, - .fixed_ctr_bits = 48, - .perf_ctr = HSWEP_U_MSR_PMON_CTR0, - .event_ctl = HSWEP_U_MSR_PMON_CTL0, - .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, - .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, - .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, - .num_shared_regs = 1, - .ops = &hswep_uncore_ubox_ops, - .format_group = &hswep_uncore_ubox_format_group, -}; - -static struct attribute *hswep_uncore_cbox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_tid_en.attr, - &format_attr_thresh8.attr, - &format_attr_filter_tid3.attr, - &format_attr_filter_link2.attr, - &format_attr_filter_state3.attr, - &format_attr_filter_nid2.attr, - &format_attr_filter_opc2.attr, - &format_attr_filter_nc.attr, - &format_attr_filter_c6.attr, - &format_attr_filter_isoc.attr, - NULL, -}; - -static struct attribute_group hswep_uncore_cbox_format_group = { - .name = "format", - .attrs = hswep_uncore_cbox_formats_attr, -}; - -static struct event_constraint hswep_uncore_cbox_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x01, 0x1), - UNCORE_EVENT_CONSTRAINT(0x09, 0x1), - UNCORE_EVENT_CONSTRAINT(0x11, 0x1), - UNCORE_EVENT_CONSTRAINT(0x36, 0x1), - UNCORE_EVENT_CONSTRAINT(0x38, 0x3), - UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), - UNCORE_EVENT_CONSTRAINT(0x3e, 0x1), - EVENT_CONSTRAINT_END -}; - -static struct extra_reg hswep_uncore_cbox_extra_regs[] = { - SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, - SNBEP_CBO_PMON_CTL_TID_EN, 0x1), - SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4028, 0x40ff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4032, 0x40ff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4029, 0x40ff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4033, 0x40ff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x402A, 0x40ff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x12), - SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8), - EVENT_EXTRA_END -}; - -static u64 hswep_cbox_filter_mask(int fields) -{ - u64 mask = 0; - if (fields & 0x1) - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_TID; - if (fields & 0x2) - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK; - if (fields & 0x4) - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE; - if (fields & 0x8) - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NID; - if (fields & 0x10) { - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC; - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NC; - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_C6; - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC; - } - return mask; -} - -static struct event_constraint * -hswep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - return __snbep_cbox_get_constraint(box, event, hswep_cbox_filter_mask); -} - -static int hswep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct extra_reg *er; - int idx = 0; - - for (er = hswep_uncore_cbox_extra_regs; er->msr; er++) { - if (er->event != (event->hw.config & er->config_mask)) - continue; - idx |= er->idx; - } - - if (idx) { - reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 + - HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; - reg1->config = event->attr.config1 & hswep_cbox_filter_mask(idx); - reg1->idx = idx; - } - return 0; -} - -static void hswep_cbox_enable_event(struct intel_uncore_box *box, - struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - - if (reg1->idx != EXTRA_REG_NONE) { - u64 filter = uncore_shared_reg_config(box, 0); - wrmsrl(reg1->reg, filter & 0xffffffff); - wrmsrl(reg1->reg + 1, filter >> 32); - } - - wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); -} - -static struct intel_uncore_ops hswep_uncore_cbox_ops = { - .init_box = snbep_uncore_msr_init_box, - .disable_box = snbep_uncore_msr_disable_box, - .enable_box = snbep_uncore_msr_enable_box, - .disable_event = snbep_uncore_msr_disable_event, - .enable_event = hswep_cbox_enable_event, - .read_counter = uncore_msr_read_counter, - .hw_config = hswep_cbox_hw_config, - .get_constraint = hswep_cbox_get_constraint, - .put_constraint = snbep_cbox_put_constraint, -}; - -static struct intel_uncore_type hswep_uncore_cbox = { - .name = "cbox", - .num_counters = 4, - .num_boxes = 18, - .perf_ctr_bits = 48, - .event_ctl = HSWEP_C0_MSR_PMON_CTL0, - .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, - .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, - .msr_offset = HSWEP_CBO_MSR_OFFSET, - .num_shared_regs = 1, - .constraints = hswep_uncore_cbox_constraints, - .ops = &hswep_uncore_cbox_ops, - .format_group = &hswep_uncore_cbox_format_group, -}; - -/* - * Write SBOX Initialization register bit by bit to avoid spurious #GPs - */ -static void hswep_uncore_sbox_msr_init_box(struct intel_uncore_box *box) -{ - unsigned msr = uncore_msr_box_ctl(box); - - if (msr) { - u64 init = SNBEP_PMON_BOX_CTL_INT; - u64 flags = 0; - int i; - - for_each_set_bit(i, (unsigned long *)&init, 64) { - flags |= (1ULL << i); - wrmsrl(msr, flags); - } - } -} - -static struct intel_uncore_ops hswep_uncore_sbox_msr_ops = { - __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), - .init_box = hswep_uncore_sbox_msr_init_box -}; - -static struct attribute *hswep_uncore_sbox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_tid_en.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - NULL, -}; - -static struct attribute_group hswep_uncore_sbox_format_group = { - .name = "format", - .attrs = hswep_uncore_sbox_formats_attr, -}; - -static struct intel_uncore_type hswep_uncore_sbox = { - .name = "sbox", - .num_counters = 4, - .num_boxes = 4, - .perf_ctr_bits = 44, - .event_ctl = HSWEP_S0_MSR_PMON_CTL0, - .perf_ctr = HSWEP_S0_MSR_PMON_CTR0, - .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL, - .msr_offset = HSWEP_SBOX_MSR_OFFSET, - .ops = &hswep_uncore_sbox_msr_ops, - .format_group = &hswep_uncore_sbox_format_group, -}; - -static int hswep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK; - - if (ev_sel >= 0xb && ev_sel <= 0xe) { - reg1->reg = HSWEP_PCU_MSR_PMON_BOX_FILTER; - reg1->idx = ev_sel - 0xb; - reg1->config = event->attr.config1 & (0xff << reg1->idx); - } - return 0; -} - -static struct intel_uncore_ops hswep_uncore_pcu_ops = { - SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), - .hw_config = hswep_pcu_hw_config, - .get_constraint = snbep_pcu_get_constraint, - .put_constraint = snbep_pcu_put_constraint, -}; - -static struct intel_uncore_type hswep_uncore_pcu = { - .name = "pcu", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0, - .event_ctl = HSWEP_PCU_MSR_PMON_CTL0, - .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &hswep_uncore_pcu_ops, - .format_group = &snbep_uncore_pcu_format_group, -}; - -static struct intel_uncore_type *hswep_msr_uncores[] = { - &hswep_uncore_ubox, - &hswep_uncore_cbox, - &hswep_uncore_sbox, - &hswep_uncore_pcu, - NULL, -}; - -void hswep_uncore_cpu_init(void) -{ - if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; - - /* Detect 6-8 core systems with only two SBOXes */ - if (uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3]) { - u32 capid4; - - pci_read_config_dword(uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3], - 0x94, &capid4); - if (((capid4 >> 6) & 0x3) == 0) - hswep_uncore_sbox.num_boxes = 2; - } - - uncore_msr_uncores = hswep_msr_uncores; -} - -static struct intel_uncore_type hswep_uncore_ha = { - .name = "ha", - .num_counters = 5, - .num_boxes = 2, - .perf_ctr_bits = 48, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct uncore_event_desc hswep_uncore_imc_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x00,umask=0x00"), - INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), - INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), - INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"), - { /* end: all zeroes */ }, -}; - -static struct intel_uncore_type hswep_uncore_imc = { - .name = "imc", - .num_counters = 5, - .num_boxes = 8, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, - .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, - .event_descs = hswep_uncore_imc_events, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static unsigned hswep_uncore_irp_ctrs[] = {0xa0, 0xa8, 0xb0, 0xb8}; - -static u64 hswep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - u64 count = 0; - - pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx], (u32 *)&count); - pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1); - - return count; -} - -static struct intel_uncore_ops hswep_uncore_irp_ops = { - .init_box = snbep_uncore_pci_init_box, - .disable_box = snbep_uncore_pci_disable_box, - .enable_box = snbep_uncore_pci_enable_box, - .disable_event = ivbep_uncore_irp_disable_event, - .enable_event = ivbep_uncore_irp_enable_event, - .read_counter = hswep_uncore_irp_read_counter, -}; - -static struct intel_uncore_type hswep_uncore_irp = { - .name = "irp", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .ops = &hswep_uncore_irp_ops, - .format_group = &snbep_uncore_format_group, -}; - -static struct intel_uncore_type hswep_uncore_qpi = { - .name = "qpi", - .num_counters = 5, - .num_boxes = 3, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCI_PMON_CTR0, - .event_ctl = SNBEP_PCI_PMON_CTL0, - .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &snbep_uncore_qpi_ops, - .format_group = &snbep_uncore_qpi_format_group, -}; - -static struct event_constraint hswep_uncore_r2pcie_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), - UNCORE_EVENT_CONSTRAINT(0x13, 0x1), - UNCORE_EVENT_CONSTRAINT(0x23, 0x1), - UNCORE_EVENT_CONSTRAINT(0x24, 0x1), - UNCORE_EVENT_CONSTRAINT(0x25, 0x1), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x27, 0x1), - UNCORE_EVENT_CONSTRAINT(0x28, 0x3), - UNCORE_EVENT_CONSTRAINT(0x29, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2a, 0x1), - UNCORE_EVENT_CONSTRAINT(0x2b, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - UNCORE_EVENT_CONSTRAINT(0x35, 0x3), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type hswep_uncore_r2pcie = { - .name = "r2pcie", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .constraints = hswep_uncore_r2pcie_constraints, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct event_constraint hswep_uncore_r3qpi_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x01, 0x3), - UNCORE_EVENT_CONSTRAINT(0x07, 0x7), - UNCORE_EVENT_CONSTRAINT(0x08, 0x7), - UNCORE_EVENT_CONSTRAINT(0x09, 0x7), - UNCORE_EVENT_CONSTRAINT(0x0a, 0x7), - UNCORE_EVENT_CONSTRAINT(0x0e, 0x7), - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), - UNCORE_EVENT_CONSTRAINT(0x12, 0x3), - UNCORE_EVENT_CONSTRAINT(0x13, 0x1), - UNCORE_EVENT_CONSTRAINT(0x14, 0x3), - UNCORE_EVENT_CONSTRAINT(0x15, 0x3), - UNCORE_EVENT_CONSTRAINT(0x1f, 0x3), - UNCORE_EVENT_CONSTRAINT(0x20, 0x3), - UNCORE_EVENT_CONSTRAINT(0x21, 0x3), - UNCORE_EVENT_CONSTRAINT(0x22, 0x3), - UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - UNCORE_EVENT_CONSTRAINT(0x25, 0x3), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x28, 0x3), - UNCORE_EVENT_CONSTRAINT(0x29, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), - UNCORE_EVENT_CONSTRAINT(0x31, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - UNCORE_EVENT_CONSTRAINT(0x36, 0x3), - UNCORE_EVENT_CONSTRAINT(0x37, 0x3), - UNCORE_EVENT_CONSTRAINT(0x38, 0x3), - UNCORE_EVENT_CONSTRAINT(0x39, 0x3), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type hswep_uncore_r3qpi = { - .name = "r3qpi", - .num_counters = 4, - .num_boxes = 3, - .perf_ctr_bits = 44, - .constraints = hswep_uncore_r3qpi_constraints, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -enum { - HSWEP_PCI_UNCORE_HA, - HSWEP_PCI_UNCORE_IMC, - HSWEP_PCI_UNCORE_IRP, - HSWEP_PCI_UNCORE_QPI, - HSWEP_PCI_UNCORE_R2PCIE, - HSWEP_PCI_UNCORE_R3QPI, -}; - -static struct intel_uncore_type *hswep_pci_uncores[] = { - [HSWEP_PCI_UNCORE_HA] = &hswep_uncore_ha, - [HSWEP_PCI_UNCORE_IMC] = &hswep_uncore_imc, - [HSWEP_PCI_UNCORE_IRP] = &hswep_uncore_irp, - [HSWEP_PCI_UNCORE_QPI] = &hswep_uncore_qpi, - [HSWEP_PCI_UNCORE_R2PCIE] = &hswep_uncore_r2pcie, - [HSWEP_PCI_UNCORE_R3QPI] = &hswep_uncore_r3qpi, - NULL, -}; - -static const struct pci_device_id hswep_uncore_pci_ids[] = { - { /* Home Agent 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0), - }, - { /* Home Agent 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f38), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 1), - }, - { /* MC0 Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb0), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 0), - }, - { /* MC0 Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb1), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 1), - }, - { /* MC0 Channel 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb4), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 2), - }, - { /* MC0 Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb5), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 3), - }, - { /* MC1 Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd0), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 4), - }, - { /* MC1 Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd1), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 5), - }, - { /* MC1 Channel 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd4), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 6), - }, - { /* MC1 Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd5), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 7), - }, - { /* IRP */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f39), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IRP, 0), - }, - { /* QPI0 Port 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f32), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 0), - }, - { /* QPI0 Port 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f33), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 1), - }, - { /* QPI1 Port 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3a), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 2), - }, - { /* R2PCIe */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f34), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R2PCIE, 0), - }, - { /* R3QPI0 Link 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f36), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 0), - }, - { /* R3QPI0 Link 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f37), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 1), - }, - { /* R3QPI1 Link 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3e), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 2), - }, - { /* QPI Port 0 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f86), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - SNBEP_PCI_QPI_PORT0_FILTER), - }, - { /* QPI Port 1 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f96), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - SNBEP_PCI_QPI_PORT1_FILTER), - }, - { /* PCU.3 (for Capability registers) */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - HSWEP_PCI_PCU_3), - }, - { /* end: all zeroes */ } -}; - -static struct pci_driver hswep_uncore_pci_driver = { - .name = "hswep_uncore", - .id_table = hswep_uncore_pci_ids, -}; - -int hswep_uncore_pci_init(void) -{ - int ret = snbep_pci2phy_map_init(0x2f1e); - if (ret) - return ret; - uncore_pci_uncores = hswep_pci_uncores; - uncore_pci_driver = &hswep_uncore_pci_driver; - return 0; -} -/* end of Haswell-EP uncore support */ - -/* BDX uncore support */ - -static struct intel_uncore_type bdx_uncore_ubox = { - .name = "ubox", - .num_counters = 2, - .num_boxes = 1, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .perf_ctr = HSWEP_U_MSR_PMON_CTR0, - .event_ctl = HSWEP_U_MSR_PMON_CTL0, - .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, - .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, - .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, - .num_shared_regs = 1, - .ops = &ivbep_uncore_msr_ops, - .format_group = &ivbep_uncore_ubox_format_group, -}; - -static struct event_constraint bdx_uncore_cbox_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x09, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x1), - UNCORE_EVENT_CONSTRAINT(0x36, 0x1), - UNCORE_EVENT_CONSTRAINT(0x3e, 0x1), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type bdx_uncore_cbox = { - .name = "cbox", - .num_counters = 4, - .num_boxes = 24, - .perf_ctr_bits = 48, - .event_ctl = HSWEP_C0_MSR_PMON_CTL0, - .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, - .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, - .msr_offset = HSWEP_CBO_MSR_OFFSET, - .num_shared_regs = 1, - .constraints = bdx_uncore_cbox_constraints, - .ops = &hswep_uncore_cbox_ops, - .format_group = &hswep_uncore_cbox_format_group, -}; - -static struct intel_uncore_type bdx_uncore_sbox = { - .name = "sbox", - .num_counters = 4, - .num_boxes = 4, - .perf_ctr_bits = 48, - .event_ctl = HSWEP_S0_MSR_PMON_CTL0, - .perf_ctr = HSWEP_S0_MSR_PMON_CTR0, - .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL, - .msr_offset = HSWEP_SBOX_MSR_OFFSET, - .ops = &hswep_uncore_sbox_msr_ops, - .format_group = &hswep_uncore_sbox_format_group, -}; - -static struct intel_uncore_type *bdx_msr_uncores[] = { - &bdx_uncore_ubox, - &bdx_uncore_cbox, - &bdx_uncore_sbox, - &hswep_uncore_pcu, - NULL, -}; - -void bdx_uncore_cpu_init(void) -{ - if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; - uncore_msr_uncores = bdx_msr_uncores; -} - -static struct intel_uncore_type bdx_uncore_ha = { - .name = "ha", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type bdx_uncore_imc = { - .name = "imc", - .num_counters = 5, - .num_boxes = 8, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, - .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, - .event_descs = hswep_uncore_imc_events, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type bdx_uncore_irp = { - .name = "irp", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .ops = &hswep_uncore_irp_ops, - .format_group = &snbep_uncore_format_group, -}; - -static struct intel_uncore_type bdx_uncore_qpi = { - .name = "qpi", - .num_counters = 4, - .num_boxes = 3, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCI_PMON_CTR0, - .event_ctl = SNBEP_PCI_PMON_CTL0, - .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &snbep_uncore_qpi_ops, - .format_group = &snbep_uncore_qpi_format_group, -}; - -static struct event_constraint bdx_uncore_r2pcie_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), - UNCORE_EVENT_CONSTRAINT(0x13, 0x1), - UNCORE_EVENT_CONSTRAINT(0x23, 0x1), - UNCORE_EVENT_CONSTRAINT(0x25, 0x1), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x28, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type bdx_uncore_r2pcie = { - .name = "r2pcie", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .constraints = bdx_uncore_r2pcie_constraints, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct event_constraint bdx_uncore_r3qpi_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x01, 0x7), - UNCORE_EVENT_CONSTRAINT(0x07, 0x7), - UNCORE_EVENT_CONSTRAINT(0x08, 0x7), - UNCORE_EVENT_CONSTRAINT(0x09, 0x7), - UNCORE_EVENT_CONSTRAINT(0x0a, 0x7), - UNCORE_EVENT_CONSTRAINT(0x0e, 0x7), - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), - UNCORE_EVENT_CONSTRAINT(0x13, 0x1), - UNCORE_EVENT_CONSTRAINT(0x14, 0x3), - UNCORE_EVENT_CONSTRAINT(0x15, 0x3), - UNCORE_EVENT_CONSTRAINT(0x1f, 0x3), - UNCORE_EVENT_CONSTRAINT(0x20, 0x3), - UNCORE_EVENT_CONSTRAINT(0x21, 0x3), - UNCORE_EVENT_CONSTRAINT(0x22, 0x3), - UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - UNCORE_EVENT_CONSTRAINT(0x25, 0x3), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x28, 0x3), - UNCORE_EVENT_CONSTRAINT(0x29, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - UNCORE_EVENT_CONSTRAINT(0x36, 0x3), - UNCORE_EVENT_CONSTRAINT(0x37, 0x3), - UNCORE_EVENT_CONSTRAINT(0x38, 0x3), - UNCORE_EVENT_CONSTRAINT(0x39, 0x3), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type bdx_uncore_r3qpi = { - .name = "r3qpi", - .num_counters = 3, - .num_boxes = 3, - .perf_ctr_bits = 48, - .constraints = bdx_uncore_r3qpi_constraints, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -enum { - BDX_PCI_UNCORE_HA, - BDX_PCI_UNCORE_IMC, - BDX_PCI_UNCORE_IRP, - BDX_PCI_UNCORE_QPI, - BDX_PCI_UNCORE_R2PCIE, - BDX_PCI_UNCORE_R3QPI, -}; - -static struct intel_uncore_type *bdx_pci_uncores[] = { - [BDX_PCI_UNCORE_HA] = &bdx_uncore_ha, - [BDX_PCI_UNCORE_IMC] = &bdx_uncore_imc, - [BDX_PCI_UNCORE_IRP] = &bdx_uncore_irp, - [BDX_PCI_UNCORE_QPI] = &bdx_uncore_qpi, - [BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie, - [BDX_PCI_UNCORE_R3QPI] = &bdx_uncore_r3qpi, - NULL, -}; - -static const struct pci_device_id bdx_uncore_pci_ids[] = { - { /* Home Agent 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0), - }, - { /* Home Agent 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f38), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 1), - }, - { /* MC0 Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0), - }, - { /* MC0 Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1), - }, - { /* MC0 Channel 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb4), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 2), - }, - { /* MC0 Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb5), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 3), - }, - { /* MC1 Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd0), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 4), - }, - { /* MC1 Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd1), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 5), - }, - { /* MC1 Channel 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd4), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 6), - }, - { /* MC1 Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd5), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 7), - }, - { /* IRP */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0), - }, - { /* QPI0 Port 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f32), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 0), - }, - { /* QPI0 Port 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f33), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 1), - }, - { /* QPI1 Port 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3a), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 2), - }, - { /* R2PCIe */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0), - }, - { /* R3QPI0 Link 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f36), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 0), - }, - { /* R3QPI0 Link 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f37), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 1), - }, - { /* R3QPI1 Link 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3e), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 2), - }, - { /* QPI Port 0 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 0), - }, - { /* QPI Port 1 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 1), - }, - { /* QPI Port 2 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2), - }, - { /* end: all zeroes */ } -}; - -static struct pci_driver bdx_uncore_pci_driver = { - .name = "bdx_uncore", - .id_table = bdx_uncore_pci_ids, -}; - -int bdx_uncore_pci_init(void) -{ - int ret = snbep_pci2phy_map_init(0x6f1e); - - if (ret) - return ret; - uncore_pci_uncores = bdx_pci_uncores; - uncore_pci_driver = &bdx_uncore_pci_driver; - return 0; -} - -/* end of BDX uncore support */ diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c deleted file mode 100644 index 5b0c232d1ee6..000000000000 --- a/arch/x86/kernel/cpu/perf_event_knc.c +++ /dev/null @@ -1,319 +0,0 @@ -/* Driver for Intel Xeon Phi "Knights Corner" PMU */ - -#include <linux/perf_event.h> -#include <linux/types.h> - -#include <asm/hardirq.h> - -#include "perf_event.h" - -static const u64 knc_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x002a, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x0016, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0028, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0029, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0012, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x002b, -}; - -static const u64 __initconst knc_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - /* On Xeon Phi event "0" is a valid DATA_READ */ - /* (L1 Data Cache Reads) Instruction. */ - /* We code this as ARCH_PERFMON_EVENTSEL_INT as this */ - /* bit will always be set in x86_pmu_hw_config(). */ - [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT, - /* DATA_READ */ - [ C(RESULT_MISS) ] = 0x0003, /* DATA_READ_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */ - [ C(RESULT_MISS) ] = 0x0004, /* DATA_WRITE_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0011, /* L1_DATA_PF1 */ - [ C(RESULT_MISS) ] = 0x001c, /* L1_DATA_PF1_MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */ - [ C(RESULT_MISS) ] = 0x000e, /* CODE_CACHE_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0x10cb, /* L2_READ_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x10cc, /* L2_WRITE_HIT */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x10fc, /* L2_DATA_PF2 */ - [ C(RESULT_MISS) ] = 0x10fe, /* L2_DATA_PF2_MISS */ - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT, - /* DATA_READ */ - /* see note on L1 OP_READ */ - [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */ - [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */ - [ C(RESULT_MISS) ] = 0x000d, /* CODE_PAGE_WALK */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0012, /* BRANCHES */ - [ C(RESULT_MISS) ] = 0x002b, /* BRANCHES_MISPREDICTED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - - -static u64 knc_pmu_event_map(int hw_event) -{ - return knc_perfmon_event_map[hw_event]; -} - -static struct event_constraint knc_event_constraints[] = -{ - INTEL_EVENT_CONSTRAINT(0xc3, 0x1), /* HWP_L2HIT */ - INTEL_EVENT_CONSTRAINT(0xc4, 0x1), /* HWP_L2MISS */ - INTEL_EVENT_CONSTRAINT(0xc8, 0x1), /* L2_READ_HIT_E */ - INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* L2_READ_HIT_M */ - INTEL_EVENT_CONSTRAINT(0xca, 0x1), /* L2_READ_HIT_S */ - INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* L2_READ_MISS */ - INTEL_EVENT_CONSTRAINT(0xcc, 0x1), /* L2_WRITE_HIT */ - INTEL_EVENT_CONSTRAINT(0xce, 0x1), /* L2_STRONGLY_ORDERED_STREAMING_VSTORES_MISS */ - INTEL_EVENT_CONSTRAINT(0xcf, 0x1), /* L2_WEAKLY_ORDERED_STREAMING_VSTORE_MISS */ - INTEL_EVENT_CONSTRAINT(0xd7, 0x1), /* L2_VICTIM_REQ_WITH_DATA */ - INTEL_EVENT_CONSTRAINT(0xe3, 0x1), /* SNP_HITM_BUNIT */ - INTEL_EVENT_CONSTRAINT(0xe6, 0x1), /* SNP_HIT_L2 */ - INTEL_EVENT_CONSTRAINT(0xe7, 0x1), /* SNP_HITM_L2 */ - INTEL_EVENT_CONSTRAINT(0xf1, 0x1), /* L2_DATA_READ_MISS_CACHE_FILL */ - INTEL_EVENT_CONSTRAINT(0xf2, 0x1), /* L2_DATA_WRITE_MISS_CACHE_FILL */ - INTEL_EVENT_CONSTRAINT(0xf6, 0x1), /* L2_DATA_READ_MISS_MEM_FILL */ - INTEL_EVENT_CONSTRAINT(0xf7, 0x1), /* L2_DATA_WRITE_MISS_MEM_FILL */ - INTEL_EVENT_CONSTRAINT(0xfc, 0x1), /* L2_DATA_PF2 */ - INTEL_EVENT_CONSTRAINT(0xfd, 0x1), /* L2_DATA_PF2_DROP */ - INTEL_EVENT_CONSTRAINT(0xfe, 0x1), /* L2_DATA_PF2_MISS */ - INTEL_EVENT_CONSTRAINT(0xff, 0x1), /* L2_DATA_HIT_INFLIGHT_PF2 */ - EVENT_CONSTRAINT_END -}; - -#define MSR_KNC_IA32_PERF_GLOBAL_STATUS 0x0000002d -#define MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL 0x0000002e -#define MSR_KNC_IA32_PERF_GLOBAL_CTRL 0x0000002f - -#define KNC_ENABLE_COUNTER0 0x00000001 -#define KNC_ENABLE_COUNTER1 0x00000002 - -static void knc_pmu_disable_all(void) -{ - u64 val; - - rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); - val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1); - wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); -} - -static void knc_pmu_enable_all(int added) -{ - u64 val; - - rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); - val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1); - wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); -} - -static inline void -knc_pmu_disable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 val; - - val = hwc->config; - val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - - (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); -} - -static void knc_pmu_enable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 val; - - val = hwc->config; - val |= ARCH_PERFMON_EVENTSEL_ENABLE; - - (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); -} - -static inline u64 knc_pmu_get_status(void) -{ - u64 status; - - rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status); - - return status; -} - -static inline void knc_pmu_ack_status(u64 ack) -{ - wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack); -} - -static int knc_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - int handled = 0; - int bit, loops; - u64 status; - - cpuc = this_cpu_ptr(&cpu_hw_events); - - knc_pmu_disable_all(); - - status = knc_pmu_get_status(); - if (!status) { - knc_pmu_enable_all(0); - return handled; - } - - loops = 0; -again: - knc_pmu_ack_status(status); - if (++loops > 100) { - WARN_ONCE(1, "perf: irq loop stuck!\n"); - perf_event_print_debug(); - goto done; - } - - inc_irq_stat(apic_perf_irqs); - - for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { - struct perf_event *event = cpuc->events[bit]; - - handled++; - - if (!test_bit(bit, cpuc->active_mask)) - continue; - - if (!intel_pmu_save_and_restart(event)) - continue; - - perf_sample_data_init(&data, 0, event->hw.last_period); - - if (perf_event_overflow(event, &data, regs)) - x86_pmu_stop(event, 0); - } - - /* - * Repeat if there is more work to be done: - */ - status = knc_pmu_get_status(); - if (status) - goto again; - -done: - knc_pmu_enable_all(0); - - return handled; -} - - -PMU_FORMAT_ATTR(event, "config:0-7" ); -PMU_FORMAT_ATTR(umask, "config:8-15" ); -PMU_FORMAT_ATTR(edge, "config:18" ); -PMU_FORMAT_ATTR(inv, "config:23" ); -PMU_FORMAT_ATTR(cmask, "config:24-31" ); - -static struct attribute *intel_knc_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_cmask.attr, - NULL, -}; - -static const struct x86_pmu knc_pmu __initconst = { - .name = "knc", - .handle_irq = knc_pmu_handle_irq, - .disable_all = knc_pmu_disable_all, - .enable_all = knc_pmu_enable_all, - .enable = knc_pmu_enable_event, - .disable = knc_pmu_disable_event, - .hw_config = x86_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_KNC_EVNTSEL0, - .perfctr = MSR_KNC_PERFCTR0, - .event_map = knc_pmu_event_map, - .max_events = ARRAY_SIZE(knc_perfmon_event_map), - .apic = 1, - .max_period = (1ULL << 39) - 1, - .version = 0, - .num_counters = 2, - .cntval_bits = 40, - .cntval_mask = (1ULL << 40) - 1, - .get_event_constraints = x86_get_event_constraints, - .event_constraints = knc_event_constraints, - .format_attrs = intel_knc_formats_attr, -}; - -__init int knc_pmu_init(void) -{ - x86_pmu = knc_pmu; - - memcpy(hw_cache_event_ids, knc_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - return 0; -} diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c deleted file mode 100644 index ec863b9a9f78..000000000000 --- a/arch/x86/kernel/cpu/perf_event_msr.c +++ /dev/null @@ -1,241 +0,0 @@ -#include <linux/perf_event.h> - -enum perf_msr_id { - PERF_MSR_TSC = 0, - PERF_MSR_APERF = 1, - PERF_MSR_MPERF = 2, - PERF_MSR_PPERF = 3, - PERF_MSR_SMI = 4, - - PERF_MSR_EVENT_MAX, -}; - -static bool test_aperfmperf(int idx) -{ - return boot_cpu_has(X86_FEATURE_APERFMPERF); -} - -static bool test_intel(int idx) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 != 6) - return false; - - switch (boot_cpu_data.x86_model) { - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ - - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ - - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm SandyBridge-E/EN/EP */ - - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ - - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 69: /* 22nm Haswell ULT */ - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ - - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 79: /* 14nm Broadwell Server */ - - case 55: /* 22nm Atom "Silvermont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ - case 76: /* 14nm Atom "Airmont" */ - if (idx == PERF_MSR_SMI) - return true; - break; - - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ - if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) - return true; - break; - } - - return false; -} - -struct perf_msr { - u64 msr; - struct perf_pmu_events_attr *attr; - bool (*test)(int idx); -}; - -PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00"); -PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01"); -PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02"); -PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03"); -PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04"); - -static struct perf_msr msr[] = { - [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, - [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, }, - [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, }, - [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, }, - [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, -}; - -static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = { - NULL, -}; - -static struct attribute_group events_attr_group = { - .name = "events", - .attrs = events_attrs, -}; - -PMU_FORMAT_ATTR(event, "config:0-63"); -static struct attribute *format_attrs[] = { - &format_attr_event.attr, - NULL, -}; -static struct attribute_group format_attr_group = { - .name = "format", - .attrs = format_attrs, -}; - -static const struct attribute_group *attr_groups[] = { - &events_attr_group, - &format_attr_group, - NULL, -}; - -static int msr_event_init(struct perf_event *event) -{ - u64 cfg = event->attr.config; - - if (event->attr.type != event->pmu->type) - return -ENOENT; - - if (cfg >= PERF_MSR_EVENT_MAX) - return -EINVAL; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - if (!msr[cfg].attr) - return -EINVAL; - - event->hw.idx = -1; - event->hw.event_base = msr[cfg].msr; - event->hw.config = cfg; - - return 0; -} - -static inline u64 msr_read_counter(struct perf_event *event) -{ - u64 now; - - if (event->hw.event_base) - rdmsrl(event->hw.event_base, now); - else - rdtscll(now); - - return now; -} -static void msr_event_update(struct perf_event *event) -{ - u64 prev, now; - s64 delta; - - /* Careful, an NMI might modify the previous event value. */ -again: - prev = local64_read(&event->hw.prev_count); - now = msr_read_counter(event); - - if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev) - goto again; - - delta = now - prev; - if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) - delta = sign_extend64(delta, 31); - - local64_add(now - prev, &event->count); -} - -static void msr_event_start(struct perf_event *event, int flags) -{ - u64 now; - - now = msr_read_counter(event); - local64_set(&event->hw.prev_count, now); -} - -static void msr_event_stop(struct perf_event *event, int flags) -{ - msr_event_update(event); -} - -static void msr_event_del(struct perf_event *event, int flags) -{ - msr_event_stop(event, PERF_EF_UPDATE); -} - -static int msr_event_add(struct perf_event *event, int flags) -{ - if (flags & PERF_EF_START) - msr_event_start(event, flags); - - return 0; -} - -static struct pmu pmu_msr = { - .task_ctx_nr = perf_sw_context, - .attr_groups = attr_groups, - .event_init = msr_event_init, - .add = msr_event_add, - .del = msr_event_del, - .start = msr_event_start, - .stop = msr_event_stop, - .read = msr_event_update, - .capabilities = PERF_PMU_CAP_NO_INTERRUPT, -}; - -static int __init msr_init(void) -{ - int i, j = 0; - - if (!boot_cpu_has(X86_FEATURE_TSC)) { - pr_cont("no MSR PMU driver.\n"); - return 0; - } - - /* Probe the MSRs. */ - for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) { - u64 val; - - /* - * Virt sucks arse; you cannot tell if a R/O MSR is present :/ - */ - if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) - msr[i].attr = NULL; - } - - /* List remaining MSRs in the sysfs attrs. */ - for (i = 0; i < PERF_MSR_EVENT_MAX; i++) { - if (msr[i].attr) - events_attrs[j++] = &msr[i].attr->attr.attr; - } - events_attrs[j] = NULL; - - perf_pmu_register(&pmu_msr, "msr", -1); - - return 0; -} -device_initcall(msr_init); diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c deleted file mode 100644 index f2e56783af3d..000000000000 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ /dev/null @@ -1,1376 +0,0 @@ -/* - * Netburst Performance Events (P4, old Xeon) - * - * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org> - * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com> - * - * For licencing details see kernel-base/COPYING - */ - -#include <linux/perf_event.h> - -#include <asm/perf_event_p4.h> -#include <asm/hardirq.h> -#include <asm/apic.h> - -#include "perf_event.h" - -#define P4_CNTR_LIMIT 3 -/* - * array indices: 0,1 - HT threads, used with HT enabled cpu - */ -struct p4_event_bind { - unsigned int opcode; /* Event code and ESCR selector */ - unsigned int escr_msr[2]; /* ESCR MSR for this event */ - unsigned int escr_emask; /* valid ESCR EventMask bits */ - unsigned int shared; /* event is shared across threads */ - char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ -}; - -struct p4_pebs_bind { - unsigned int metric_pebs; - unsigned int metric_vert; -}; - -/* it sets P4_PEBS_ENABLE_UOP_TAG as well */ -#define P4_GEN_PEBS_BIND(name, pebs, vert) \ - [P4_PEBS_METRIC__##name] = { \ - .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \ - .metric_vert = vert, \ - } - -/* - * note we have P4_PEBS_ENABLE_UOP_TAG always set here - * - * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of - * event configuration to find out which values are to be - * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT - * resgisters - */ -static struct p4_pebs_bind p4_pebs_bind_map[] = { - P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001), - P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001), - P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001), - P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002), - P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003), - P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010), - P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001), - P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001), - P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002), -}; - -/* - * Note that we don't use CCCR1 here, there is an - * exception for P4_BSQ_ALLOCATION but we just have - * no workaround - * - * consider this binding as resources which particular - * event may borrow, it doesn't contain EventMask, - * Tags and friends -- they are left to a caller - */ -static struct p4_event_bind p4_event_bind_map[] = { - [P4_EVENT_TC_DELIVER_MODE] = { - .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE), - .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) | - P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB) | - P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI) | - P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD) | - P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB) | - P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI) | - P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID), - .shared = 1, - .cntr = { {4, 5, -1}, {6, 7, -1} }, - }, - [P4_EVENT_BPU_FETCH_REQUEST] = { - .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST), - .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS), - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_ITLB_REFERENCE] = { - .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE), - .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT) | - P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS) | - P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK), - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_MEMORY_CANCEL] = { - .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL), - .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL) | - P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF), - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_MEMORY_COMPLETE] = { - .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE), - .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC) | - P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC), - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_LOAD_PORT_REPLAY] = { - .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY), - .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD), - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_STORE_PORT_REPLAY] = { - .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY), - .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST), - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_MOB_LOAD_REPLAY] = { - .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY), - .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA) | - P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD) | - P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA) | - P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR), - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_PAGE_WALK_TYPE] = { - .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE), - .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS) | - P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS), - .shared = 1, - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_BSQ_CACHE_REFERENCE] = { - .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE), - .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS), - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_IOQ_ALLOCATION] = { - .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION), - .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH), - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */ - .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES), - .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH), - .cntr = { {2, -1, -1}, {3, -1, -1} }, - }, - [P4_EVENT_FSB_DATA_ACTIVITY] = { - .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY), - .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) | - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN) | - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER) | - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV) | - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN) | - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER), - .shared = 1, - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */ - .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION), - .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2), - .cntr = { {0, -1, -1}, {1, -1, -1} }, - }, - [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */ - .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES), - .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2), - .cntr = { {2, -1, -1}, {3, -1, -1} }, - }, - [P4_EVENT_SSE_INPUT_ASSIST] = { - .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_PACKED_SP_UOP] = { - .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_PACKED_DP_UOP] = { - .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_SCALAR_SP_UOP] = { - .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_SCALAR_DP_UOP] = { - .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_64BIT_MMX_UOP] = { - .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_128BIT_MMX_UOP] = { - .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_X87_FP_UOP] = { - .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_TC_MISC] = { - .opcode = P4_OPCODE(P4_EVENT_TC_MISC), - .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH), - .cntr = { {4, 5, -1}, {6, 7, -1} }, - }, - [P4_EVENT_GLOBAL_POWER_EVENTS] = { - .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS), - .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING), - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_TC_MS_XFER] = { - .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER), - .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC), - .cntr = { {4, 5, -1}, {6, 7, -1} }, - }, - [P4_EVENT_UOP_QUEUE_WRITES] = { - .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES), - .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD) | - P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER) | - P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM), - .cntr = { {4, 5, -1}, {6, 7, -1} }, - }, - [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = { - .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE), - .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT), - .cntr = { {4, 5, -1}, {6, 7, -1} }, - }, - [P4_EVENT_RETIRED_BRANCH_TYPE] = { - .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE), - .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT), - .cntr = { {4, 5, -1}, {6, 7, -1} }, - }, - [P4_EVENT_RESOURCE_STALL] = { - .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL), - .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_WC_BUFFER] = { - .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER), - .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS) | - P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_B2B_CYCLES] = { - .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES), - .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, - .escr_emask = 0, - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_BNR] = { - .opcode = P4_OPCODE(P4_EVENT_BNR), - .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, - .escr_emask = 0, - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_SNOOP] = { - .opcode = P4_OPCODE(P4_EVENT_SNOOP), - .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, - .escr_emask = 0, - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_RESPONSE] = { - .opcode = P4_OPCODE(P4_EVENT_RESPONSE), - .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, - .escr_emask = 0, - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_FRONT_END_EVENT] = { - .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT), - .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS) | - P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_EXECUTION_EVENT] = { - .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT), - .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_REPLAY_EVENT] = { - .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT), - .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS) | - P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_INSTR_RETIRED] = { - .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED), - .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) | - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG) | - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG) | - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_UOPS_RETIRED] = { - .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED), - .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS) | - P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_UOP_TYPE] = { - .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE), - .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS) | - P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_BRANCH_RETIRED] = { - .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED), - .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP) | - P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM) | - P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP) | - P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_MISPRED_BRANCH_RETIRED] = { - .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), - .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_X87_ASSIST] = { - .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST), - .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU) | - P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO) | - P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO) | - P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU) | - P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_MACHINE_CLEAR] = { - .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR), - .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR) | - P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR) | - P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_INSTR_COMPLETED] = { - .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED), - .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS) | - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, -}; - -#define P4_GEN_CACHE_EVENT(event, bit, metric) \ - p4_config_pack_escr(P4_ESCR_EVENT(event) | \ - P4_ESCR_EMASK_BIT(event, bit)) | \ - p4_config_pack_cccr(metric | \ - P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event)))) - -static __initconst const u64 p4_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_PEBS_METRIC__1stl_cache_load_miss_retired), - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_PEBS_METRIC__2ndl_cache_load_miss_retired), - }, -}, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_PEBS_METRIC__dtlb_load_miss_retired), - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_PEBS_METRIC__dtlb_store_miss_retired), - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT, - P4_PEBS_METRIC__none), - [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS, - P4_PEBS_METRIC__none), - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -/* - * Because of Netburst being quite restricted in how many - * identical events may run simultaneously, we introduce event aliases, - * ie the different events which have the same functionality but - * utilize non-intersected resources (ESCR/CCCR/counter registers). - * - * This allow us to relax restrictions a bit and run two or more - * identical events together. - * - * Never set any custom internal bits such as P4_CONFIG_HT, - * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are - * either up to date automatically or not applicable at all. - */ -struct p4_event_alias { - u64 original; - u64 alternative; -} p4_event_aliases[] = { - { - /* - * Non-halted cycles can be substituted with non-sleeping cycles (see - * Intel SDM Vol3b for details). We need this alias to be able - * to run nmi-watchdog and 'perf top' (or any other user space tool - * which is interested in running PERF_COUNT_HW_CPU_CYCLES) - * simultaneously. - */ - .original = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | - P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), - .alternative = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)| - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)| - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)| - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)| - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))| - p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | - P4_CCCR_COMPARE), - }, -}; - -static u64 p4_get_alias_event(u64 config) -{ - u64 config_match; - int i; - - /* - * Only event with special mark is allowed, - * we're to be sure it didn't come as malformed - * RAW event. - */ - if (!(config & P4_CONFIG_ALIASABLE)) - return 0; - - config_match = config & P4_CONFIG_EVENT_ALIAS_MASK; - - for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) { - if (config_match == p4_event_aliases[i].original) { - config_match = p4_event_aliases[i].alternative; - break; - } else if (config_match == p4_event_aliases[i].alternative) { - config_match = p4_event_aliases[i].original; - break; - } - } - - if (i >= ARRAY_SIZE(p4_event_aliases)) - return 0; - - return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS); -} - -static u64 p4_general_events[PERF_COUNT_HW_MAX] = { - /* non-halted CPU clocks */ - [PERF_COUNT_HW_CPU_CYCLES] = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | - P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) | - P4_CONFIG_ALIASABLE, - - /* - * retired instructions - * in a sake of simplicity we don't use the FSB tagging - */ - [PERF_COUNT_HW_INSTRUCTIONS] = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) | - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) | - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)), - - /* cache hits */ - [PERF_COUNT_HW_CACHE_REFERENCES] = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)), - - /* cache misses */ - [PERF_COUNT_HW_CACHE_MISSES] = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)), - - /* branch instructions retired */ - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)), - - /* mispredicted branches retired */ - [PERF_COUNT_HW_BRANCH_MISSES] = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED) | - P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)), - - /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN): */ - [PERF_COUNT_HW_BUS_CYCLES] = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY) | - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) | - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) | - p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE), -}; - -static struct p4_event_bind *p4_config_get_bind(u64 config) -{ - unsigned int evnt = p4_config_unpack_event(config); - struct p4_event_bind *bind = NULL; - - if (evnt < ARRAY_SIZE(p4_event_bind_map)) - bind = &p4_event_bind_map[evnt]; - - return bind; -} - -static u64 p4_pmu_event_map(int hw_event) -{ - struct p4_event_bind *bind; - unsigned int esel; - u64 config; - - config = p4_general_events[hw_event]; - bind = p4_config_get_bind(config); - esel = P4_OPCODE_ESEL(bind->opcode); - config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel)); - - return config; -} - -/* check cpu model specifics */ -static bool p4_event_match_cpu_model(unsigned int event_idx) -{ - /* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */ - if (event_idx == P4_EVENT_INSTR_COMPLETED) { - if (boot_cpu_data.x86_model != 3 && - boot_cpu_data.x86_model != 4 && - boot_cpu_data.x86_model != 6) - return false; - } - - /* - * For info - * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2 - */ - - return true; -} - -static int p4_validate_raw_event(struct perf_event *event) -{ - unsigned int v, emask; - - /* User data may have out-of-bound event index */ - v = p4_config_unpack_event(event->attr.config); - if (v >= ARRAY_SIZE(p4_event_bind_map)) - return -EINVAL; - - /* It may be unsupported: */ - if (!p4_event_match_cpu_model(v)) - return -EINVAL; - - /* - * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as - * in Architectural Performance Monitoring, it means not - * on _which_ logical cpu to count but rather _when_, ie it - * depends on logical cpu state -- count event if one cpu active, - * none, both or any, so we just allow user to pass any value - * desired. - * - * In turn we always set Tx_OS/Tx_USR bits bound to logical - * cpu without their propagation to another cpu - */ - - /* - * if an event is shared across the logical threads - * the user needs special permissions to be able to use it - */ - if (p4_ht_active() && p4_event_bind_map[v].shared) { - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return -EACCES; - } - - /* ESCR EventMask bits may be invalid */ - emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK; - if (emask & ~p4_event_bind_map[v].escr_emask) - return -EINVAL; - - /* - * it may have some invalid PEBS bits - */ - if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) - return -EINVAL; - - v = p4_config_unpack_metric(event->attr.config); - if (v >= ARRAY_SIZE(p4_pebs_bind_map)) - return -EINVAL; - - return 0; -} - -static int p4_hw_config(struct perf_event *event) -{ - int cpu = get_cpu(); - int rc = 0; - u32 escr, cccr; - - /* - * the reason we use cpu that early is that: if we get scheduled - * first time on the same cpu -- we will not need swap thread - * specific flags in config (and will save some cpu cycles) - */ - - cccr = p4_default_cccr_conf(cpu); - escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel, - event->attr.exclude_user); - event->hw.config = p4_config_pack_escr(escr) | - p4_config_pack_cccr(cccr); - - if (p4_ht_active() && p4_ht_thread(cpu)) - event->hw.config = p4_set_ht_bit(event->hw.config); - - if (event->attr.type == PERF_TYPE_RAW) { - struct p4_event_bind *bind; - unsigned int esel; - /* - * Clear bits we reserve to be managed by kernel itself - * and never allowed from a user space - */ - event->attr.config &= P4_CONFIG_MASK; - - rc = p4_validate_raw_event(event); - if (rc) - goto out; - - /* - * Note that for RAW events we allow user to use P4_CCCR_RESERVED - * bits since we keep additional info here (for cache events and etc) - */ - event->hw.config |= event->attr.config; - bind = p4_config_get_bind(event->attr.config); - if (!bind) { - rc = -EINVAL; - goto out; - } - esel = P4_OPCODE_ESEL(bind->opcode); - event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel)); - } - - rc = x86_setup_perfctr(event); -out: - put_cpu(); - return rc; -} - -static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) -{ - u64 v; - - /* an official way for overflow indication */ - rdmsrl(hwc->config_base, v); - if (v & P4_CCCR_OVF) { - wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF); - return 1; - } - - /* - * In some circumstances the overflow might issue an NMI but did - * not set P4_CCCR_OVF bit. Because a counter holds a negative value - * we simply check for high bit being set, if it's cleared it means - * the counter has reached zero value and continued counting before - * real NMI signal was received: - */ - rdmsrl(hwc->event_base, v); - if (!(v & ARCH_P4_UNFLAGGED_BIT)) - return 1; - - return 0; -} - -static void p4_pmu_disable_pebs(void) -{ - /* - * FIXME - * - * It's still allowed that two threads setup same cache - * events so we can't simply clear metrics until we knew - * no one is depending on us, so we need kind of counter - * for "ReplayEvent" users. - * - * What is more complex -- RAW events, if user (for some - * reason) will pass some cache event metric with improper - * event opcode -- it's fine from hardware point of view - * but completely nonsense from "meaning" of such action. - * - * So at moment let leave metrics turned on forever -- it's - * ok for now but need to be revisited! - * - * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, 0); - * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, 0); - */ -} - -static inline void p4_pmu_disable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - /* - * If event gets disabled while counter is in overflowed - * state we need to clear P4_CCCR_OVF, otherwise interrupt get - * asserted again and again - */ - (void)wrmsrl_safe(hwc->config_base, - p4_config_unpack_cccr(hwc->config) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); -} - -static void p4_pmu_disable_all(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int idx; - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct perf_event *event = cpuc->events[idx]; - if (!test_bit(idx, cpuc->active_mask)) - continue; - p4_pmu_disable_event(event); - } - - p4_pmu_disable_pebs(); -} - -/* configuration must be valid */ -static void p4_pmu_enable_pebs(u64 config) -{ - struct p4_pebs_bind *bind; - unsigned int idx; - - BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK); - - idx = p4_config_unpack_metric(config); - if (idx == P4_PEBS_METRIC__none) - return; - - bind = &p4_pebs_bind_map[idx]; - - (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); - (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); -} - -static void p4_pmu_enable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - int thread = p4_ht_config_thread(hwc->config); - u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config)); - unsigned int idx = p4_config_unpack_event(hwc->config); - struct p4_event_bind *bind; - u64 escr_addr, cccr; - - bind = &p4_event_bind_map[idx]; - escr_addr = bind->escr_msr[thread]; - - /* - * - we dont support cascaded counters yet - * - and counter 1 is broken (erratum) - */ - WARN_ON_ONCE(p4_is_event_cascaded(hwc->config)); - WARN_ON_ONCE(hwc->idx == 1); - - /* we need a real Event value */ - escr_conf &= ~P4_ESCR_EVENT_MASK; - escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode)); - - cccr = p4_config_unpack_cccr(hwc->config); - - /* - * it could be Cache event so we need to write metrics - * into additional MSRs - */ - p4_pmu_enable_pebs(hwc->config); - - (void)wrmsrl_safe(escr_addr, escr_conf); - (void)wrmsrl_safe(hwc->config_base, - (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); -} - -static void p4_pmu_enable_all(int added) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int idx; - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct perf_event *event = cpuc->events[idx]; - if (!test_bit(idx, cpuc->active_mask)) - continue; - p4_pmu_enable_event(event); - } -} - -static int p4_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - struct perf_event *event; - struct hw_perf_event *hwc; - int idx, handled = 0; - u64 val; - - cpuc = this_cpu_ptr(&cpu_hw_events); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - int overflow; - - if (!test_bit(idx, cpuc->active_mask)) { - /* catch in-flight IRQs */ - if (__test_and_clear_bit(idx, cpuc->running)) - handled++; - continue; - } - - event = cpuc->events[idx]; - hwc = &event->hw; - - WARN_ON_ONCE(hwc->idx != idx); - - /* it might be unflagged overflow */ - overflow = p4_pmu_clear_cccr_ovf(hwc); - - val = x86_perf_event_update(event); - if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) - continue; - - handled += overflow; - - /* event overflow for sure */ - perf_sample_data_init(&data, 0, hwc->last_period); - - if (!x86_perf_event_set_period(event)) - continue; - - - if (perf_event_overflow(event, &data, regs)) - x86_pmu_stop(event, 0); - } - - if (handled) - inc_irq_stat(apic_perf_irqs); - - /* - * When dealing with the unmasking of the LVTPC on P4 perf hw, it has - * been observed that the OVF bit flag has to be cleared first _before_ - * the LVTPC can be unmasked. - * - * The reason is the NMI line will continue to be asserted while the OVF - * bit is set. This causes a second NMI to generate if the LVTPC is - * unmasked before the OVF bit is cleared, leading to unknown NMI - * messages. - */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - - return handled; -} - -/* - * swap thread specific fields according to a thread - * we are going to run on - */ -static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu) -{ - u32 escr, cccr; - - /* - * we either lucky and continue on same cpu or no HT support - */ - if (!p4_should_swap_ts(hwc->config, cpu)) - return; - - /* - * the event is migrated from an another logical - * cpu, so we need to swap thread specific flags - */ - - escr = p4_config_unpack_escr(hwc->config); - cccr = p4_config_unpack_cccr(hwc->config); - - if (p4_ht_thread(cpu)) { - cccr &= ~P4_CCCR_OVF_PMI_T0; - cccr |= P4_CCCR_OVF_PMI_T1; - if (escr & P4_ESCR_T0_OS) { - escr &= ~P4_ESCR_T0_OS; - escr |= P4_ESCR_T1_OS; - } - if (escr & P4_ESCR_T0_USR) { - escr &= ~P4_ESCR_T0_USR; - escr |= P4_ESCR_T1_USR; - } - hwc->config = p4_config_pack_escr(escr); - hwc->config |= p4_config_pack_cccr(cccr); - hwc->config |= P4_CONFIG_HT; - } else { - cccr &= ~P4_CCCR_OVF_PMI_T1; - cccr |= P4_CCCR_OVF_PMI_T0; - if (escr & P4_ESCR_T1_OS) { - escr &= ~P4_ESCR_T1_OS; - escr |= P4_ESCR_T0_OS; - } - if (escr & P4_ESCR_T1_USR) { - escr &= ~P4_ESCR_T1_USR; - escr |= P4_ESCR_T0_USR; - } - hwc->config = p4_config_pack_escr(escr); - hwc->config |= p4_config_pack_cccr(cccr); - hwc->config &= ~P4_CONFIG_HT; - } -} - -/* - * ESCR address hashing is tricky, ESCRs are not sequential - * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and - * the metric between any ESCRs is laid in range [0xa0,0xe1] - * - * so we make ~70% filled hashtable - */ - -#define P4_ESCR_MSR_BASE 0x000003a0 -#define P4_ESCR_MSR_MAX 0x000003e1 -#define P4_ESCR_MSR_TABLE_SIZE (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1) -#define P4_ESCR_MSR_IDX(msr) (msr - P4_ESCR_MSR_BASE) -#define P4_ESCR_MSR_TABLE_ENTRY(msr) [P4_ESCR_MSR_IDX(msr)] = msr - -static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = { - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1), -}; - -static int p4_get_escr_idx(unsigned int addr) -{ - unsigned int idx = P4_ESCR_MSR_IDX(addr); - - if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE || - !p4_escr_table[idx] || - p4_escr_table[idx] != addr)) { - WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr); - return -1; - } - - return idx; -} - -static int p4_next_cntr(int thread, unsigned long *used_mask, - struct p4_event_bind *bind) -{ - int i, j; - - for (i = 0; i < P4_CNTR_LIMIT; i++) { - j = bind->cntr[thread][i]; - if (j != -1 && !test_bit(j, used_mask)) - return j; - } - - return -1; -} - -static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) -{ - unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)]; - int cpu = smp_processor_id(); - struct hw_perf_event *hwc; - struct p4_event_bind *bind; - unsigned int i, thread, num; - int cntr_idx, escr_idx; - u64 config_alias; - int pass; - - bitmap_zero(used_mask, X86_PMC_IDX_MAX); - bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE); - - for (i = 0, num = n; i < n; i++, num--) { - - hwc = &cpuc->event_list[i]->hw; - thread = p4_ht_thread(cpu); - pass = 0; - -again: - /* - * It's possible to hit a circular lock - * between original and alternative events - * if both are scheduled already. - */ - if (pass > 2) - goto done; - - bind = p4_config_get_bind(hwc->config); - escr_idx = p4_get_escr_idx(bind->escr_msr[thread]); - if (unlikely(escr_idx == -1)) - goto done; - - if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) { - cntr_idx = hwc->idx; - if (assign) - assign[i] = hwc->idx; - goto reserve; - } - - cntr_idx = p4_next_cntr(thread, used_mask, bind); - if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) { - /* - * Check whether an event alias is still available. - */ - config_alias = p4_get_alias_event(hwc->config); - if (!config_alias) - goto done; - hwc->config = config_alias; - pass++; - goto again; - } - /* - * Perf does test runs to see if a whole group can be assigned - * together succesfully. There can be multiple rounds of this. - * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config - * bits, such that the next round of group assignments will - * cause the above p4_should_swap_ts to pass instead of fail. - * This leads to counters exclusive to thread0 being used by - * thread1. - * - * Solve this with a cheap hack, reset the idx back to -1 to - * force a new lookup (p4_next_cntr) to get the right counter - * for the right thread. - * - * This probably doesn't comply with the general spirit of how - * perf wants to work, but P4 is special. :-( - */ - if (p4_should_swap_ts(hwc->config, cpu)) - hwc->idx = -1; - p4_pmu_swap_config_ts(hwc, cpu); - if (assign) - assign[i] = cntr_idx; -reserve: - set_bit(cntr_idx, used_mask); - set_bit(escr_idx, escr_mask); - } - -done: - return num ? -EINVAL : 0; -} - -PMU_FORMAT_ATTR(cccr, "config:0-31" ); -PMU_FORMAT_ATTR(escr, "config:32-62"); -PMU_FORMAT_ATTR(ht, "config:63" ); - -static struct attribute *intel_p4_formats_attr[] = { - &format_attr_cccr.attr, - &format_attr_escr.attr, - &format_attr_ht.attr, - NULL, -}; - -static __initconst const struct x86_pmu p4_pmu = { - .name = "Netburst P4/Xeon", - .handle_irq = p4_pmu_handle_irq, - .disable_all = p4_pmu_disable_all, - .enable_all = p4_pmu_enable_all, - .enable = p4_pmu_enable_event, - .disable = p4_pmu_disable_event, - .eventsel = MSR_P4_BPU_CCCR0, - .perfctr = MSR_P4_BPU_PERFCTR0, - .event_map = p4_pmu_event_map, - .max_events = ARRAY_SIZE(p4_general_events), - .get_event_constraints = x86_get_event_constraints, - /* - * IF HT disabled we may need to use all - * ARCH_P4_MAX_CCCR counters simulaneously - * though leave it restricted at moment assuming - * HT is on - */ - .num_counters = ARCH_P4_MAX_CCCR, - .apic = 1, - .cntval_bits = ARCH_P4_CNTRVAL_BITS, - .cntval_mask = ARCH_P4_CNTRVAL_MASK, - .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, - .hw_config = p4_hw_config, - .schedule_events = p4_pmu_schedule_events, - /* - * This handles erratum N15 in intel doc 249199-029, - * the counter may not be updated correctly on write - * so we need a second write operation to do the trick - * (the official workaround didn't work) - * - * the former idea is taken from OProfile code - */ - .perfctr_second_write = 1, - - .format_attrs = intel_p4_formats_attr, -}; - -__init int p4_pmu_init(void) -{ - unsigned int low, high; - int i, reg; - - /* If we get stripped -- indexing fails */ - BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC); - - rdmsr(MSR_IA32_MISC_ENABLE, low, high); - if (!(low & (1 << 7))) { - pr_cont("unsupported Netburst CPU model %d ", - boot_cpu_data.x86_model); - return -ENODEV; - } - - memcpy(hw_cache_event_ids, p4_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - pr_cont("Netburst events, "); - - x86_pmu = p4_pmu; - - /* - * Even though the counters are configured to interrupt a particular - * logical processor when an overflow happens, testing has shown that - * on kdump kernels (which uses a single cpu), thread1's counter - * continues to run and will report an NMI on thread0. Due to the - * overflow bug, this leads to a stream of unknown NMIs. - * - * Solve this by zero'ing out the registers to mimic a reset. - */ - for (i = 0; i < x86_pmu.num_counters; i++) { - reg = x86_pmu_config_addr(i); - wrmsrl_safe(reg, 0ULL); - } - - return 0; -} diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c deleted file mode 100644 index 7c1a0c07b607..000000000000 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ /dev/null @@ -1,279 +0,0 @@ -#include <linux/perf_event.h> -#include <linux/types.h> - -#include "perf_event.h" - -/* - * Not sure about some of these - */ -static const u64 p6_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, /* CPU_CLK_UNHALTED */ - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, /* INST_RETIRED */ - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, /* L2_RQSTS:M:E:S:I */ - [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, /* L2_RQSTS:I */ - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, /* BR_INST_RETIRED */ - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, /* BR_MISS_PRED_RETIRED */ - [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, /* BUS_DRDY_CLOCKS */ - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2, /* RESOURCE_STALLS */ - -}; - -static const u64 __initconst p6_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ - [ C(RESULT_MISS) ] = 0x0045, /* DCU_LINES_IN */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0x0f29, /* L2_LD:M:E:S:I */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ - [ C(RESULT_MISS) ] = 0x0f28, /* L2_IFETCH:M:E:S:I */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0x0025, /* L2_M_LINES_INM */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ - [ C(RESULT_MISS) ] = 0x0085, /* ITLB_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED */ - [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISS_PRED_RETIRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static u64 p6_pmu_event_map(int hw_event) -{ - return p6_perfmon_event_map[hw_event]; -} - -/* - * Event setting that is specified not to count anything. - * We use this to effectively disable a counter. - * - * L2_RQSTS with 0 MESI unit mask. - */ -#define P6_NOP_EVENT 0x0000002EULL - -static struct event_constraint p6_event_constraints[] = -{ - INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ - INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ - INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - EVENT_CONSTRAINT_END -}; - -static void p6_pmu_disable_all(void) -{ - u64 val; - - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); -} - -static void p6_pmu_enable_all(int added) -{ - unsigned long val; - - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val |= ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); -} - -static inline void -p6_pmu_disable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 val = P6_NOP_EVENT; - - (void)wrmsrl_safe(hwc->config_base, val); -} - -static void p6_pmu_enable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 val; - - val = hwc->config; - - /* - * p6 only has a global event enable, set on PerfEvtSel0 - * We "disable" events by programming P6_NOP_EVENT - * and we rely on p6_pmu_enable_all() being called - * to actually enable the events. - */ - - (void)wrmsrl_safe(hwc->config_base, val); -} - -PMU_FORMAT_ATTR(event, "config:0-7" ); -PMU_FORMAT_ATTR(umask, "config:8-15" ); -PMU_FORMAT_ATTR(edge, "config:18" ); -PMU_FORMAT_ATTR(pc, "config:19" ); -PMU_FORMAT_ATTR(inv, "config:23" ); -PMU_FORMAT_ATTR(cmask, "config:24-31" ); - -static struct attribute *intel_p6_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_pc.attr, - &format_attr_inv.attr, - &format_attr_cmask.attr, - NULL, -}; - -static __initconst const struct x86_pmu p6_pmu = { - .name = "p6", - .handle_irq = x86_pmu_handle_irq, - .disable_all = p6_pmu_disable_all, - .enable_all = p6_pmu_enable_all, - .enable = p6_pmu_enable_event, - .disable = p6_pmu_disable_event, - .hw_config = x86_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_P6_EVNTSEL0, - .perfctr = MSR_P6_PERFCTR0, - .event_map = p6_pmu_event_map, - .max_events = ARRAY_SIZE(p6_perfmon_event_map), - .apic = 1, - .max_period = (1ULL << 31) - 1, - .version = 0, - .num_counters = 2, - /* - * Events have 40 bits implemented. However they are designed such - * that bits [32-39] are sign extensions of bit 31. As such the - * effective width of a event for P6-like PMU is 32 bits only. - * - * See IA-32 Intel Architecture Software developer manual Vol 3B - */ - .cntval_bits = 32, - .cntval_mask = (1ULL << 32) - 1, - .get_event_constraints = x86_get_event_constraints, - .event_constraints = p6_event_constraints, - - .format_attrs = intel_p6_formats_attr, - .events_sysfs_show = intel_event_sysfs_show, - -}; - -static __init void p6_pmu_rdpmc_quirk(void) -{ - if (boot_cpu_data.x86_mask < 9) { - /* - * PPro erratum 26; fixed in stepping 9 and above. - */ - pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n"); - x86_pmu.attr_rdpmc_broken = 1; - x86_pmu.attr_rdpmc = 0; - } -} - -__init int p6_pmu_init(void) -{ - x86_pmu = p6_pmu; - - switch (boot_cpu_data.x86_model) { - case 1: /* Pentium Pro */ - x86_add_quirk(p6_pmu_rdpmc_quirk); - break; - - case 3: /* Pentium II - Klamath */ - case 5: /* Pentium II - Deschutes */ - case 6: /* Pentium II - Mendocino */ - break; - - case 7: /* Pentium III - Katmai */ - case 8: /* Pentium III - Coppermine */ - case 10: /* Pentium III Xeon */ - case 11: /* Pentium III - Tualatin */ - break; - - case 9: /* Pentium M - Banias */ - case 13: /* Pentium M - Dothan */ - break; - - default: - pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model); - return -ENODEV; - } - - memcpy(hw_cache_event_ids, p6_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - return 0; -} diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 819d94982e07..f6f50c4ceaec 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -51,7 +51,7 @@ void x86_init_rdrand(struct cpuinfo_x86 *c) for (i = 0; i < SANITY_CHECK_LOOPS; i++) { if (!rdrand_long(&tmp)) { clear_cpu_cap(c, X86_FEATURE_RDRAND); - printk_once(KERN_WARNING "rdrand: disabled\n"); + pr_warn_once("rdrand: disabled\n"); return; } } diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 4c60eaf0571c..cd531355e838 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -87,10 +87,10 @@ void detect_extended_topology(struct cpuinfo_x86 *c) c->x86_max_cores = (core_level_siblings / smp_num_siblings); if (!printed) { - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + pr_info("CPU: Physical Processor ID: %d\n", c->phys_proc_id); if (c->x86_max_cores > 1) - printk(KERN_INFO "CPU: Processor Core ID: %d\n", + pr_info("CPU: Processor Core ID: %d\n", c->cpu_core_id); printed = 1; } diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 252da7aceca6..34178564be2a 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -1,6 +1,6 @@ #include <linux/kernel.h> #include <linux/mm.h> -#include <asm/processor.h> +#include <asm/cpufeature.h> #include <asm/msr.h> #include "cpu.h" @@ -33,7 +33,7 @@ static void init_transmeta(struct cpuinfo_x86 *c) if (max >= 0x80860001) { cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags); if (cpu_rev != 0x02000000) { - printk(KERN_INFO "CPU: Processor revision %u.%u.%u.%u, %u MHz\n", + pr_info("CPU: Processor revision %u.%u.%u.%u, %u MHz\n", (cpu_rev >> 24) & 0xff, (cpu_rev >> 16) & 0xff, (cpu_rev >> 8) & 0xff, @@ -44,10 +44,10 @@ static void init_transmeta(struct cpuinfo_x86 *c) if (max >= 0x80860002) { cpuid(0x80860002, &new_cpu_rev, &cms_rev1, &cms_rev2, &dummy); if (cpu_rev == 0x02000000) { - printk(KERN_INFO "CPU: Processor revision %08X, %u MHz\n", + pr_info("CPU: Processor revision %08X, %u MHz\n", new_cpu_rev, cpu_freq); } - printk(KERN_INFO "CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n", + pr_info("CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n", (cms_rev1 >> 24) & 0xff, (cms_rev1 >> 16) & 0xff, (cms_rev1 >> 8) & 0xff, @@ -76,7 +76,7 @@ static void init_transmeta(struct cpuinfo_x86 *c) (void *)&cpu_info[56], (void *)&cpu_info[60]); cpu_info[64] = '\0'; - printk(KERN_INFO "CPU: %s\n", cpu_info); + pr_info("CPU: %s\n", cpu_info); } /* Unhide possibly hidden capability flags */ diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 628a059a9a06..364e58346897 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -62,7 +62,7 @@ static unsigned long vmware_get_tsc_khz(void) tsc_hz = eax | (((uint64_t)ebx) << 32); do_div(tsc_hz, 1000); BUG_ON(tsc_hz >> 32); - printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", + pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n", (unsigned long) tsc_hz / 1000, (unsigned long) tsc_hz % 1000); @@ -84,8 +84,7 @@ static void __init vmware_platform_setup(void) if (ebx != UINT_MAX) x86_platform.calibrate_tsc = vmware_get_tsc_khz; else - printk(KERN_WARNING - "Failed to get TSC freq from the hypervisor\n"); + pr_warn("Failed to get TSC freq from the hypervisor\n"); } /* diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 58f34319b29a..9ef978d69c22 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -57,10 +57,9 @@ struct crash_elf_data { struct kimage *image; /* * Total number of ram ranges we have after various adjustments for - * GART, crash reserved region etc. + * crash reserved region, etc. */ unsigned int max_nr_ranges; - unsigned long gart_start, gart_end; /* Pointer to elf header */ void *ehdr; @@ -201,17 +200,6 @@ static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg) return 0; } -static int get_gart_ranges_callback(u64 start, u64 end, void *arg) -{ - struct crash_elf_data *ced = arg; - - ced->gart_start = start; - ced->gart_end = end; - - /* Not expecting more than 1 gart aperture */ - return 1; -} - /* Gather all the required information to prepare elf headers for ram regions */ static void fill_up_crash_elf_data(struct crash_elf_data *ced, @@ -226,22 +214,6 @@ static void fill_up_crash_elf_data(struct crash_elf_data *ced, ced->max_nr_ranges = nr_ranges; - /* - * We don't create ELF headers for GART aperture as an attempt - * to dump this memory in second kernel leads to hang/crash. - * If gart aperture is present, one needs to exclude that region - * and that could lead to need of extra phdr. - */ - walk_iomem_res("GART", IORESOURCE_MEM, 0, -1, - ced, get_gart_ranges_callback); - - /* - * If we have gart region, excluding that could potentially split - * a memory range, resulting in extra header. Account for that. - */ - if (ced->gart_end) - ced->max_nr_ranges++; - /* Exclusion of crash region could split memory ranges */ ced->max_nr_ranges++; @@ -350,13 +322,6 @@ static int elf_header_exclude_ranges(struct crash_elf_data *ced, return ret; } - /* Exclude GART region */ - if (ced->gart_end) { - ret = exclude_mem_range(cmem, ced->gart_start, ced->gart_end); - if (ret) - return ret; - } - return ret; } @@ -599,12 +564,12 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) /* Add ACPI tables */ cmd.type = E820_ACPI; flags = IORESOURCE_MEM | IORESOURCE_BUSY; - walk_iomem_res("ACPI Tables", flags, 0, -1, &cmd, + walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd, memmap_entry_callback); /* Add ACPI Non-volatile Storage */ cmd.type = E820_NVS; - walk_iomem_res("ACPI Non-volatile Storage", flags, 0, -1, &cmd, + walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd, memmap_entry_callback); /* Add crashk_low_res region */ diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 9c30acfadae2..8efa57a5f29e 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -135,7 +135,8 @@ print_context_stack_bp(struct thread_info *tinfo, if (!__kernel_text_address(addr)) break; - ops->address(data, addr, 1); + if (ops->address(data, addr, 1)) + break; frame = frame->next_frame; ret_addr = &frame->return_address; print_ftrace_graph_addr(addr, data, ops, tinfo, graph); @@ -154,10 +155,11 @@ static int print_trace_stack(void *data, char *name) /* * Print one address/symbol entries per line. */ -static void print_trace_address(void *data, unsigned long addr, int reliable) +static int print_trace_address(void *data, unsigned long addr, int reliable) { touch_nmi_watchdog(); printk_stack_address(addr, reliable, data); + return 0; } static const struct stacktrace_ops print_trace_ops = { @@ -265,9 +267,8 @@ int __die(const char *str, struct pt_regs *regs, long err) #ifdef CONFIG_SMP printk("SMP "); #endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC "); -#endif + if (debug_pagealloc_enabled()) + printk("DEBUG_PAGEALLOC "); #ifdef CONFIG_KASAN printk("KASAN"); #endif diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 569c1e4f96fe..621b501f8935 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -24,6 +24,7 @@ #include <asm/e820.h> #include <asm/proto.h> #include <asm/setup.h> +#include <asm/cpufeature.h> /* * The e820 map is the map that gets modified e.g. with command line parameters @@ -925,6 +926,41 @@ static const char *e820_type_to_string(int e820_type) } } +static unsigned long e820_type_to_iomem_type(int e820_type) +{ + switch (e820_type) { + case E820_RESERVED_KERN: + case E820_RAM: + return IORESOURCE_SYSTEM_RAM; + case E820_ACPI: + case E820_NVS: + case E820_UNUSABLE: + case E820_PRAM: + case E820_PMEM: + default: + return IORESOURCE_MEM; + } +} + +static unsigned long e820_type_to_iores_desc(int e820_type) +{ + switch (e820_type) { + case E820_ACPI: + return IORES_DESC_ACPI_TABLES; + case E820_NVS: + return IORES_DESC_ACPI_NV_STORAGE; + case E820_PMEM: + return IORES_DESC_PERSISTENT_MEMORY; + case E820_PRAM: + return IORES_DESC_PERSISTENT_MEMORY_LEGACY; + case E820_RESERVED_KERN: + case E820_RAM: + case E820_UNUSABLE: + default: + return IORES_DESC_NONE; + } +} + static bool do_mark_busy(u32 type, struct resource *res) { /* this is the legacy bios/dos rom-shadow + mmio region */ @@ -967,7 +1003,8 @@ void __init e820_reserve_resources(void) res->start = e820.map[i].addr; res->end = end; - res->flags = IORESOURCE_MEM; + res->flags = e820_type_to_iomem_type(e820.map[i].type); + res->desc = e820_type_to_iores_desc(e820.map[i].type); /* * don't register the region that could be conflicted with diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 21bf92490a7b..8a121991e5ba 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -287,7 +287,7 @@ static __init void early_pci_serial_init(char *s) } /* - * Lastly, initalize the hardware + * Lastly, initialize the hardware */ if (*s) { if (strcmp(s, "nocfg") == 0) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index d25097c3fc1d..8e37cc8a539a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -114,6 +114,10 @@ void __kernel_fpu_begin(void) kernel_fpu_disable(); if (fpu->fpregs_active) { + /* + * Ignore return value -- we don't care if reg state + * is clobbered. + */ copy_fpregs_to_fpstate(fpu); } else { this_cpu_write(fpu_fpregs_owner_ctx, NULL); @@ -189,8 +193,12 @@ void fpu__save(struct fpu *fpu) preempt_disable(); if (fpu->fpregs_active) { - if (!copy_fpregs_to_fpstate(fpu)) - fpregs_deactivate(fpu); + if (!copy_fpregs_to_fpstate(fpu)) { + if (use_eager_fpu()) + copy_kernel_to_fpregs(&fpu->state); + else + fpregs_deactivate(fpu); + } } preempt_enable(); } @@ -223,14 +231,15 @@ void fpstate_init(union fpregs_state *state) } EXPORT_SYMBOL_GPL(fpstate_init); -/* - * Copy the current task's FPU state to a new task's FPU context. - * - * In both the 'eager' and the 'lazy' case we save hardware registers - * directly to the destination buffer. - */ -static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) +int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { + dst_fpu->counter = 0; + dst_fpu->fpregs_active = 0; + dst_fpu->last_cpu = -1; + + if (!src_fpu->fpstate_active || !cpu_has_fpu) + return 0; + WARN_ON_FPU(src_fpu != ¤t->thread.fpu); /* @@ -243,10 +252,9 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) /* * Save current FPU registers directly into the child * FPU context, without any memory-to-memory copying. - * - * If the FPU context got destroyed in the process (FNSAVE - * done on old CPUs) then copy it back into the source - * context and mark the current task for lazy restore. + * In lazy mode, if the FPU context isn't loaded into + * fpregs, CR0.TS will be set and do_device_not_available + * will load the FPU context. * * We have to do all this with preemption disabled, * mostly because of the FNSAVE case, because in that @@ -259,19 +267,13 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) preempt_disable(); if (!copy_fpregs_to_fpstate(dst_fpu)) { memcpy(&src_fpu->state, &dst_fpu->state, xstate_size); - fpregs_deactivate(src_fpu); + + if (use_eager_fpu()) + copy_kernel_to_fpregs(&src_fpu->state); + else + fpregs_deactivate(src_fpu); } preempt_enable(); -} - -int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) -{ - dst_fpu->counter = 0; - dst_fpu->fpregs_active = 0; - dst_fpu->last_cpu = -1; - - if (src_fpu->fpstate_active && cpu_has_fpu) - fpu_copy(dst_fpu, src_fpu); return 0; } @@ -352,6 +354,69 @@ void fpu__activate_fpstate_write(struct fpu *fpu) } /* + * This function must be called before we write the current + * task's fpstate. + * + * This call gets the current FPU register state and moves + * it in to the 'fpstate'. Preemption is disabled so that + * no writes to the 'fpstate' can occur from context + * swiches. + * + * Must be followed by a fpu__current_fpstate_write_end(). + */ +void fpu__current_fpstate_write_begin(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + /* + * Ensure that the context-switching code does not write + * over the fpstate while we are doing our update. + */ + preempt_disable(); + + /* + * Move the fpregs in to the fpu's 'fpstate'. + */ + fpu__activate_fpstate_read(fpu); + + /* + * The caller is about to write to 'fpu'. Ensure that no + * CPU thinks that its fpregs match the fpstate. This + * ensures we will not be lazy and skip a XRSTOR in the + * future. + */ + fpu->last_cpu = -1; +} + +/* + * This function must be paired with fpu__current_fpstate_write_begin() + * + * This will ensure that the modified fpstate gets placed back in + * the fpregs if necessary. + * + * Note: This function may be called whether or not an _actual_ + * write to the fpstate occurred. + */ +void fpu__current_fpstate_write_end(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + /* + * 'fpu' now has an updated copy of the state, but the + * registers may still be out of date. Update them with + * an XRSTOR if they are active. + */ + if (fpregs_active()) + copy_kernel_to_fpregs(&fpu->state); + + /* + * Our update is done and the fpregs/fpstate are in sync + * if necessary. Context switches can happen again. + */ + preempt_enable(); +} + +/* * 'fpu__restore()' is called to copy FPU registers from * the FPU fpstate to the live hw registers and to activate * access to the hardware registers, so that FPU instructions @@ -409,8 +474,10 @@ static inline void copy_init_fpstate_to_fpregs(void) { if (use_xsave()) copy_kernel_to_xregs(&init_fpstate.xsave, -1); - else + else if (static_cpu_has(X86_FEATURE_FXSR)) copy_kernel_to_fxregs(&init_fpstate.fxsave); + else + copy_kernel_to_fregs(&init_fpstate.fsave); } /* @@ -423,7 +490,7 @@ void fpu__clear(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); /* Almost certainly an anomaly */ - if (!use_eager_fpu()) { + if (!use_eager_fpu() || !static_cpu_has(X86_FEATURE_FPU)) { /* FPU state will be reallocated lazily at the first use. */ fpu__drop(fpu); } else { diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 6d9f0a7ef4c8..54c86fffbf9f 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -78,13 +78,15 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) cr0 &= ~(X86_CR0_TS | X86_CR0_EM); write_cr0(cr0); - asm volatile("fninit ; fnstsw %0 ; fnstcw %1" - : "+m" (fsw), "+m" (fcw)); + if (!test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { + asm volatile("fninit ; fnstsw %0 ; fnstcw %1" + : "+m" (fsw), "+m" (fcw)); - if (fsw == 0 && (fcw & 0x103f) == 0x003f) - set_cpu_cap(c, X86_FEATURE_FPU); - else - clear_cpu_cap(c, X86_FEATURE_FPU); + if (fsw == 0 && (fcw & 0x103f) == 0x003f) + set_cpu_cap(c, X86_FEATURE_FPU); + else + clear_cpu_cap(c, X86_FEATURE_FPU); + } #ifndef CONFIG_MATH_EMULATION if (!cpu_has_fpu) { @@ -132,7 +134,7 @@ static void __init fpu__init_system_generic(void) * Set up the legacy init FPU context. (xstate init might overwrite this * with a more modern format, if the CPU supports it.) */ - fpstate_init_fxstate(&init_fpstate.fxsave); + fpstate_init(&init_fpstate); fpu__init_system_mxcsr(); } @@ -260,7 +262,10 @@ static void __init fpu__init_system_xstate_size_legacy(void) * not only saved the restores along the way, but we also have the * FPU ready to be used for the original task. * - * 'eager' switching is used on modern CPUs, there we switch the FPU + * 'lazy' is deprecated because it's almost never a performance win + * and it's much more complicated than 'eager'. + * + * 'eager' switching is by default on all CPUs, there we switch the FPU * state during every context switch, regardless of whether the task * has used FPU instructions in that time slice or not. This is done * because modern FPU context saving instructions are able to optimize @@ -271,7 +276,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) * to use 'eager' restores, if we detect that a task is using the FPU * frequently. See the fpu->counter logic in fpu/internal.h for that. ] */ -static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; +static enum { ENABLE, DISABLE } eagerfpu = ENABLE; /* * Find supported xfeatures based on cpu features and command-line input. @@ -300,12 +305,6 @@ u64 __init fpu__get_supported_xfeatures_mask(void) static void __init fpu__clear_eager_fpu_features(void) { setup_clear_cpu_cap(X86_FEATURE_MPX); - setup_clear_cpu_cap(X86_FEATURE_AVX); - setup_clear_cpu_cap(X86_FEATURE_AVX2); - setup_clear_cpu_cap(X86_FEATURE_AVX512F); - setup_clear_cpu_cap(X86_FEATURE_AVX512PF); - setup_clear_cpu_cap(X86_FEATURE_AVX512ER); - setup_clear_cpu_cap(X86_FEATURE_AVX512CD); } /* @@ -348,15 +347,9 @@ static void __init fpu__init_system_ctx_switch(void) */ static void __init fpu__init_parse_early_param(void) { - /* - * No need to check "eagerfpu=auto" again, since it is the - * initial default. - */ if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) { eagerfpu = DISABLE; fpu__clear_eager_fpu_features(); - } else if (cmdline_find_option_bool(boot_command_line, "eagerfpu=on")) { - eagerfpu = ENABLE; } if (cmdline_find_option_bool(boot_command_line, "no387")) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 0bc3490420c5..8bd1c003942a 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -8,7 +8,7 @@ /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, * as the "regset->n" for the xstate regset will be updated based on the feature - * capabilites supported by the xsave. + * capabilities supported by the xsave. */ int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset) { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index d425cda5ae6d..b48ef35b28d4 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -5,6 +5,7 @@ */ #include <linux/compat.h> #include <linux/cpu.h> +#include <linux/pkeys.h> #include <asm/fpu/api.h> #include <asm/fpu/internal.h> @@ -13,6 +14,11 @@ #include <asm/tlbflush.h> +/* + * Although we spell it out in here, the Processor Trace + * xfeature is completely unused. We use other mechanisms + * to save/restore PT state in Linux. + */ static const char *xfeature_names[] = { "x87 floating point registers" , @@ -23,6 +29,8 @@ static const char *xfeature_names[] = "AVX-512 opmask" , "AVX-512 Hi256" , "AVX-512 ZMM_Hi256" , + "Processor Trace (unused)" , + "Protection Keys User registers", "unknown xstate feature" , }; @@ -51,8 +59,12 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_AVX512PF); setup_clear_cpu_cap(X86_FEATURE_AVX512ER); setup_clear_cpu_cap(X86_FEATURE_AVX512CD); + setup_clear_cpu_cap(X86_FEATURE_AVX512DQ); + setup_clear_cpu_cap(X86_FEATURE_AVX512BW); + setup_clear_cpu_cap(X86_FEATURE_AVX512VL); setup_clear_cpu_cap(X86_FEATURE_MPX); setup_clear_cpu_cap(X86_FEATURE_XGETBV1); + setup_clear_cpu_cap(X86_FEATURE_PKU); } /* @@ -231,7 +243,7 @@ static void __init print_xstate_feature(u64 xstate_mask) const char *feature_name; if (cpu_has_xfeatures(xstate_mask, &feature_name)) - pr_info("x86/fpu: Supporting XSAVE feature 0x%02Lx: '%s'\n", xstate_mask, feature_name); + pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name); } /* @@ -247,6 +259,7 @@ static void __init print_xstate_features(void) print_xstate_feature(XFEATURE_MASK_OPMASK); print_xstate_feature(XFEATURE_MASK_ZMM_Hi256); print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); + print_xstate_feature(XFEATURE_MASK_PKRU); } /* @@ -463,6 +476,7 @@ static void check_xstate_against_struct(int nr) XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state); XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state); XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); + XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state); /* * Make *SURE* to add any feature numbers in below if @@ -470,7 +484,8 @@ static void check_xstate_against_struct(int nr) * numbers. */ if ((nr < XFEATURE_YMM) || - (nr >= XFEATURE_MAX)) { + (nr >= XFEATURE_MAX) || + (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) { WARN_ONCE(1, "no structure for xstate: %d\n", nr); XSTATE_WARN_ON(1); } @@ -668,6 +683,19 @@ void fpu__resume_cpu(void) } /* + * Given an xstate feature mask, calculate where in the xsave + * buffer the state is. Callers should ensure that the buffer + * is valid. + * + * Note: does not work for compacted buffers. + */ +void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask) +{ + int feature_nr = fls64(xstate_feature_mask) - 1; + + return (void *)xsave + xstate_comp_offsets[feature_nr]; +} +/* * Given the xsave area and a state inside, this function returns the * address of the state. * @@ -687,7 +715,6 @@ void fpu__resume_cpu(void) */ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) { - int feature_nr = fls64(xstate_feature) - 1; /* * Do we even *have* xsave state? */ @@ -715,7 +742,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) if (!(xsave->header.xfeatures & xstate_feature)) return NULL; - return (void *)xsave + xstate_comp_offsets[feature_nr]; + return __raw_xsave_addr(xsave, xstate_feature); } EXPORT_SYMBOL_GPL(get_xsave_addr); @@ -750,3 +777,156 @@ const void *get_xsave_field_ptr(int xsave_state) return get_xsave_addr(&fpu->state.xsave, xsave_state); } + + +/* + * Set xfeatures (aka XSTATE_BV) bit for a feature that we want + * to take out of its "init state". This will ensure that an + * XRSTOR actually restores the state. + */ +static void fpu__xfeature_set_non_init(struct xregs_state *xsave, + int xstate_feature_mask) +{ + xsave->header.xfeatures |= xstate_feature_mask; +} + +/* + * This function is safe to call whether the FPU is in use or not. + * + * Note that this only works on the current task. + * + * Inputs: + * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP, + * XFEATURE_MASK_SSE, etc...) + * @xsave_state_ptr: a pointer to a copy of the state that you would + * like written in to the current task's FPU xsave state. This pointer + * must not be located in the current tasks's xsave area. + * Output: + * address of the state in the xsave area or NULL if the state + * is not present or is in its 'init state'. + */ +static void fpu__xfeature_set_state(int xstate_feature_mask, + void *xstate_feature_src, size_t len) +{ + struct xregs_state *xsave = ¤t->thread.fpu.state.xsave; + struct fpu *fpu = ¤t->thread.fpu; + void *dst; + + if (!boot_cpu_has(X86_FEATURE_XSAVE)) { + WARN_ONCE(1, "%s() attempted with no xsave support", __func__); + return; + } + + /* + * Tell the FPU code that we need the FPU state to be in + * 'fpu' (not in the registers), and that we need it to + * be stable while we write to it. + */ + fpu__current_fpstate_write_begin(); + + /* + * This method *WILL* *NOT* work for compact-format + * buffers. If the 'xstate_feature_mask' is unset in + * xcomp_bv then we may need to move other feature state + * "up" in the buffer. + */ + if (xsave->header.xcomp_bv & xstate_feature_mask) { + WARN_ON_ONCE(1); + goto out; + } + + /* find the location in the xsave buffer of the desired state */ + dst = __raw_xsave_addr(&fpu->state.xsave, xstate_feature_mask); + + /* + * Make sure that the pointer being passed in did not + * come from the xsave buffer itself. + */ + WARN_ONCE(xstate_feature_src == dst, "set from xsave buffer itself"); + + /* put the caller-provided data in the location */ + memcpy(dst, xstate_feature_src, len); + + /* + * Mark the xfeature so that the CPU knows there is state + * in the buffer now. + */ + fpu__xfeature_set_non_init(xsave, xstate_feature_mask); +out: + /* + * We are done writing to the 'fpu'. Reenable preeption + * and (possibly) move the fpstate back in to the fpregs. + */ + fpu__current_fpstate_write_end(); +} + +#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2) +#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1) + +/* + * This will go out and modify the XSAVE buffer so that PKRU is + * set to a particular state for access to 'pkey'. + * + * PKRU state does affect kernel access to user memory. We do + * not modfiy PKRU *itself* here, only the XSAVE state that will + * be restored in to PKRU when we return back to userspace. + */ +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, + unsigned long init_val) +{ + struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; + struct pkru_state *old_pkru_state; + struct pkru_state new_pkru_state; + int pkey_shift = (pkey * PKRU_BITS_PER_PKEY); + u32 new_pkru_bits = 0; + + /* + * This check implies XSAVE support. OSPKE only gets + * set if we enable XSAVE and we enable PKU in XCR0. + */ + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return -EINVAL; + + /* Set the bits we need in PKRU */ + if (init_val & PKEY_DISABLE_ACCESS) + new_pkru_bits |= PKRU_AD_BIT; + if (init_val & PKEY_DISABLE_WRITE) + new_pkru_bits |= PKRU_WD_BIT; + + /* Shift the bits in to the correct place in PKRU for pkey. */ + new_pkru_bits <<= pkey_shift; + + /* Locate old copy of the state in the xsave buffer */ + old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU); + + /* + * When state is not in the buffer, it is in the init + * state, set it manually. Otherwise, copy out the old + * state. + */ + if (!old_pkru_state) + new_pkru_state.pkru = 0; + else + new_pkru_state.pkru = old_pkru_state->pkru; + + /* mask off any old bits in place */ + new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); + /* Set the newly-requested bits */ + new_pkru_state.pkru |= new_pkru_bits; + + /* + * We could theoretically live without zeroing pkru.pad. + * The current XSAVE feature state definition says that + * only bytes 0->3 are used. But we do not want to + * chance leaking kernel stack out to userspace in case a + * memcpy() of the whole xsave buffer was done. + * + * They're in the same cacheline anyway. + */ + new_pkru_state.pad = 0; + + fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state, + sizeof(new_pkru_state)); + + return 0; +} diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 29408d6d6626..d036cfb4495d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -1,5 +1,5 @@ /* - * Code for replacing ftrace calls with jumps. + * Dynamic function tracing support. * * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> * @@ -81,9 +81,9 @@ within(unsigned long addr, unsigned long start, unsigned long end) static unsigned long text_ip_addr(unsigned long ip) { /* - * On x86_64, kernel text mappings are mapped read-only with - * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead - * of the kernel text mapping to modify the kernel text. + * On x86_64, kernel text mappings are mapped read-only, so we use + * the kernel identity mapping instead of the kernel text mapping + * to modify the kernel text. * * For 32bit kernels, these mappings are same and we can use * kernel identity mapping to modify code. @@ -697,9 +697,8 @@ static inline void tramp_free(void *tramp) { } #endif /* Defined as markers to the end of the ftrace default trampolines */ -extern void ftrace_caller_end(void); extern void ftrace_regs_caller_end(void); -extern void ftrace_return(void); +extern void ftrace_epilogue(void); extern void ftrace_caller_op_ptr(void); extern void ftrace_regs_caller_op_ptr(void); @@ -746,7 +745,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) op_offset = (unsigned long)ftrace_regs_caller_op_ptr; } else { start_offset = (unsigned long)ftrace_caller; - end_offset = (unsigned long)ftrace_caller_end; + end_offset = (unsigned long)ftrace_epilogue; op_offset = (unsigned long)ftrace_caller_op_ptr; } @@ -754,7 +753,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* * Allocate enough size to store the ftrace_caller code, - * the jmp to ftrace_return, as well as the address of + * the jmp to ftrace_epilogue, as well as the address of * the ftrace_ops this trampoline is used for. */ trampoline = alloc_tramp(size + MCOUNT_INSN_SIZE + sizeof(void *)); @@ -772,8 +771,8 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ip = (unsigned long)trampoline + size; - /* The trampoline ends with a jmp to ftrace_return */ - jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_return); + /* The trampoline ends with a jmp to ftrace_epilogue */ + jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_epilogue); memcpy(trampoline + size, jmp, MCOUNT_INSN_SIZE); /* diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 2c0f3407bd1f..1f4422d5c8d0 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -40,13 +40,8 @@ pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); /* Wipe all early page tables except for the kernel symbol map */ static void __init reset_early_page_tables(void) { - unsigned long i; - - for (i = 0; i < PTRS_PER_PGD-1; i++) - early_level4_pgt[i].pgd = 0; - + memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); next_early_pgt = 0; - write_cr3(__pa_nodebug(early_level4_pgt)); } @@ -54,7 +49,6 @@ static void __init reset_early_page_tables(void) int __init early_make_pgtable(unsigned long address) { unsigned long physaddr = address - __PAGE_OFFSET; - unsigned long i; pgdval_t pgd, *pgd_p; pudval_t pud, *pud_p; pmdval_t pmd, *pmd_p; @@ -81,8 +75,7 @@ again: } pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; - for (i = 0; i < PTRS_PER_PUD; i++) - pud_p[i] = 0; + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pud_p += pud_index(address); @@ -97,8 +90,7 @@ again: } pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; - for (i = 0; i < PTRS_PER_PMD; i++) - pmd_p[i] = 0; + memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pmd = (physaddr & PMD_MASK) + early_pmd_flags; diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 6bc9ae24b6d2..54cdbd2003fe 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -19,7 +19,7 @@ #include <asm/setup.h> #include <asm/processor-flags.h> #include <asm/msr-index.h> -#include <asm/cpufeature.h> +#include <asm/cpufeatures.h> #include <asm/percpu.h> #include <asm/nops.h> #include <asm/bootparam.h> @@ -389,6 +389,12 @@ default_entry: /* Make changes effective */ wrmsr + /* + * And make sure that all the mappings we set up have NX set from + * the beginning. + */ + orl $(1 << (_PAGE_BIT_NX - 32)), pa(__supported_pte_mask + 4) + enable_paging: /* diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ffdc0e860390..22fbf9df61bb 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -38,7 +38,6 @@ #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET) -L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET) L4_START_KERNEL = pgd_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) @@ -76,9 +75,7 @@ startup_64: subq $_text - __START_KERNEL_map, %rbp /* Is the address not 2M aligned? */ - movq %rbp, %rax - andl $~PMD_PAGE_MASK, %eax - testl %eax, %eax + testl $~PMD_PAGE_MASK, %ebp jnz bad_address /* diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index b8e6ff5cd5d0..a1f0e4a5c47e 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -12,6 +12,7 @@ #include <linux/pm.h> #include <linux/io.h> +#include <asm/cpufeature.h> #include <asm/irqdomain.h> #include <asm/fixmap.h> #include <asm/hpet.h> @@ -716,7 +717,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n, struct hpet_work_struct work; struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu); - switch (action & 0xf) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work); init_completion(&work.complete); diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 37dae792dbbe..589b3193f102 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -96,9 +96,14 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) SYSCALL_DEFINE1(iopl, unsigned int, level) { struct pt_regs *regs = current_pt_regs(); - unsigned int old = (regs->flags >> 12) & 3; struct thread_struct *t = ¤t->thread; + /* + * Careful: the IOPL bits in regs->flags are undefined under Xen PV + * and changing them has no effect. + */ + unsigned int old = t->iopl >> X86_EFLAGS_IOPL_BIT; + if (level > 3) return -EINVAL; /* Trying to gain more privileges? */ @@ -106,8 +111,9 @@ SYSCALL_DEFINE1(iopl, unsigned int, level) if (!capable(CAP_SYS_RAWIO)) return -EPERM; } - regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); - t->iopl = level << 12; + regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | + (level << X86_EFLAGS_IOPL_BIT); + t->iopl = level << X86_EFLAGS_IOPL_BIT; set_iopl_mask(t->iopl); return 0; diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 0f8a6bbaaa44..2af478e3fd4e 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -271,7 +271,7 @@ static int bzImage64_probe(const char *buf, unsigned long len) int ret = -ENOEXEC; struct setup_header *header; - /* kernel should be atleast two sectors long */ + /* kernel should be at least two sectors long */ if (len < 2 * 512) { pr_err("File is too short to be a bzImage\n"); return ret; diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 44256a62702b..2da6ee9ae69b 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -609,9 +609,9 @@ static struct notifier_block kgdb_notifier = { }; /** - * kgdb_arch_init - Perform any architecture specific initalization. + * kgdb_arch_init - Perform any architecture specific initialization. * - * This function will handle the initalization of any architecture + * This function will handle the initialization of any architecture * specific callbacks. */ int kgdb_arch_init(void) @@ -750,9 +750,7 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) { int err; -#ifdef CONFIG_DEBUG_RODATA char opc[BREAK_INSTR_SIZE]; -#endif /* CONFIG_DEBUG_RODATA */ bpt->type = BP_BREAKPOINT; err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, @@ -761,7 +759,6 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) return err; err = probe_kernel_write((char *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); -#ifdef CONFIG_DEBUG_RODATA if (!err) return err; /* @@ -778,13 +775,12 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE)) return -EINVAL; bpt->type = BP_POKE_BREAKPOINT; -#endif /* CONFIG_DEBUG_RODATA */ + return err; } int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) { -#ifdef CONFIG_DEBUG_RODATA int err; char opc[BREAK_INSTR_SIZE]; @@ -801,8 +797,8 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE)) goto knl_write; return err; + knl_write: -#endif /* CONFIG_DEBUG_RODATA */ return probe_kernel_write((char *)bpt->bpt_addr, (char *)bpt->saved_instr, BREAK_INSTR_SIZE); } diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 1deffe6cc873..ae703acb85c1 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -49,6 +49,7 @@ #include <linux/kdebug.h> #include <linux/kallsyms.h> #include <linux/ftrace.h> +#include <linux/frame.h> #include <asm/cacheflush.h> #include <asm/desc.h> @@ -671,39 +672,39 @@ NOKPROBE_SYMBOL(kprobe_int3_handler); * When a retprobed function returns, this code saves registers and * calls trampoline_handler() runs, which calls the kretprobe's handler. */ -static void __used kretprobe_trampoline_holder(void) -{ - asm volatile ( - ".global kretprobe_trampoline\n" - "kretprobe_trampoline: \n" +asm( + ".global kretprobe_trampoline\n" + ".type kretprobe_trampoline, @function\n" + "kretprobe_trampoline:\n" #ifdef CONFIG_X86_64 - /* We don't bother saving the ss register */ - " pushq %rsp\n" - " pushfq\n" - SAVE_REGS_STRING - " movq %rsp, %rdi\n" - " call trampoline_handler\n" - /* Replace saved sp with true return address. */ - " movq %rax, 152(%rsp)\n" - RESTORE_REGS_STRING - " popfq\n" + /* We don't bother saving the ss register */ + " pushq %rsp\n" + " pushfq\n" + SAVE_REGS_STRING + " movq %rsp, %rdi\n" + " call trampoline_handler\n" + /* Replace saved sp with true return address. */ + " movq %rax, 152(%rsp)\n" + RESTORE_REGS_STRING + " popfq\n" #else - " pushf\n" - SAVE_REGS_STRING - " movl %esp, %eax\n" - " call trampoline_handler\n" - /* Move flags to cs */ - " movl 56(%esp), %edx\n" - " movl %edx, 52(%esp)\n" - /* Replace saved flags with true return address. */ - " movl %eax, 56(%esp)\n" - RESTORE_REGS_STRING - " popf\n" + " pushf\n" + SAVE_REGS_STRING + " movl %esp, %eax\n" + " call trampoline_handler\n" + /* Move flags to cs */ + " movl 56(%esp), %edx\n" + " movl %edx, 52(%esp)\n" + /* Replace saved flags with true return address. */ + " movl %eax, 56(%esp)\n" + RESTORE_REGS_STRING + " popf\n" #endif - " ret\n"); -} -NOKPROBE_SYMBOL(kretprobe_trampoline_holder); + " ret\n" + ".size kretprobe_trampoline, .-kretprobe_trampoline\n" +); NOKPROBE_SYMBOL(kretprobe_trampoline); +STACK_FRAME_NON_STANDARD(kretprobe_trampoline); /* * Called from kretprobe_trampoline @@ -988,7 +989,7 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) * In case the user-specified fault handler returned * zero, try to fix up. */ - if (fixup_exception(regs)) + if (fixup_exception(regs, trapnr)) return 1; /* diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 47190bd399e7..807950860fb7 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -36,6 +36,7 @@ #include <linux/kprobes.h> #include <linux/debugfs.h> #include <linux/nmi.h> +#include <linux/swait.h> #include <asm/timer.h> #include <asm/cpu.h> #include <asm/traps.h> @@ -91,14 +92,14 @@ static void kvm_io_delay(void) struct kvm_task_sleep_node { struct hlist_node link; - wait_queue_head_t wq; + struct swait_queue_head wq; u32 token; int cpu; bool halted; }; static struct kvm_task_sleep_head { - spinlock_t lock; + raw_spinlock_t lock; struct hlist_head list; } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE]; @@ -122,17 +123,17 @@ void kvm_async_pf_task_wait(u32 token) u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; struct kvm_task_sleep_node n, *e; - DEFINE_WAIT(wait); + DECLARE_SWAITQUEUE(wait); rcu_irq_enter(); - spin_lock(&b->lock); + raw_spin_lock(&b->lock); e = _find_apf_task(b, token); if (e) { /* dummy entry exist -> wake up was delivered ahead of PF */ hlist_del(&e->link); kfree(e); - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); rcu_irq_exit(); return; @@ -141,13 +142,13 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); n.halted = is_idle_task(current) || preempt_count() > 1; - init_waitqueue_head(&n.wq); + init_swait_queue_head(&n.wq); hlist_add_head(&n.link, &b->list); - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); for (;;) { if (!n.halted) - prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); if (hlist_unhashed(&n.link)) break; @@ -166,7 +167,7 @@ void kvm_async_pf_task_wait(u32 token) } } if (!n.halted) - finish_wait(&n.wq, &wait); + finish_swait(&n.wq, &wait); rcu_irq_exit(); return; @@ -178,8 +179,8 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n) hlist_del_init(&n->link); if (n->halted) smp_send_reschedule(n->cpu); - else if (waitqueue_active(&n->wq)) - wake_up(&n->wq); + else if (swait_active(&n->wq)) + swake_up(&n->wq); } static void apf_task_wake_all(void) @@ -189,14 +190,14 @@ static void apf_task_wake_all(void) for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { struct hlist_node *p, *next; struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; - spin_lock(&b->lock); + raw_spin_lock(&b->lock); hlist_for_each_safe(p, next, &b->list) { struct kvm_task_sleep_node *n = hlist_entry(p, typeof(*n), link); if (n->cpu == smp_processor_id()) apf_task_wake_one(n); } - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); } } @@ -212,7 +213,7 @@ void kvm_async_pf_task_wake(u32 token) } again: - spin_lock(&b->lock); + raw_spin_lock(&b->lock); n = _find_apf_task(b, token); if (!n) { /* @@ -225,17 +226,17 @@ again: * Allocation failed! Busy wait while other cpu * handles async PF. */ - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); cpu_relax(); goto again; } n->token = token; n->cpu = smp_processor_id(); - init_waitqueue_head(&n->wq); + init_swait_queue_head(&n->wq); hlist_add_head(&n->link, &b->list); } else apf_task_wake_one(n); - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); return; } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); @@ -486,7 +487,7 @@ void __init kvm_guest_init(void) paravirt_ops_setup(); register_reboot_notifier(&kvm_pv_reboot_nb); for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) - spin_lock_init(&async_pf_sleepers[i].lock); + raw_spin_lock_init(&async_pf_sleepers[i].lock); if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) x86_init.irqs.trap_init = kvm_apf_trap_init; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 72cef58693c7..1d39bfbd26bb 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -226,7 +226,7 @@ static void kvm_setup_secondary_clock(void) * registered memory location. If the guest happens to shutdown, this memory * won't be valid. In cases like kexec, in which you install a new kernel, this * means a random memory location will be kept being written. So before any - * kind of shutdown from our side, we unregister the clock by writting anything + * kind of shutdown from our side, we unregister the clock by writing anything * that does not have the 'enable' bit set in the msr */ #ifdef CONFIG_KEXEC_CORE diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 6acc9dd91f36..6707039b9032 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -103,7 +103,7 @@ static void free_ldt_struct(struct ldt_struct *ldt) * we do not have to muck with descriptors here, that is * done in switch_mm() as needed. */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) { struct ldt_struct *new_ldt; struct mm_struct *old_mm; @@ -144,7 +144,7 @@ out_unlock: * * 64bit: Don't touch the LDT register - we're already in the next thread. */ -void destroy_context(struct mm_struct *mm) +void destroy_context_ldt(struct mm_struct *mm) { free_ldt_struct(mm->context.ldt); mm->context.ldt = NULL; diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S index 87e1762e2bca..ed48a9f465f8 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/mcount_64.S @@ -168,12 +168,14 @@ GLOBAL(ftrace_call) restore_mcount_regs /* - * The copied trampoline must call ftrace_return as it + * The copied trampoline must call ftrace_epilogue as it * still may need to call the function graph tracer. + * + * The code up to this label is copied into trampolines so + * think twice before adding any new code or changing the + * layout here. */ -GLOBAL(ftrace_caller_end) - -GLOBAL(ftrace_return) +GLOBAL(ftrace_epilogue) #ifdef CONFIG_FUNCTION_GRAPH_TRACER GLOBAL(ftrace_graph_call) @@ -244,14 +246,14 @@ GLOBAL(ftrace_regs_call) popfq /* - * As this jmp to ftrace_return can be a short jump + * As this jmp to ftrace_epilogue can be a short jump * it must not be copied into the trampoline. * The trampoline will add the code to jump * to the return. */ GLOBAL(ftrace_regs_caller_end) - jmp ftrace_return + jmp ftrace_epilogue END(ftrace_regs_caller) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 30ca7607cbbb..97340f2c437c 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -408,7 +408,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) processor.cpuflag = CPU_ENABLED; processor.cpufeature = (boot_cpu_data.x86 << 8) | (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; - processor.featureflag = boot_cpu_data.x86_capability[0]; + processor.featureflag = boot_cpu_data.x86_capability[CPUID_1_EDX]; processor.reserved[0] = 0; processor.reserved[1] = 0; for (i = 0; i < 2; i++) { diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 64f9616f93f1..7f3550acde1b 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -40,7 +40,7 @@ #include <linux/uaccess.h> #include <linux/gfp.h> -#include <asm/processor.h> +#include <asm/cpufeature.h> #include <asm/msr.h> static struct class *msr_class; diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 8a2cdd736fa4..04b132a767f1 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -30,6 +30,7 @@ #include <asm/nmi.h> #include <asm/x86_init.h> #include <asm/reboot.h> +#include <asm/cache.h> #define CREATE_TRACE_POINTS #include <trace/events/nmi.h> @@ -69,7 +70,7 @@ struct nmi_stats { static DEFINE_PER_CPU(struct nmi_stats, nmi_stats); -static int ignore_nmis; +static int ignore_nmis __read_mostly; int unknown_nmi_panic; /* diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 14415aff1813..92f70147a9a6 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -13,11 +13,11 @@ static int found(u64 start, u64 end, void *data) static __init int register_e820_pmem(void) { - char *pmem = "Persistent Memory (legacy)"; struct platform_device *pdev; int rc; - rc = walk_iomem_res(pmem, IORESOURCE_MEM, 0, -1, NULL, found); + rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY, + IORESOURCE_MEM, 0, -1, NULL, found); if (rc <= 0) return 0; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9f7c21c22477..2915d54e9dd5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -57,6 +57,9 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { */ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, #endif +#ifdef CONFIG_X86_32 + .SYSENTER_stack_canary = STACK_END_MAGIC, +#endif }; EXPORT_PER_CPU_SYMBOL(cpu_tss); @@ -418,9 +421,9 @@ static void mwait_idle(void) if (!current_set_polling_and_test()) { trace_cpu_idle_rcuidle(1, smp_processor_id()); if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { - smp_mb(); /* quirk */ + mb(); /* quirk */ clflush((void *)¤t_thread_info()->flags); - smp_mb(); /* quirk */ + mb(); /* quirk */ } __monitor((void *)¤t_thread_info()->flags, 0, 0); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b9d99e0f82c4..6cbab31ac23a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -48,6 +48,7 @@ #include <asm/syscalls.h> #include <asm/debugreg.h> #include <asm/switch_to.h> +#include <asm/xen/hypervisor.h> asmlinkage extern void ret_from_fork(void); @@ -116,6 +117,8 @@ void __show_regs(struct pt_regs *regs, int all) printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); + if (boot_cpu_has(X86_FEATURE_OSPKE)) + printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru()); } void release_thread(struct task_struct *dead_task) @@ -411,6 +414,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); +#ifdef CONFIG_XEN + /* + * On Xen PV, IOPL bits in pt_regs->flags have no effect, and + * current_pt_regs()->flags may not match the current task's + * intended IOPL. We need to switch it manually. + */ + if (unlikely(static_cpu_has(X86_FEATURE_XENPV) && + prev->iopl != next->iopl)) + xen_set_iopl_mask(next->iopl); +#endif + if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) { /* * AMD CPUs have a misfeature: SYSRET sets the SS selector but @@ -476,7 +490,7 @@ void set_personality_ia32(bool x32) if (current->mm) current->mm->context.ia32_compat = TIF_X32; current->personality &= ~READ_IMPLIES_EXEC; - /* is_compat_task() uses the presence of the x32 + /* in_compat_syscall() uses the presence of the x32 syscall bit flag to determine compat status */ current_thread_info()->status &= ~TS_COMPAT; } else { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d3d80e6d42a2..2367ae07eb76 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -112,6 +112,7 @@ #include <asm/alternative.h> #include <asm/prom.h> #include <asm/microcode.h> +#include <asm/mmu_context.h> /* * max_low_pfn_mapped: highest direct mapped pfn under 4GB @@ -152,21 +153,21 @@ static struct resource data_resource = { .name = "Kernel data", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; static struct resource code_resource = { .name = "Kernel code", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; static struct resource bss_resource = { .name = "Kernel bss", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; @@ -1282,3 +1283,11 @@ static int __init register_kernel_offset_dumper(void) return 0; } __initcall(register_kernel_offset_dumper); + +void arch_show_smap(struct seq_file *m, struct vm_area_struct *vma) +{ + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return; + + seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); +} diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index cb6282c3638f..548ddf7d6fd2 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -61,7 +61,38 @@ regs->seg = GET_SEG(seg) | 3; \ } while (0) -int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) +#ifdef CONFIG_X86_64 +/* + * If regs->ss will cause an IRET fault, change it. Otherwise leave it + * alone. Using this generally makes no sense unless + * user_64bit_mode(regs) would return true. + */ +static void force_valid_ss(struct pt_regs *regs) +{ + u32 ar; + asm volatile ("lar %[old_ss], %[ar]\n\t" + "jz 1f\n\t" /* If invalid: */ + "xorl %[ar], %[ar]\n\t" /* set ar = 0 */ + "1:" + : [ar] "=r" (ar) + : [old_ss] "rm" ((u16)regs->ss)); + + /* + * For a valid 64-bit user context, we need DPL 3, type + * read-write data or read-write exp-down data, and S and P + * set. We can't use VERW because VERW doesn't check the + * P bit. + */ + ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK; + if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) && + ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN)) + regs->ss = __USER_DS; +} +#endif + +static int restore_sigcontext(struct pt_regs *regs, + struct sigcontext __user *sc, + unsigned long uc_flags) { unsigned long buf_val; void __user *buf; @@ -94,15 +125,18 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) COPY(r15); #endif /* CONFIG_X86_64 */ -#ifdef CONFIG_X86_32 COPY_SEG_CPL3(cs); COPY_SEG_CPL3(ss); -#else /* !CONFIG_X86_32 */ - /* Kernel saves and restores only the CS segment register on signals, - * which is the bare minimum needed to allow mixed 32/64-bit code. - * App's signal handler can save/restore other segments if needed. */ - COPY_SEG_CPL3(cs); -#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_X86_64 + /* + * Fix up SS if needed for the benefit of old DOSEMU and + * CRIU. + */ + if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && + user_64bit_mode(regs))) + force_valid_ss(regs); +#endif get_user_ex(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); @@ -165,6 +199,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, put_user_ex(regs->cs, &sc->cs); put_user_ex(0, &sc->gs); put_user_ex(0, &sc->fs); + put_user_ex(regs->ss, &sc->ss); #endif /* CONFIG_X86_32 */ put_user_ex(fpstate, &sc->fpstate); @@ -403,6 +438,21 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, return 0; } #else /* !CONFIG_X86_32 */ +static unsigned long frame_uc_flags(struct pt_regs *regs) +{ + unsigned long flags; + + if (cpu_has_xsave) + flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS; + else + flags = UC_SIGCONTEXT_SS; + + if (likely(user_64bit_mode(regs))) + flags |= UC_STRICT_RESTORE_SS; + + return flags; +} + static int __setup_rt_frame(int sig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { @@ -422,10 +472,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, put_user_try { /* Create the ucontext. */ - if (cpu_has_xsave) - put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); - else - put_user_ex(0, &frame->uc.uc_flags); + put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); save_altstack_ex(&frame->uc.uc_stack, regs->sp); @@ -459,10 +506,28 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, regs->sp = (unsigned long)frame; - /* Set up the CS register to run signal handlers in 64-bit mode, - even if the handler happens to be interrupting 32-bit code. */ + /* + * Set up the CS and SS registers to run signal handlers in + * 64-bit mode, even if the handler happens to be interrupting + * 32-bit or 16-bit code. + * + * SS is subtle. In 64-bit mode, we don't need any particular + * SS descriptor, but we do need SS to be valid. It's possible + * that the old SS is entirely bogus -- this can happen if the + * signal we're trying to deliver is #GP or #SS caused by a bad + * SS value. We also have a compatbility issue here: DOSEMU + * relies on the contents of the SS register indicating the + * SS value at the time of the signal, even though that code in + * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU + * avoids relying on sigreturn to restore SS; instead it uses + * a trampoline.) So we do our best: if the old SS was valid, + * we keep it. Otherwise we replace it. + */ regs->cs = __USER_CS; + if (unlikely(regs->ss != __USER_DS)) + force_valid_ss(regs); + return 0; } #endif /* CONFIG_X86_32 */ @@ -489,10 +554,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, put_user_try { /* Create the ucontext. */ - if (cpu_has_xsave) - put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); - else - put_user_ex(0, &frame->uc.uc_flags); + put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); put_user_ex(0, &frame->uc.uc__pad0); @@ -554,7 +616,11 @@ asmlinkage unsigned long sys_sigreturn(void) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->sc)) + /* + * x86_32 has no uc_flags bits relevant to restore_sigcontext. + * Save a few cycles by skipping the __get_user. + */ + if (restore_sigcontext(regs, &frame->sc, 0)) goto badframe; return regs->ax; @@ -570,16 +636,19 @@ asmlinkage long sys_rt_sigreturn(void) struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; sigset_t set; + unsigned long uc_flags; frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; + if (__get_user(uc_flags, &frame->uc.uc_flags)) + goto badframe; set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) @@ -692,12 +761,15 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { -#if defined(CONFIG_X86_32) || !defined(CONFIG_X86_64) +#ifdef CONFIG_X86_64 + if (is_ia32_task()) + return __NR_ia32_restart_syscall; +#endif +#ifdef CONFIG_X86_X32_ABI + return __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); +#else return __NR_restart_syscall; -#else /* !CONFIG_X86_32 && CONFIG_X86_64 */ - return test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : - __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); -#endif /* CONFIG_X86_32 || !CONFIG_X86_64 */ +#endif } /* @@ -763,6 +835,7 @@ asmlinkage long sys32_x32_rt_sigreturn(void) struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_x32 __user *frame; sigset_t set; + unsigned long uc_flags; frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); @@ -770,10 +843,12 @@ asmlinkage long sys32_x32_rt_sigreturn(void) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; + if (__get_user(uc_flags, &frame->uc.uc_flags)) + goto badframe; set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 24d57f77b3c1..b2c99f811c3f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -97,6 +97,14 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); +/* Logical package management. We might want to allocate that dynamically */ +static int *physical_to_logical_pkg __read_mostly; +static unsigned long *physical_package_map __read_mostly;; +static unsigned long *logical_package_map __read_mostly; +static unsigned int max_physical_pkg_id __read_mostly; +unsigned int __max_logical_packages __read_mostly; +EXPORT_SYMBOL(__max_logical_packages); + static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) { unsigned long flags; @@ -248,7 +256,111 @@ static void notrace start_secondary(void *unused) x86_cpuinit.setup_percpu_clockev(); wmb(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); +} + +int topology_update_package_map(unsigned int apicid, unsigned int cpu) +{ + unsigned int new, pkg = apicid >> boot_cpu_data.x86_coreid_bits; + + /* Called from early boot ? */ + if (!physical_package_map) + return 0; + + if (pkg >= max_physical_pkg_id) + return -EINVAL; + + /* Set the logical package id */ + if (test_and_set_bit(pkg, physical_package_map)) + goto found; + + new = find_first_zero_bit(logical_package_map, __max_logical_packages); + if (new >= __max_logical_packages) { + physical_to_logical_pkg[pkg] = -1; + pr_warn("APIC(%x) Package %u exceeds logical package map\n", + apicid, pkg); + return -ENOSPC; + } + set_bit(new, logical_package_map); + pr_info("APIC(%x) Converting physical %u to logical package %u\n", + apicid, pkg, new); + physical_to_logical_pkg[pkg] = new; + +found: + cpu_data(cpu).logical_proc_id = physical_to_logical_pkg[pkg]; + return 0; +} + +/** + * topology_phys_to_logical_pkg - Map a physical package id to a logical + * + * Returns logical package id or -1 if not found + */ +int topology_phys_to_logical_pkg(unsigned int phys_pkg) +{ + if (phys_pkg >= max_physical_pkg_id) + return -1; + return physical_to_logical_pkg[phys_pkg]; +} +EXPORT_SYMBOL(topology_phys_to_logical_pkg); + +static void __init smp_init_package_map(void) +{ + unsigned int ncpus, cpu; + size_t size; + + /* + * Today neither Intel nor AMD support heterogenous systems. That + * might change in the future.... + * + * While ideally we'd want '* smp_num_siblings' in the below @ncpus + * computation, this won't actually work since some Intel BIOSes + * report inconsistent HT data when they disable HT. + * + * In particular, they reduce the APIC-IDs to only include the cores, + * but leave the CPUID topology to say there are (2) siblings. + * This means we don't know how many threads there will be until + * after the APIC enumeration. + * + * By not including this we'll sometimes over-estimate the number of + * logical packages by the amount of !present siblings, but this is + * still better than MAX_LOCAL_APIC. + * + * We use total_cpus not nr_cpu_ids because nr_cpu_ids can be limited + * on the command line leading to a similar issue as the HT disable + * problem because the hyperthreads are usually enumerated after the + * primary cores. + */ + ncpus = boot_cpu_data.x86_max_cores; + __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus); + + /* + * Possibly larger than what we need as the number of apic ids per + * package can be smaller than the actual used apic ids. + */ + max_physical_pkg_id = DIV_ROUND_UP(MAX_LOCAL_APIC, ncpus); + size = max_physical_pkg_id * sizeof(unsigned int); + physical_to_logical_pkg = kmalloc(size, GFP_KERNEL); + memset(physical_to_logical_pkg, 0xff, size); + size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long); + physical_package_map = kzalloc(size, GFP_KERNEL); + size = BITS_TO_LONGS(__max_logical_packages) * sizeof(unsigned long); + logical_package_map = kzalloc(size, GFP_KERNEL); + + pr_info("Max logical packages: %u\n", __max_logical_packages); + + for_each_present_cpu(cpu) { + unsigned int apicid = apic->cpu_present_to_apicid(cpu); + + if (apicid == BAD_APICID || !apic->apic_id_valid(apicid)) + continue; + if (!topology_update_package_map(apicid, cpu)) + continue; + pr_warn("CPU %u APICId %x disabled\n", cpu, apicid); + per_cpu(x86_bios_cpu_apicid, cpu) = BAD_APICID; + set_cpu_possible(cpu, false); + set_cpu_present(cpu, false); + } } void __init smp_store_boot_cpu_info(void) @@ -258,6 +370,7 @@ void __init smp_store_boot_cpu_info(void) *c = boot_cpu_data; c->cpu_index = id; + smp_init_package_map(); } /* diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index fdd0c6430e5a..9ee98eefc44d 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -14,30 +14,34 @@ static int save_stack_stack(void *data, char *name) return 0; } -static void +static int __save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched) { struct stack_trace *trace = data; #ifdef CONFIG_FRAME_POINTER if (!reliable) - return; + return 0; #endif if (nosched && in_sched_functions(addr)) - return; + return 0; if (trace->skip > 0) { trace->skip--; - return; + return 0; } - if (trace->nr_entries < trace->max_entries) + if (trace->nr_entries < trace->max_entries) { trace->entries[trace->nr_entries++] = addr; + return 0; + } else { + return -1; /* no more room, stop walking the stack */ + } } -static void save_stack_address(void *data, unsigned long addr, int reliable) +static int save_stack_address(void *data, unsigned long addr, int reliable) { return __save_stack_address(data, addr, reliable, false); } -static void +static int save_stack_address_nosched(void *data, unsigned long addr, int reliable) { return __save_stack_address(data, addr, reliable, true); diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 91a4496db434..e72a07f20b05 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -135,7 +135,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, pmd = pmd_alloc(&tboot_mm, pud, vaddr); if (!pmd) return -1; - pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr); + pte = pte_alloc_map(&tboot_mm, pmd, vaddr); if (!pte) return -1; set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c index 3f92ce07e525..27538f183c3b 100644 --- a/arch/x86/kernel/test_nx.c +++ b/arch/x86/kernel/test_nx.c @@ -142,7 +142,6 @@ static int test_NX(void) * by the error message */ -#ifdef CONFIG_DEBUG_RODATA /* Test 3: Check if the .rodata section is executable */ if (rodata_test_data != 0xC3) { printk(KERN_ERR "test_nx: .rodata marker has invalid value\n"); @@ -151,7 +150,6 @@ static int test_NX(void) printk(KERN_ERR "test_nx: .rodata section is executable\n"); ret = -ENODEV; } -#endif #if 0 /* Test 4: Check if the .data section of a module is executable */ diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c index 5ecbfe5099da..cb4a01b41e27 100644 --- a/arch/x86/kernel/test_rodata.c +++ b/arch/x86/kernel/test_rodata.c @@ -76,5 +76,5 @@ int rodata_test(void) } MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure"); +MODULE_DESCRIPTION("Testcase for marking rodata as read-only"); MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ade185a46b1d..06cbe25861f1 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -83,30 +83,16 @@ gate_desc idt_table[NR_VECTORS] __page_aligned_bss; DECLARE_BITMAP(used_vectors, NR_VECTORS); EXPORT_SYMBOL_GPL(used_vectors); -static inline void conditional_sti(struct pt_regs *regs) +static inline void cond_local_irq_enable(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } -static inline void preempt_conditional_sti(struct pt_regs *regs) -{ - preempt_count_inc(); - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void conditional_cli(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_disable(); -} - -static inline void preempt_conditional_cli(struct pt_regs *regs) +static inline void cond_local_irq_disable(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); - preempt_count_dec(); } void ist_enter(struct pt_regs *regs) @@ -199,7 +185,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, } if (!user_mode(regs)) { - if (!fixup_exception(regs)) { + if (!fixup_exception(regs, trapnr)) { tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; die(str, regs, error_code); @@ -262,7 +248,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; -#ifdef CONFIG_X86_64 if (show_unhandled_signals && unhandled_signal(tsk, signr) && printk_ratelimit()) { pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", @@ -271,7 +256,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, print_vma_addr(" in ", regs->ip); pr_cont("\n"); } -#endif force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk); } @@ -286,7 +270,7 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { - conditional_sti(regs); + cond_local_irq_enable(regs); do_trap(trapnr, signr, str, regs, error_code, fill_trap_info(regs, signr, trapnr, &info)); } @@ -368,7 +352,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) if (notify_die(DIE_TRAP, "bounds", regs, error_code, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) return; - conditional_sti(regs); + cond_local_irq_enable(regs); if (!user_mode(regs)) die("bounds", regs, error_code); @@ -443,7 +427,7 @@ do_general_protection(struct pt_regs *regs, long error_code) struct task_struct *tsk; RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - conditional_sti(regs); + cond_local_irq_enable(regs); if (v8086_mode(regs)) { local_irq_enable(); @@ -453,7 +437,7 @@ do_general_protection(struct pt_regs *regs, long error_code) tsk = current; if (!user_mode(regs)) { - if (fixup_exception(regs)) + if (fixup_exception(regs, X86_TRAP_GP)) return; tsk->thread.error_code = error_code; @@ -517,9 +501,11 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) * as we may switch to the interrupt stack. */ debug_stack_usage_inc(); - preempt_conditional_sti(regs); + preempt_disable(); + cond_local_irq_enable(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); - preempt_conditional_cli(regs); + cond_local_irq_disable(regs); + preempt_enable_no_resched(); debug_stack_usage_dec(); exit: ist_exit(regs); @@ -571,6 +557,29 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) NOKPROBE_SYMBOL(fixup_bad_iret); #endif +static bool is_sysenter_singlestep(struct pt_regs *regs) +{ + /* + * We don't try for precision here. If we're anywhere in the region of + * code that can be single-stepped in the SYSENTER entry path, then + * assume that this is a useless single-step trap due to SYSENTER + * being invoked with TF set. (We don't know in advance exactly + * which instructions will be hit because BTF could plausibly + * be set.) + */ +#ifdef CONFIG_X86_32 + return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) < + (unsigned long)__end_SYSENTER_singlestep_region - + (unsigned long)__begin_SYSENTER_singlestep_region; +#elif defined(CONFIG_IA32_EMULATION) + return (regs->ip - (unsigned long)entry_SYSENTER_compat) < + (unsigned long)__end_entry_SYSENTER_compat - + (unsigned long)entry_SYSENTER_compat; +#else + return false; +#endif +} + /* * Our handling of the processor debug registers is non-trivial. * We do not clear them on entry and exit from the kernel. Therefore @@ -605,11 +614,42 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) ist_enter(regs); get_debugreg(dr6, 6); + /* + * The Intel SDM says: + * + * Certain debug exceptions may clear bits 0-3. The remaining + * contents of the DR6 register are never cleared by the + * processor. To avoid confusion in identifying debug + * exceptions, debug handlers should clear the register before + * returning to the interrupted task. + * + * Keep it simple: clear DR6 immediately. + */ + set_debugreg(0, 6); /* Filter out all the reserved bits which are preset to 1 */ dr6 &= ~DR6_RESERVED; /* + * The SDM says "The processor clears the BTF flag when it + * generates a debug exception." Clear TIF_BLOCKSTEP to keep + * TIF_BLOCKSTEP in sync with the hardware BTF flag. + */ + clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); + + if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) && + is_sysenter_singlestep(regs))) { + dr6 &= ~DR_STEP; + if (!dr6) + goto exit; + /* + * else we might have gotten a single-step trap and hit a + * watchpoint at the same time, in which case we should fall + * through and handle the watchpoint. + */ + } + + /* * If dr6 has no reason to give us about the origin of this trap, * then it's very likely the result of an icebp/int01 trap. * User wants a sigtrap for that. @@ -617,18 +657,10 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) if (!dr6 && user_mode(regs)) user_icebp = 1; - /* Catch kmemcheck conditions first of all! */ + /* Catch kmemcheck conditions! */ if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) goto exit; - /* DR6 may or may not be cleared by the CPU */ - set_debugreg(0, 6); - - /* - * The processor cleared BTF, so don't mark that we need it set. - */ - clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); - /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; @@ -648,24 +680,25 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_inc(); /* It's safe to allow irq's after DR6 has been saved */ - preempt_conditional_sti(regs); + preempt_disable(); + cond_local_irq_enable(regs); if (v8086_mode(regs)) { handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, X86_TRAP_DB); - preempt_conditional_cli(regs); + cond_local_irq_disable(regs); + preempt_enable_no_resched(); debug_stack_usage_dec(); goto exit; } - /* - * Single-stepping through system calls: ignore any exceptions in - * kernel space, but re-enable TF when returning to user mode. - * - * We already checked v86 mode above, so we can check for kernel mode - * by just checking the CPL of CS. - */ - if ((dr6 & DR_STEP) && !user_mode(regs)) { + if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) { + /* + * Historical junk that used to handle SYSENTER single-stepping. + * This should be unreachable now. If we survive for a while + * without anyone hitting this warning, we'll turn this into + * an oops. + */ tsk->thread.debugreg6 &= ~DR_STEP; set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; @@ -673,10 +706,19 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) si_code = get_si_code(tsk->thread.debugreg6); if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(tsk, regs, error_code, si_code); - preempt_conditional_cli(regs); + cond_local_irq_disable(regs); + preempt_enable_no_resched(); debug_stack_usage_dec(); exit: +#if defined(CONFIG_X86_32) + /* + * This is the most likely code path that involves non-trivial use + * of the SYSENTER stack. Check that we haven't overrun it. + */ + WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC, + "Overran or corrupted SYSENTER stack\n"); +#endif ist_exit(regs); } NOKPROBE_SYMBOL(do_debug); @@ -696,10 +738,10 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP) return; - conditional_sti(regs); + cond_local_irq_enable(regs); if (!user_mode(regs)) { - if (!fixup_exception(regs)) { + if (!fixup_exception(regs, trapnr)) { task->thread.error_code = error_code; task->thread.trap_nr = trapnr; die(str, regs, error_code); @@ -743,20 +785,19 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code) dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { - conditional_sti(regs); + cond_local_irq_enable(regs); } dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - BUG_ON(use_eager_fpu()); #ifdef CONFIG_MATH_EMULATION - if (read_cr0() & X86_CR0_EM) { + if (!boot_cpu_has(X86_FEATURE_FPU) && (read_cr0() & X86_CR0_EM)) { struct math_emu_info info = { }; - conditional_sti(regs); + cond_local_irq_enable(regs); info.regs = regs; math_emulate(&info); @@ -765,7 +806,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) #endif fpu__restore(¤t->thread.fpu); /* interrupts still off */ #ifdef CONFIG_X86_32 - conditional_sti(regs); + cond_local_irq_enable(regs); #endif } NOKPROBE_SYMBOL(do_device_not_available); @@ -868,7 +909,7 @@ void __init trap_init(void) #endif #ifdef CONFIG_X86_32 - set_system_trap_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); + set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3d743da828d3..c9c4c7ce3eb2 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -43,6 +43,11 @@ static DEFINE_STATIC_KEY_FALSE(__use_tsc); int tsc_clocksource_reliable; +static u32 art_to_tsc_numerator; +static u32 art_to_tsc_denominator; +static u64 art_to_tsc_offset; +struct clocksource *art_related_clocksource; + /* * Use a ring-buffer like data structure, where a writer advances the head by * writing a new data entry and a reader advances the tail when it observes a @@ -876,7 +881,7 @@ void tsc_restore_sched_clock_state(void) local_irq_save(flags); /* - * We're comming out of suspend, there's no concurrency yet; don't + * We're coming out of suspend, there's no concurrency yet; don't * bother being nice about the RCU stuff, just write to both * data fields. */ @@ -964,6 +969,37 @@ core_initcall(cpufreq_tsc); #endif /* CONFIG_CPU_FREQ */ +#define ART_CPUID_LEAF (0x15) +#define ART_MIN_DENOMINATOR (1) + + +/* + * If ART is present detect the numerator:denominator to convert to TSC + */ +static void detect_art(void) +{ + unsigned int unused[2]; + + if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) + return; + + cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, + &art_to_tsc_numerator, unused, unused+1); + + /* Don't enable ART in a VM, non-stop TSC required */ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || + !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || + art_to_tsc_denominator < ART_MIN_DENOMINATOR) + return; + + if (rdmsrl_safe(MSR_IA32_TSC_ADJUST, &art_to_tsc_offset)) + return; + + /* Make this sticky over multiple CPU init calls */ + setup_force_cpu_cap(X86_FEATURE_ART); +} + + /* clocksource code */ static struct clocksource clocksource_tsc; @@ -1071,6 +1107,25 @@ int unsynchronized_tsc(void) return 0; } +/* + * Convert ART to TSC given numerator/denominator found in detect_art() + */ +struct system_counterval_t convert_art_to_tsc(cycle_t art) +{ + u64 tmp, res, rem; + + rem = do_div(art, art_to_tsc_denominator); + + res = art * art_to_tsc_numerator; + tmp = rem * art_to_tsc_numerator; + + do_div(tmp, art_to_tsc_denominator); + res += tmp + art_to_tsc_offset; + + return (struct system_counterval_t) {.cs = art_related_clocksource, + .cycles = res}; +} +EXPORT_SYMBOL(convert_art_to_tsc); static void tsc_refine_calibration_work(struct work_struct *work); static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); @@ -1142,6 +1197,8 @@ static void tsc_refine_calibration_work(struct work_struct *work) (unsigned long)tsc_khz % 1000); out: + if (boot_cpu_has(X86_FEATURE_ART)) + art_related_clocksource = &clocksource_tsc; clocksource_register_khz(&clocksource_tsc, tsc_khz); } @@ -1235,6 +1292,8 @@ void __init tsc_init(void) mark_tsc_unstable("TSCs unsynchronized"); check_system_tsc_reliable(); + + detect_art(); } #ifdef CONFIG_SMP @@ -1246,14 +1305,18 @@ void __init tsc_init(void) */ unsigned long calibrate_delay_is_known(void) { - int i, cpu = smp_processor_id(); + int sibling, cpu = smp_processor_id(); + struct cpumask *mask = topology_core_cpumask(cpu); if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC)) return 0; - for_each_online_cpu(i) - if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id) - return cpu_data(i).loops_per_jiffy; + if (!mask) + return 0; + + sibling = cpumask_any_but(mask, cpu); + if (sibling < nr_cpu_ids) + return cpu_data(sibling).loops_per_jiffy; return 0; } #endif diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 07efb35ee4bc..014ea59aa153 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -30,7 +30,7 @@ * appropriately. Either display a message or halt. */ -#include <asm/cpufeature.h> +#include <asm/cpufeatures.h> #include <asm/msr-index.h> verify_cpu: diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index e574b8546518..3dce1ca0a653 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -362,7 +362,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) /* make room for real-mode segments */ tsk->thread.sp0 += 16; - if (static_cpu_has_safe(X86_FEATURE_SEP)) + if (static_cpu_has(X86_FEATURE_SEP)) tsk->thread.sysenter_cs = 0; load_sp0(tss, &tsk->thread); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 74e4bf11f562..4c941f88d405 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -41,29 +41,28 @@ ENTRY(phys_startup_64) jiffies_64 = jiffies; #endif -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) /* - * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA - * we retain large page mappings for boundaries spanning kernel text, rodata - * and data sections. + * On 64-bit, align RODATA to 2MB so we retain large page mappings for + * boundaries spanning kernel text, rodata and data sections. * * However, kernel identity mappings will have different RWX permissions * to the pages mapping to text and to the pages padding (which are freed) the * text section. Hence kernel identity mappings will be broken to smaller * pages. For 64-bit, kernel text and kernel identity mappings are different, - * so we can enable protection checks that come with CONFIG_DEBUG_RODATA, - * as well as retain 2MB large page mappings for kernel text. + * so we can enable protection checks as well as retain 2MB large page + * mappings for kernel text. */ -#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); +#define X64_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); -#define X64_ALIGN_DEBUG_RODATA_END \ +#define X64_ALIGN_RODATA_END \ . = ALIGN(HPAGE_SIZE); \ __end_rodata_hpage_align = .; #else -#define X64_ALIGN_DEBUG_RODATA_BEGIN -#define X64_ALIGN_DEBUG_RODATA_END +#define X64_ALIGN_RODATA_BEGIN +#define X64_ALIGN_RODATA_END #endif @@ -82,11 +81,11 @@ PHDRS { SECTIONS { #ifdef CONFIG_X86_32 - . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; - phys_startup_32 = startup_32 - LOAD_OFFSET; + . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; + phys_startup_32 = ABSOLUTE(startup_32 - LOAD_OFFSET); #else - . = __START_KERNEL; - phys_startup_64 = startup_64 - LOAD_OFFSET; + . = __START_KERNEL; + phys_startup_64 = ABSOLUTE(startup_64 - LOAD_OFFSET); #endif /* Text and read-only data */ @@ -102,6 +101,7 @@ SECTIONS KPROBES_TEXT ENTRY_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.fixup) *(.gnu.warning) /* End of text section */ @@ -112,13 +112,11 @@ SECTIONS EXCEPTION_TABLE(16) :text = 0x9090 -#if defined(CONFIG_DEBUG_RODATA) /* .text should occupy whole number of pages */ . = ALIGN(PAGE_SIZE); -#endif - X64_ALIGN_DEBUG_RODATA_BEGIN + X64_ALIGN_RODATA_BEGIN RO_DATA(PAGE_SIZE) - X64_ALIGN_DEBUG_RODATA_END + X64_ALIGN_RODATA_END /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { @@ -195,6 +193,17 @@ SECTIONS :init #endif + /* + * Section for code used exclusively before alternatives are run. All + * references to such code must be patched out by alternatives, normally + * by using X86_FEATURE_ALWAYS CPU feature bit. + * + * See static_cpu_has() for an example. + */ + .altinstr_aux : AT(ADDR(.altinstr_aux) - LOAD_OFFSET) { + *(.altinstr_aux) + } + INIT_DATA_SECTION(16) .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { @@ -325,6 +334,7 @@ SECTIONS __brk_limit = .; } + . = ALIGN(PAGE_SIZE); _end = .; STABS_DEBUG @@ -332,7 +342,10 @@ SECTIONS /* Sections to be discarded */ DISCARDS - /DISCARD/ : { *(.eh_frame) } + /DISCARD/ : { + *(.eh_frame) + *(__func_stack_frame_non_standard) + } } diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index a0695be19864..cd05942bc918 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -37,6 +37,8 @@ EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(_copy_to_user); +EXPORT_SYMBOL_GPL(memcpy_mcsafe); + EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); |