From 707e59ba494372a90d245f18b0c78982caa88e48 Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Fri, 22 Apr 2016 13:05:31 +0100 Subject: xen/qspinlock: Don't kick CPU if IRQ is not initialized The following commit: 1fb3a8b2cfb2 ("xen/spinlock: Fix locking path engaging too soon under PVHVM.") ... moved the initalization of the kicker interrupt until after native_cpu_up() is called. However, when using qspinlocks, a CPU may try to kick another CPU that is spinning (because it has not yet initialized its kicker interrupt), resulting in the following crash during boot: kernel BUG at /build/linux-Ay7j_C/linux-4.4.0/drivers/xen/events/events_base.c:1210! invalid opcode: 0000 [#1] SMP ... RIP: 0010:[] [] xen_send_IPI_one+0x59/0x60 ... Call Trace: [] xen_qlock_kick+0xe/0x10 [] __pv_queued_spin_unlock+0xb2/0xf0 [] ? __raw_callee_save___pv_queued_spin_unlock+0x11/0x20 [] ? check_tsc_warp+0x76/0x150 [] check_tsc_sync_source+0x96/0x160 [] native_cpu_up+0x3d8/0x9f0 [] xen_hvm_cpu_up+0x35/0x80 [] _cpu_up+0x13c/0x180 [] cpu_up+0x7a/0xa0 [] smp_init+0x7f/0x81 [] kernel_init_freeable+0xef/0x212 [] ? rest_init+0x80/0x80 [] kernel_init+0xe/0xe0 [] ret_from_fork+0x3f/0x70 [] ? rest_init+0x80/0x80 To fix this, only send the kick if the target CPU's interrupt has been initialized. This check isn't racy, because the target is waiting for the spinlock, so it won't have initialized the interrupt in the meantime. Signed-off-by: Ross Lagerwall Reviewed-by: Boris Ostrovsky Cc: David Vrabel Cc: Juergen Gross Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: xen-devel@lists.xenproject.org Signed-off-by: Ingo Molnar --- arch/x86/xen/spinlock.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 9e2ba5c6e1dd..f42e78de1e10 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -27,6 +27,12 @@ static bool xen_pvspin = true; static void xen_qlock_kick(int cpu) { + int irq = per_cpu(lock_kicker_irq, cpu); + + /* Don't kick if the target's kicker interrupt is not initialized. */ + if (irq == -1) + return; + xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); } -- cgit v1.2.3 From b89c173788c3a8ed571652c203bf59a0e9d700aa Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 15 Apr 2016 13:25:33 -0700 Subject: perf/x86/intel: Add model number for Skylake Server to perf Everything the same as base Skylake, just a new model number. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1460751933-2264-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 68fa55b4d42e..aff79884e17d 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3639,6 +3639,7 @@ __init int intel_pmu_init(void) case 78: /* 14nm Skylake Mobile */ case 94: /* 14nm Skylake Desktop */ + case 85: /* 14nm Skylake Server */ x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); -- cgit v1.2.3 From e1089602a3bf3efd13d0ffc575f3e22213f009da Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sun, 17 Apr 2016 08:43:29 -0700 Subject: perf/x86/intel/rapl: Add missing Haswell model Added one missing Haswell model. Signed-off-by: Srinivas Pandruvada Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: bp@alien8.de Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1460907809-11897-1-git-send-email-srinivas.pandruvada@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/rapl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 70c93f9b03ac..1705c9d75e44 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -718,6 +718,7 @@ static int __init rapl_pmu_init(void) break; case 60: /* Haswell */ case 69: /* Haswell-Celeron */ + case 70: /* Haswell GT3e */ case 61: /* Broadwell */ case 71: /* Broadwell-H */ rapl_cntr_mask = RAPL_IDX_HSW; -- cgit v1.2.3 From e16d8a6cbb499c5c8bfe9330d3351b649bded4af Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 26 Apr 2016 08:52:44 -0700 Subject: Revert "x86/mm/32: Set NX in __supported_pte_mask before enabling paging" This reverts commit 320d25b6a05f8b73c23fc21025d2906ecdd2d4fc. This change was problematic for a couple of reasons: 1. It missed a some entry points (Xen things and 64-bit native). 2. The entry it changed can be executed more than once. This isn't really a problem, but it conflated per-cpu state setup and global state setup. 3. It broke 64-bit non-NX. 64-bit non-NX worked the other way around from 32-bit -- __supported_pte_mask had NX set initially and was *cleared* in x86_configure_nx. With the patch applied, it never got cleared. Reported-and-tested-by: Meelis Roos Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/59bd15f7f4b56b633a611b7f70876c6d2ad01a98.1461685884.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 6 ------ arch/x86/mm/setup_nx.c | 5 +++-- 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 54cdbd2003fe..af1112980dd4 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -389,12 +389,6 @@ default_entry: /* Make changes effective */ wrmsr - /* - * And make sure that all the mappings we set up have NX set from - * the beginning. - */ - orl $(1 << (_PAGE_BIT_NX - 32)), pa(__supported_pte_mask + 4) - enable_paging: /* diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 8bea84724a7d..f65a33f505b6 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -32,8 +32,9 @@ early_param("noexec", noexec_setup); void x86_configure_nx(void) { - /* If disable_nx is set, clear NX on all new mappings going forward. */ - if (disable_nx) + if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx) + __supported_pte_mask |= _PAGE_NX; + else __supported_pte_mask &= ~_PAGE_NX; } -- cgit v1.2.3 From 1bdb8970392a68489b469c3a330a1adb5ef61beb Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 27 Apr 2016 14:22:32 -0600 Subject: x86/apic: Handle zero vector gracefully in clear_vector_irq() If x86_vector_alloc_irq() fails x86_vector_free_irqs() is invoked to cleanup the already allocated vectors. This subsequently calls clear_vector_irq(). The failed irq has no vector assigned, which triggers the BUG_ON(!vector) in clear_vector_irq(). We cannot suppress the call to x86_vector_free_irqs() for the failed interrupt, because the other data related to this irq must be cleaned up as well. So calling clear_vector_irq() with vector == 0 is legitimate. Remove the BUG_ON and return if vector is zero, [ tglx: Massaged changelog ] Fixes: b5dc8e6c21e7 "x86/irq: Use hierarchical irqdomain to manage CPU interrupt vectors" Signed-off-by: Keith Busch Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/vector.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index ad59d70bcb1a..ef495511f019 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -256,7 +256,8 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data) struct irq_desc *desc; int cpu, vector; - BUG_ON(!data->cfg.vector); + if (!data->cfg.vector) + return; vector = data->cfg.vector; for_each_cpu_and(cpu, data->domain, cpu_online_mask) -- cgit v1.2.3 From 0a25556f84d5f79e68e9502bb1f32a43377ab2bf Mon Sep 17 00:00:00 2001 From: Adam Borowski Date: Wed, 27 Apr 2016 11:35:31 +0200 Subject: perf/x86/amd: Set the size of event map array to PERF_COUNT_HW_MAX The entry for PERF_COUNT_HW_REF_CPU_CYCLES is not used on AMD, but is referenced by filter_events() which expects undefined events to have a value of 0. Found via KASAN: UBSAN: Undefined behaviour in arch/x86/events/amd/core.c:132:30 index 9 is out of range for type 'u64 [9]' UBSAN: Undefined behaviour in arch/x86/events/amd/core.c:132:9 load of address ffffffff81c021c8 with insufficient space for an object of type 'const u64' Signed-off-by: Adam Borowski Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Jiri Olsa Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1461749731-30979-1-git-send-email-kilobyte@angband.pl Signed-off-by: Ingo Molnar --- arch/x86/events/amd/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 86a9bec18dab..bd3e8421b57c 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -115,7 +115,7 @@ static __initconst const u64 amd_hw_cache_event_ids /* * AMD Performance Monitor K7 and later. */ -static const u64 amd_perfmon_event_map[] = +static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] = { [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, -- cgit v1.2.3 From 1c5ac21a0e9bab7fc45d0ba9e11623e9ad99d02e Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 29 Mar 2016 17:43:10 +0300 Subject: perf/x86/intel/pt: Don't die on VMXON Some versions of Intel PT do not support tracing across VMXON, more specifically, VMXON will clear TraceEn control bit and any attempt to set it before VMXOFF will throw a #GP, which in the current state of things will crash the kernel. Namely: $ perf record -e intel_pt// kvm -nographic on such a machine will kill it. To avoid this, notify the intel_pt driver before VMXON and after VMXOFF so that it knows when not to enable itself. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Gleb Natapov Cc: Jiri Olsa Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/87oa9dwrfk.fsf@ashishki-desk.ger.corp.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/pt.c | 75 +++++++++++++++++++++++++++++++++------ arch/x86/events/intel/pt.h | 3 ++ arch/x86/include/asm/perf_event.h | 4 +++ arch/x86/kvm/vmx.c | 4 +++ 4 files changed, 75 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 6af7cf71d6b2..09a77dbc73c9 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -136,9 +136,21 @@ static int __init pt_pmu_hw_init(void) struct dev_ext_attribute *de_attrs; struct attribute **attrs; size_t size; + u64 reg; int ret; long i; + if (boot_cpu_has(X86_FEATURE_VMX)) { + /* + * Intel SDM, 36.5 "Tracing post-VMXON" says that + * "IA32_VMX_MISC[bit 14]" being 1 means PT can trace + * post-VMXON. + */ + rdmsrl(MSR_IA32_VMX_MISC, reg); + if (reg & BIT(14)) + pt_pmu.vmx = true; + } + attrs = NULL; for (i = 0; i < PT_CPUID_LEAVES; i++) { @@ -269,20 +281,23 @@ static void pt_config(struct perf_event *event) reg |= (event->attr.config & PT_CONFIG_MASK); + event->hw.config = reg; wrmsrl(MSR_IA32_RTIT_CTL, reg); } -static void pt_config_start(bool start) +static void pt_config_stop(struct perf_event *event) { - u64 ctl; + u64 ctl = READ_ONCE(event->hw.config); + + /* may be already stopped by a PMI */ + if (!(ctl & RTIT_CTL_TRACEEN)) + return; - rdmsrl(MSR_IA32_RTIT_CTL, ctl); - if (start) - ctl |= RTIT_CTL_TRACEEN; - else - ctl &= ~RTIT_CTL_TRACEEN; + ctl &= ~RTIT_CTL_TRACEEN; wrmsrl(MSR_IA32_RTIT_CTL, ctl); + WRITE_ONCE(event->hw.config, ctl); + /* * A wrmsr that disables trace generation serializes other PT * registers and causes all data packets to be written to memory, @@ -291,8 +306,7 @@ static void pt_config_start(bool start) * The below WMB, separating data store and aux_head store matches * the consumer's RMB that separates aux_head load and data load. */ - if (!start) - wmb(); + wmb(); } static void pt_config_buffer(void *buf, unsigned int topa_idx, @@ -942,11 +956,17 @@ void intel_pt_interrupt(void) if (!ACCESS_ONCE(pt->handle_nmi)) return; - pt_config_start(false); + /* + * If VMX is on and PT does not support it, don't touch anything. + */ + if (READ_ONCE(pt->vmx_on)) + return; if (!event) return; + pt_config_stop(event); + buf = perf_get_aux(&pt->handle); if (!buf) return; @@ -983,6 +1003,35 @@ void intel_pt_interrupt(void) } } +void intel_pt_handle_vmx(int on) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + struct perf_event *event; + unsigned long flags; + + /* PT plays nice with VMX, do nothing */ + if (pt_pmu.vmx) + return; + + /* + * VMXON will clear RTIT_CTL.TraceEn; we need to make + * sure to not try to set it while VMX is on. Disable + * interrupts to avoid racing with pmu callbacks; + * concurrent PMI should be handled fine. + */ + local_irq_save(flags); + WRITE_ONCE(pt->vmx_on, on); + + if (on) { + /* prevent pt_config_stop() from writing RTIT_CTL */ + event = pt->handle.event; + if (event) + event->hw.config = 0; + } + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(intel_pt_handle_vmx); + /* * PMU callbacks */ @@ -992,6 +1041,9 @@ static void pt_event_start(struct perf_event *event, int mode) struct pt *pt = this_cpu_ptr(&pt_ctx); struct pt_buffer *buf = perf_get_aux(&pt->handle); + if (READ_ONCE(pt->vmx_on)) + return; + if (!buf || pt_buffer_is_full(buf, pt)) { event->hw.state = PERF_HES_STOPPED; return; @@ -1014,7 +1066,8 @@ static void pt_event_stop(struct perf_event *event, int mode) * see comment in intel_pt_interrupt(). */ ACCESS_ONCE(pt->handle_nmi) = 0; - pt_config_start(false); + + pt_config_stop(event); if (event->hw.state == PERF_HES_STOPPED) return; diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index 336878a5d205..3abb5f5cccc8 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -65,6 +65,7 @@ enum pt_capabilities { struct pt_pmu { struct pmu pmu; u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; + bool vmx; }; /** @@ -107,10 +108,12 @@ struct pt_buffer { * struct pt - per-cpu pt context * @handle: perf output handle * @handle_nmi: do handle PT PMI on this cpu, there's an active event + * @vmx_on: 1 if VMX is ON on this cpu */ struct pt { struct perf_output_handle handle; int handle_nmi; + int vmx_on; }; #endif /* __INTEL_PT_H__ */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 5a2ed3ed2f26..f353061bba1d 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -285,6 +285,10 @@ static inline void perf_events_lapic_init(void) { } static inline void perf_check_microcode(void) { } #endif +#ifdef CONFIG_CPU_SUP_INTEL + extern void intel_pt_handle_vmx(int on); +#endif + #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) extern void amd_pmu_enable_virt(void); extern void amd_pmu_disable_virt(void); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ee1c8a93871c..133679d520af 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3103,6 +3103,8 @@ static __init int vmx_disabled_by_bios(void) static void kvm_cpu_vmxon(u64 addr) { + intel_pt_handle_vmx(1); + asm volatile (ASM_VMX_VMXON_RAX : : "a"(&addr), "m"(addr) : "memory", "cc"); @@ -3172,6 +3174,8 @@ static void vmclear_local_loaded_vmcss(void) static void kvm_cpu_vmxoff(void) { asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); + + intel_pt_handle_vmx(0); } static void hardware_disable(void) -- cgit v1.2.3 From cf3beb7c90a8efa16a06b26634cddddc92bb819c Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 21 Apr 2016 02:30:10 -0700 Subject: perf/x86/intel: Fix incorrect lbr_sel_mask value This patch fixes a bug which was introduced by: b16a5b52eb90 ("perf/x86: Add option to disable reading branch flags/cycles") In this patch, lbr_sel_mask is used to mask the lbr_select. But LBR_SEL_MASK doesn't include the bit for LBR_CALL_STACK. So LBR call stack will never be set in lbr_select. This patch corrects the LBR_SEL_MASK by including all valid bits in LBR_SELECT. Also, the LBR_CALL_STACK bit is different as other bit in LBR_SELECT. It does not operate in suppress mode, so it needs to be specially handled in intel_pmu_setup_hw_lbr_filter. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1461231010-4399-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/lbr.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 6c3b7c1780c9..1ca5d1e7d4f2 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -63,7 +63,7 @@ static enum { #define LBR_PLM (LBR_KERNEL | LBR_USER) -#define LBR_SEL_MASK 0x1ff /* valid bits in LBR_SELECT */ +#define LBR_SEL_MASK 0x3ff /* valid bits in LBR_SELECT */ #define LBR_NOT_SUPP -1 /* LBR filter not supported */ #define LBR_IGN 0 /* ignored */ @@ -610,8 +610,10 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate * in suppress mode. So LBR_SELECT should be set to * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK) + * But the 10th bit LBR_CALL_STACK does not operate + * in suppress mode. */ - reg->config = mask ^ x86_pmu.lbr_sel_mask; + reg->config = mask ^ (x86_pmu.lbr_sel_mask & ~LBR_CALL_STACK); if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) && (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) && -- cgit v1.2.3 From 7f9b474c92713067237c8188f32791cc4007b5da Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Tue, 3 May 2016 20:29:41 +0100 Subject: x86/efi-bgrt: Switch all pr_err() to pr_notice() for invalid BGRT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The promise of pretty boot splashes from firmware via BGRT was at best only that; a promise. The kernel diligently checks to make sure the BGRT data firmware gives it is valid, and dutifully warns the user when it isn't. However, it does so via the pr_err log level which seems unnecessary. The user cannot do anything about this and there really isn't an error on the part of Linux to correct. This lowers the log level by using pr_notice instead. Users will no longer have their boot process uglified by the kernel reminding us that firmware can and often is broken when the 'quiet' kernel parameter is specified. Ironic, considering BGRT is supposed to make boot pretty to begin with. Signed-off-by: Josh Boyer Signed-off-by: Matt Fleming Reviewed-by: Josh Triplett Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Môshe van der Sterre Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1462303781-8686-4-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi-bgrt.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index a2433817c987..6a2f5691b1ab 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -43,40 +43,40 @@ void __init efi_bgrt_init(void) return; if (bgrt_tab->header.length < sizeof(*bgrt_tab)) { - pr_err("Ignoring BGRT: invalid length %u (expected %zu)\n", + pr_notice("Ignoring BGRT: invalid length %u (expected %zu)\n", bgrt_tab->header.length, sizeof(*bgrt_tab)); return; } if (bgrt_tab->version != 1) { - pr_err("Ignoring BGRT: invalid version %u (expected 1)\n", + pr_notice("Ignoring BGRT: invalid version %u (expected 1)\n", bgrt_tab->version); return; } if (bgrt_tab->status & 0xfe) { - pr_err("Ignoring BGRT: reserved status bits are non-zero %u\n", + pr_notice("Ignoring BGRT: reserved status bits are non-zero %u\n", bgrt_tab->status); return; } if (bgrt_tab->image_type != 0) { - pr_err("Ignoring BGRT: invalid image type %u (expected 0)\n", + pr_notice("Ignoring BGRT: invalid image type %u (expected 0)\n", bgrt_tab->image_type); return; } if (!bgrt_tab->image_address) { - pr_err("Ignoring BGRT: null image address\n"); + pr_notice("Ignoring BGRT: null image address\n"); return; } image = memremap(bgrt_tab->image_address, sizeof(bmp_header), MEMREMAP_WB); if (!image) { - pr_err("Ignoring BGRT: failed to map image header memory\n"); + pr_notice("Ignoring BGRT: failed to map image header memory\n"); return; } memcpy(&bmp_header, image, sizeof(bmp_header)); memunmap(image); if (bmp_header.id != 0x4d42) { - pr_err("Ignoring BGRT: Incorrect BMP magic number 0x%x (expected 0x4d42)\n", + pr_notice("Ignoring BGRT: Incorrect BMP magic number 0x%x (expected 0x4d42)\n", bmp_header.id); return; } @@ -84,14 +84,14 @@ void __init efi_bgrt_init(void) bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL | __GFP_NOWARN); if (!bgrt_image) { - pr_err("Ignoring BGRT: failed to allocate memory for image (wanted %zu bytes)\n", + pr_notice("Ignoring BGRT: failed to allocate memory for image (wanted %zu bytes)\n", bgrt_image_size); return; } image = memremap(bgrt_tab->image_address, bmp_header.size, MEMREMAP_WB); if (!image) { - pr_err("Ignoring BGRT: failed to map image memory\n"); + pr_notice("Ignoring BGRT: failed to map image memory\n"); kfree(bgrt_image); bgrt_image = NULL; return; -- cgit v1.2.3 From cba1b3798e2c4c094f2079a0d4c1ba4ec2c5a9ac Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 29 Apr 2016 17:55:48 -0700 Subject: perf/x86: Add model numbers for Kabylake CPUs Everything the same as Skylake, just new model numbers. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1461977748-17616-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index aff79884e17d..a6fd4dbcf820 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3637,6 +3637,8 @@ __init int intel_pmu_init(void) pr_cont("Knights Landing events, "); break; + case 142: /* 14nm Kabylake Mobile */ + case 158: /* 14nm Kabylake Desktop */ case 78: /* 14nm Skylake Mobile */ case 94: /* 14nm Skylake Desktop */ case 85: /* 14nm Skylake Server */ -- cgit v1.2.3 From 08914f436bdd2ed60923f49cbc402307aba20fe4 Mon Sep 17 00:00:00 2001 From: Alex Thorlton Date: Wed, 4 May 2016 17:39:52 -0500 Subject: x86/platform/UV: Bring back the call to map_low_mmrs in uv_system_init A while back the following commit: d394f2d9d8e1 ("x86/platform/UV: Remove EFI memmap quirk for UV2+") changed uv_system_init() to only call map_low_mmrs() on older UV1 hardware, which requires EFI_OLD_MEMMAP to be set in order to boot. The recent changes to the EFI memory mapping code in: d2f7cbe7b26a ("x86/efi: Runtime services virtual mapping") exposed some issues with the fact that we were relying on the EFI memory mapping mechanisms to map in our MMRs for us, after commit d394f2d9d8e1. Rather than revert the entire commit and go back to forcing EFI_OLD_MEMMAP on all UVs, we're going to add the call to map_low_mmrs() back into uv_system_init(), and then fix up our EFI runtime calls to use the appropriate page table. For now, UV2+ will still need efi=old_map to boot, but there will be other changes soon that should eliminate the need for this. Signed-off-by: Alex Thorlton Cc: Matt Fleming Cc: Adam Buchbinder Cc: Len Brown Cc: Borislav Petkov Cc: Russ Anderson Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1462401592-120735-1-git-send-email-athorlton@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 8f4942e2bcbb..d7ce96a7daca 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -891,9 +891,7 @@ void __init uv_system_init(void) } pr_info("UV: Found %s hub\n", hub); - /* We now only need to map the MMRs on UV1 */ - if (is_uv1_hub()) - map_low_mmrs(); + map_low_mmrs(); m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR ); m_val = m_n_config.s.m_skt; -- cgit v1.2.3 From 8482716b9d865db34c55d4bf2fed19498e7195b9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 24 Apr 2016 00:42:55 +0200 Subject: perf/x86/amd/iommu: Do not register a task ctx for uncore like PMUs The new sanity check introduced by: 26657848502b ("perf/core: Verify we have a single perf_hw_context PMU") ... triggered on the AMD IOMMU driver. IOMMUs are not per logical CPU, they cannot have per-task counters. Fix it. Reported-by: Borislav Petkov Tested-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: jroedel@suse.de Cc: suravee.suthikulpanit@amd.com Link: http://lkml.kernel.org/r/20160423224255.GB3430@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/events/amd/iommu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index 40625ca7a190..6011a573dd64 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -474,6 +474,7 @@ static __init int _init_perf_amd_iommu( static struct perf_amd_iommu __perf_iommu = { .pmu = { + .task_ctx_nr = perf_invalid_context, .event_init = perf_iommu_event_init, .add = perf_iommu_add, .del = perf_iommu_del, -- cgit v1.2.3 From c10fcb14c7afd6688c7b197a814358fecf244222 Mon Sep 17 00:00:00 2001 From: Wang YanQing Date: Thu, 5 May 2016 14:14:21 +0100 Subject: x86/sysfb_efi: Fix valid BAR address range check The code for checking whether a BAR address range is valid will break out of the loop when a start address of 0x0 is encountered. This behaviour is wrong since by breaking out of the loop we may miss the BAR that describes the EFI frame buffer in a later iteration. Because of this bug I can't use video=efifb: boot parameter to get efifb on my new ThinkPad E550 for my old linux system hard disk with 3.10 kernel. In 3.10, efifb is the only choice due to DRM/I915 not supporting the GPU. This patch also add a trivial optimization to break out after we find the frame buffer address range without testing later BARs. Signed-off-by: Wang YanQing [ Rewrote changelog. ] Signed-off-by: Matt Fleming Reviewed-by: Peter Jones Cc: Cc: Ard Biesheuvel Cc: David Herrmann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tomi Valkeinen Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1462454061-21561-2-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/kernel/sysfb_efi.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c index b285d4e8c68e..5da924bbf0a0 100644 --- a/arch/x86/kernel/sysfb_efi.c +++ b/arch/x86/kernel/sysfb_efi.c @@ -106,14 +106,24 @@ static int __init efifb_set_system(const struct dmi_system_id *id) continue; for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { resource_size_t start, end; + unsigned long flags; + + flags = pci_resource_flags(dev, i); + if (!(flags & IORESOURCE_MEM)) + continue; + + if (flags & IORESOURCE_UNSET) + continue; + + if (pci_resource_len(dev, i) == 0) + continue; start = pci_resource_start(dev, i); - if (start == 0) - break; end = pci_resource_end(dev, i); if (screen_info.lfb_base >= start && screen_info.lfb_base < end) { found_bar = 1; + break; } } } -- cgit v1.2.3 From 127393fbe597dd85863a9bdccaa11007e7d4948f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 5 May 2016 16:22:20 -0700 Subject: mm: thp: kvm: fix memory corruption in KVM with THP enabled After the THP refcounting change, obtaining a compound pages from get_user_pages() no longer allows us to assume the entire compound page is immediately mappable from a secondary MMU. A secondary MMU doesn't want to call get_user_pages() more than once for each compound page, in order to know if it can map the whole compound page. So a secondary MMU needs to know from a single get_user_pages() invocation when it can map immediately the entire compound page to avoid a flood of unnecessary secondary MMU faults and spurious atomic_inc()/atomic_dec() (pages don't have to be pinned by MMU notifier users). Ideally instead of the page->_mapcount < 1 check, get_user_pages() should return the granularity of the "page" mapping in the "mm" passed to get_user_pages(). However it's non trivial change to pass the "pmd" status belonging to the "mm" walked by get_user_pages up the stack (up to the caller of get_user_pages). So the fix just checks if there is not a single pte mapping on the page returned by get_user_pages, and in turn if the caller can assume that the whole compound page is mapped in the current "mm" (in a pmd_trans_huge()). In such case the entire compound page is safe to map into the secondary MMU without additional get_user_pages() calls on the surrounding tail/head pages. In addition of being faster, not having to run other get_user_pages() calls also reduces the memory footprint of the secondary MMU fault in case the pmd split happened as result of memory pressure. Without this fix after a MADV_DONTNEED (like invoked by QEMU during postcopy live migration or balloning) or after generic swapping (with a failure in split_huge_page() that would only result in pmd splitting and not a physical page split), KVM would map the whole compound page into the shadow pagetables, despite regular faults or userfaults (like UFFDIO_COPY) may map regular pages into the primary MMU as result of the pte faults, leading to the guest mode and userland mode going out of sync and not working on the same memory at all times. Any other secondary MMU notifier manager (KVM is just one of the many MMU notifier users) will need the same information if it doesn't want to run a flood of get_user_pages_fast and it can support multiple granularity in the secondary MMU mappings, so I think it is justified to be exposed not just to KVM. The other option would be to move transparent_hugepage_adjust to mm/huge_memory.c but that currently has all kind of KVM data structures in it, so it's definitely not a cut-and-paste work, so I couldn't do a fix as cleaner as this one for 4.6. Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: "Kirill A. Shutemov" Cc: "Li, Liang Z" Cc: Amit Shah Cc: Paolo Bonzini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kvm/mmu.c | 2 +- arch/x86/kvm/mmu.c | 4 ++-- include/linux/page-flags.h | 22 ++++++++++++++++++++++ 3 files changed, 25 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 58dbd5c439df..d6d4191e68f2 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1004,7 +1004,7 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap) kvm_pfn_t pfn = *pfnp; gfn_t gfn = *ipap >> PAGE_SHIFT; - if (PageTransCompound(pfn_to_page(pfn))) { + if (PageTransCompoundMap(pfn_to_page(pfn))) { unsigned long mask; /* * The address we faulted on is backed by a transparent huge diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1ff4dbb73fb7..b6f50e8b0a39 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2823,7 +2823,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, */ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && - PageTransCompound(pfn_to_page(pfn)) && + PageTransCompoundMap(pfn_to_page(pfn)) && !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { unsigned long mask; /* @@ -4785,7 +4785,7 @@ restart: */ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && - PageTransCompound(pfn_to_page(pfn))) { + PageTransCompoundMap(pfn_to_page(pfn))) { drop_spte(kvm, sptep); need_tlb_flush = 1; goto restart; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f4ed4f1b0c77..6b052aa7b5b7 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -516,6 +516,27 @@ static inline int PageTransCompound(struct page *page) return PageCompound(page); } +/* + * PageTransCompoundMap is the same as PageTransCompound, but it also + * guarantees the primary MMU has the entire compound page mapped + * through pmd_trans_huge, which in turn guarantees the secondary MMUs + * can also map the entire compound page. This allows the secondary + * MMUs to call get_user_pages() only once for each compound page and + * to immediately map the entire compound page with a single secondary + * MMU fault. If there will be a pmd split later, the secondary MMUs + * will get an update through the MMU notifier invalidation through + * split_huge_pmd(). + * + * Unlike PageTransCompound, this is safe to be called only while + * split_huge_pmd() cannot run from under us, like if protected by the + * MMU notifier, otherwise it may result in page->_mapcount < 0 false + * positives. + */ +static inline int PageTransCompoundMap(struct page *page) +{ + return PageTransCompound(page) && atomic_read(&page->_mapcount) < 0; +} + /* * PageTransTail returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known @@ -559,6 +580,7 @@ static inline int TestClearPageDoubleMap(struct page *page) #else TESTPAGEFLAG_FALSE(TransHuge) TESTPAGEFLAG_FALSE(TransCompound) +TESTPAGEFLAG_FALSE(TransCompoundMap) TESTPAGEFLAG_FALSE(TransTail) TESTPAGEFLAG_FALSE(DoubleMap) TESTSETFLAG_FALSE(DoubleMap) -- cgit v1.2.3 From 886123fb3a8656699dff40afa0573df359abeb18 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Fri, 6 May 2016 11:33:39 +0800 Subject: x86/tsc: Read all ratio bits from MSR_PLATFORM_INFO Currently we read the tsc radio: ratio = (MSR_PLATFORM_INFO >> 8) & 0x1f; Thus we get bit 8-12 of MSR_PLATFORM_INFO, however according to the SDM (35.5), the ratio bits are bit 8-15. Ignoring the upper bits can result in an incorrect tsc ratio, which causes the TSC calibration and the Local APIC timer frequency to be incorrect. Fix this problem by masking 0xff instead. [ tglx: Massaged changelog ] Fixes: 7da7c1561366 "x86, tsc: Add static (MSR) TSC calibration on Intel Atom SoCs" Signed-off-by: Chen Yu Cc: "Rafael J. Wysocki" Cc: stable@vger.kernel.org Cc: Bin Gao Cc: Len Brown Link: http://lkml.kernel.org/r/1462505619-5516-1-git-send-email-yu.c.chen@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc_msr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 92ae6acac8a7..6aa0f4d9eea6 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -92,7 +92,7 @@ unsigned long try_msr_calibrate_tsc(void) if (freq_desc_tables[cpu_index].msr_plat) { rdmsr(MSR_PLATFORM_INFO, lo, hi); - ratio = (lo >> 8) & 0x1f; + ratio = (lo >> 8) & 0xff; } else { rdmsr(MSR_IA32_PERF_STATUS, lo, hi); ratio = (hi >> 8) & 0x1f; -- cgit v1.2.3