diff options
Diffstat (limited to 'arch')
52 files changed, 1027 insertions, 204 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index 1aa59063f1fd..d1f2ed462ac8 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -13,6 +13,9 @@ config KEXEC_CORE config HAVE_IMA_KEXEC bool +config HOTPLUG_SMT + bool + config OPROFILE tristate "OProfile system profiling" depends on PROFILING diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6d4774f203d0..0b767d6577b7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -187,6 +187,7 @@ config X86 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER + select HOTPLUG_SMT if SMP select IRQ_FORCED_THREADING select NEED_SG_DMA_LENGTH select PCI_LOCKLESS_CONFIG diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 74a9e06b6cfd..130e81e10fc7 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -10,6 +10,7 @@ #include <asm/fixmap.h> #include <asm/mpspec.h> #include <asm/msr.h> +#include <asm/hardirq.h> #define ARCH_APICTIMER_STOPS_ON_C3 1 @@ -502,12 +503,19 @@ extern int default_check_phys_apicid_present(int phys_apicid); #endif /* CONFIG_X86_LOCAL_APIC */ +#ifdef CONFIG_SMP +bool apic_id_is_primary_thread(unsigned int id); +#else +static inline bool apic_id_is_primary_thread(unsigned int id) { return false; } +#endif + extern void irq_enter(void); extern void irq_exit(void); static inline void entering_irq(void) { irq_enter(); + kvm_set_cpu_l1tf_flush_l1d(); } static inline void entering_ack_irq(void) @@ -520,6 +528,7 @@ static inline void ipi_entering_ack_irq(void) { irq_enter(); ack_APIC_irq(); + kvm_set_cpu_l1tf_flush_l1d(); } static inline void exiting_irq(void) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index b5c60faf8429..89a048c2faec 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -219,7 +219,8 @@ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ #define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */ -#define X86_FEATURE_IBRS_ENHANCED ( 7*32+29) /* Enhanced IBRS */ +#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ +#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ @@ -342,6 +343,7 @@ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ #define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ #define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ @@ -374,5 +376,6 @@ #define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ #define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ #define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */ +#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h index 0ab2ab27ad1f..b825cb201251 100644 --- a/arch/x86/include/asm/dmi.h +++ b/arch/x86/include/asm/dmi.h @@ -4,8 +4,8 @@ #include <linux/compiler.h> #include <linux/init.h> +#include <linux/io.h> -#include <asm/io.h> #include <asm/setup.h> static __always_inline __init void *dmi_alloc(unsigned len) diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 740a428acf1e..d9069bb26c7f 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -3,10 +3,12 @@ #define _ASM_X86_HARDIRQ_H #include <linux/threads.h> -#include <linux/irq.h> typedef struct { - unsigned int __softirq_pending; + u16 __softirq_pending; +#if IS_ENABLED(CONFIG_KVM_INTEL) + u8 kvm_cpu_l1tf_flush_l1d; +#endif unsigned int __nmi_count; /* arch dependent */ #ifdef CONFIG_X86_LOCAL_APIC unsigned int apic_timer_irqs; /* arch dependent */ @@ -58,4 +60,24 @@ extern u64 arch_irq_stat_cpu(unsigned int cpu); extern u64 arch_irq_stat(void); #define arch_irq_stat arch_irq_stat + +#if IS_ENABLED(CONFIG_KVM_INTEL) +static inline void kvm_set_cpu_l1tf_flush_l1d(void) +{ + __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1); +} + +static inline void kvm_clear_cpu_l1tf_flush_l1d(void) +{ + __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 0); +} + +static inline bool kvm_get_cpu_l1tf_flush_l1d(void) +{ + return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d); +} +#else /* !IS_ENABLED(CONFIG_KVM_INTEL) */ +static inline void kvm_set_cpu_l1tf_flush_l1d(void) { } +#endif /* IS_ENABLED(CONFIG_KVM_INTEL) */ + #endif /* _ASM_X86_HARDIRQ_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c13cd28d9d1b..acebb808c4b5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -17,6 +17,7 @@ #include <linux/tracepoint.h> #include <linux/cpumask.h> #include <linux/irq_work.h> +#include <linux/irq.h> #include <linux/kvm.h> #include <linux/kvm_para.h> @@ -713,6 +714,9 @@ struct kvm_vcpu_arch { /* be preempted when it's in kernel-mode(cpl=0) */ bool preempted_in_kernel; + + /* Flush the L1 Data cache for L1TF mitigation on VMENTER */ + bool l1tf_flush_l1d; }; struct kvm_lpage_info { @@ -881,6 +885,7 @@ struct kvm_vcpu_stat { u64 signal_exits; u64 irq_window_exits; u64 nmi_window_exits; + u64 l1d_flush; u64 halt_exits; u64 halt_successful_poll; u64 halt_attempted_poll; @@ -1413,6 +1418,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); +u64 kvm_get_arch_capabilities(void); void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 68b2c3150de1..4731f0cf97c5 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -70,12 +70,19 @@ #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a #define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */ #define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */ +#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH (1 << 3) /* Skip L1D flush on vmentry */ #define ARCH_CAP_SSB_NO (1 << 4) /* * Not susceptible to Speculative Store Bypass * attack, so no Speculative Store Bypass * control required. */ +#define MSR_IA32_FLUSH_CMD 0x0000010b +#define L1D_FLUSH (1 << 0) /* + * Writeback and invalidate the + * L1 data cache. + */ + #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index aa30c3241ea7..0d5c739eebd7 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -29,8 +29,13 @@ #define N_EXCEPTION_STACKS 1 #ifdef CONFIG_X86_PAE -/* 44=32+12, the limit we can fit into an unsigned long pfn */ -#define __PHYSICAL_MASK_SHIFT 44 +/* + * This is beyond the 44 bit limit imposed by the 32bit long pfns, + * but we need the full mask to make sure inverted PROT_NONE + * entries have all the host bits set in a guest. + * The real limit is still 44 bits. + */ +#define __PHYSICAL_MASK_SHIFT 52 #define __VIRTUAL_MASK_SHIFT 32 #else /* !CONFIG_X86_PAE */ diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index c399ea5eea41..24c6cf5f16b7 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -104,4 +104,21 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) +/* No inverted PFNs on 2 level page tables */ + +static inline u64 protnone_mask(u64 val) +{ + return 0; +} + +static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask) +{ + return val; +} + +static inline bool __pte_needs_invert(u64 val) +{ + return false; +} + #endif /* _ASM_X86_PGTABLE_2LEVEL_H */ diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index f2ca3139ca22..a564084c6141 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -248,12 +248,43 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp) #endif /* Encode and de-code a swap entry */ +#define SWP_TYPE_BITS 5 + +#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) + +/* We always extract/encode the offset by shifting it all the way up, and then down again */ +#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS) + #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) #define __swp_type(x) (((x).val) & 0x1f) #define __swp_offset(x) ((x).val >> 5) #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) -#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) -#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) + +/* + * Normally, __swp_entry() converts from arch-independent swp_entry_t to + * arch-dependent swp_entry_t, and __swp_entry_to_pte() just stores the result + * to pte. But here we have 32bit swp_entry_t and 64bit pte, and need to use the + * whole 64 bits. Thus, we shift the "real" arch-dependent conversion to + * __swp_entry_to_pte() through the following helper macro based on 64bit + * __swp_entry(). + */ +#define __swp_pteval_entry(type, offset) ((pteval_t) { \ + (~(pteval_t)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \ + | ((pteval_t)(type) << (64 - SWP_TYPE_BITS)) }) + +#define __swp_entry_to_pte(x) ((pte_t){ .pte = \ + __swp_pteval_entry(__swp_type(x), __swp_offset(x)) }) +/* + * Analogically, __pte_to_swp_entry() doesn't just extract the arch-dependent + * swp_entry_t, but also has to convert it from 64bit to the 32bit + * intermediate representation, using the following macros based on 64bit + * __swp_type() and __swp_offset(). + */ +#define __pteval_swp_type(x) ((unsigned long)((x).pte >> (64 - SWP_TYPE_BITS))) +#define __pteval_swp_offset(x) ((unsigned long)(~((x).pte) << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)) + +#define __pte_to_swp_entry(pte) (__swp_entry(__pteval_swp_type(pte), \ + __pteval_swp_offset(pte))) #define gup_get_pte gup_get_pte /* @@ -302,4 +333,6 @@ static inline pte_t gup_get_pte(pte_t *ptep) return pte; } +#include <asm/pgtable-invert.h> + #endif /* _ASM_X86_PGTABLE_3LEVEL_H */ diff --git a/arch/x86/include/asm/pgtable-invert.h b/arch/x86/include/asm/pgtable-invert.h new file mode 100644 index 000000000000..44b1203ece12 --- /dev/null +++ b/arch/x86/include/asm/pgtable-invert.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_PGTABLE_INVERT_H +#define _ASM_PGTABLE_INVERT_H 1 + +#ifndef __ASSEMBLY__ + +static inline bool __pte_needs_invert(u64 val) +{ + return !(val & _PAGE_PRESENT); +} + +/* Get a mask to xor with the page table entry to get the correct pfn. */ +static inline u64 protnone_mask(u64 val) +{ + return __pte_needs_invert(val) ? ~0ull : 0; +} + +static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask) +{ + /* + * When a PTE transitions from NONE to !NONE or vice-versa + * invert the PFN part to stop speculation. + * pte_pfn undoes this when needed. + */ + if (__pte_needs_invert(oldval) != __pte_needs_invert(val)) + val = (val & ~mask) | (~val & mask); + return val; +} + +#endif /* __ASSEMBLY__ */ + +#endif diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a1cb3339da8d..e4ffa565a69f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -188,19 +188,29 @@ static inline int pte_special(pte_t pte) return pte_flags(pte) & _PAGE_SPECIAL; } +/* Entries that were set to PROT_NONE are inverted */ + +static inline u64 protnone_mask(u64 val); + static inline unsigned long pte_pfn(pte_t pte) { - return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT; + phys_addr_t pfn = pte_val(pte); + pfn ^= protnone_mask(pfn); + return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT; } static inline unsigned long pmd_pfn(pmd_t pmd) { - return (pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; + phys_addr_t pfn = pmd_val(pmd); + pfn ^= protnone_mask(pfn); + return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; } static inline unsigned long pud_pfn(pud_t pud) { - return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT; + phys_addr_t pfn = pud_val(pud); + pfn ^= protnone_mask(pfn); + return (pfn & pud_pfn_mask(pud)) >> PAGE_SHIFT; } static inline unsigned long p4d_pfn(p4d_t p4d) @@ -403,11 +413,6 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_RW); } -static inline pmd_t pmd_mknotpresent(pmd_t pmd) -{ - return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE); -} - static inline pud_t pud_set_flags(pud_t pud, pudval_t set) { pudval_t v = native_pud_val(pud); @@ -462,11 +467,6 @@ static inline pud_t pud_mkwrite(pud_t pud) return pud_set_flags(pud, _PAGE_RW); } -static inline pud_t pud_mknotpresent(pud_t pud) -{ - return pud_clear_flags(pud, _PAGE_PRESENT | _PAGE_PROTNONE); -} - #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline int pte_soft_dirty(pte_t pte) { @@ -548,25 +548,45 @@ static inline pgprotval_t check_pgprot(pgprot_t pgprot) static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { - return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | - check_pgprot(pgprot)); + phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; + pfn ^= protnone_mask(pgprot_val(pgprot)); + pfn &= PTE_PFN_MASK; + return __pte(pfn | check_pgprot(pgprot)); } static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) { - return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | - check_pgprot(pgprot)); + phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; + pfn ^= protnone_mask(pgprot_val(pgprot)); + pfn &= PHYSICAL_PMD_PAGE_MASK; + return __pmd(pfn | check_pgprot(pgprot)); } static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot) { - return __pud(((phys_addr_t)page_nr << PAGE_SHIFT) | - check_pgprot(pgprot)); + phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; + pfn ^= protnone_mask(pgprot_val(pgprot)); + pfn &= PHYSICAL_PUD_PAGE_MASK; + return __pud(pfn | check_pgprot(pgprot)); } +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return pfn_pmd(pmd_pfn(pmd), + __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); +} + +static inline pud_t pud_mknotpresent(pud_t pud) +{ + return pfn_pud(pud_pfn(pud), + __pgprot(pud_flags(pud) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); +} + +static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask); + static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { - pteval_t val = pte_val(pte); + pteval_t val = pte_val(pte), oldval = val; /* * Chop off the NX bit (if present), and add the NX portion of @@ -574,17 +594,17 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) */ val &= _PAGE_CHG_MASK; val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK; - + val = flip_protnone_guard(oldval, val, PTE_PFN_MASK); return __pte(val); } static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) { - pmdval_t val = pmd_val(pmd); + pmdval_t val = pmd_val(pmd), oldval = val; val &= _HPAGE_CHG_MASK; val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK; - + val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK); return __pmd(val); } @@ -1410,6 +1430,14 @@ static inline bool pud_access_permitted(pud_t pud, bool write) return __pte_access_permitted(pud_val(pud), write); } +#define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1 +extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot); + +static inline bool arch_has_pfn_modify_check(void) +{ + return boot_cpu_has_bug(X86_BUG_L1TF); +} + #include <asm-generic/pgtable.h> #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index acb6970e7bcf..f773d5e6c8cc 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -188,7 +188,7 @@ extern void sync_global_pgds(unsigned long start, unsigned long end); * * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names - * | OFFSET (14->63) | TYPE (9-13) |0|0|X|X| X| X|X|SD|0| <- swp entry + * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|X|SD|0| <- swp entry * * G (8) is aliased and used as a PROT_NONE indicator for * !present ptes. We need to start storing swap entries above @@ -201,20 +201,34 @@ extern void sync_global_pgds(unsigned long start, unsigned long end); * * Bit 7 in swp entry should be 0 because pmd_present checks not only P, * but also L and G. + * + * The offset is inverted by a binary not operation to make the high + * physical bits set. */ -#define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) -#define SWP_TYPE_BITS 5 -/* Place the offset above the type: */ -#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS) +#define SWP_TYPE_BITS 5 + +#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) + +/* We always extract/encode the offset by shifting it all the way up, and then down again */ +#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) -#define __swp_type(x) (((x).val >> (SWP_TYPE_FIRST_BIT)) \ - & ((1U << SWP_TYPE_BITS) - 1)) -#define __swp_offset(x) ((x).val >> SWP_OFFSET_FIRST_BIT) -#define __swp_entry(type, offset) ((swp_entry_t) { \ - ((type) << (SWP_TYPE_FIRST_BIT)) \ - | ((offset) << SWP_OFFSET_FIRST_BIT) }) +/* Extract the high bits for type */ +#define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS)) + +/* Shift up (to get rid of type), then down to get value */ +#define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT) + +/* + * Shift the offset up "too far" by TYPE bits, then down again + * The offset is inverted by a binary not operation to make the high + * physical bits set. + */ +#define __swp_entry(type, offset) ((swp_entry_t) { \ + (~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \ + | ((unsigned long)(type) << (64-SWP_TYPE_BITS)) }) + #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) #define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val((pmd)) }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) @@ -258,5 +272,7 @@ static inline bool gup_fast_permitted(unsigned long start, int nr_pages, return true; } +#include <asm/pgtable-invert.h> + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 59663c08c949..682286aca881 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -181,6 +181,11 @@ extern const struct seq_operations cpuinfo_op; extern void cpu_detect(struct cpuinfo_x86 *c); +static inline unsigned long l1tf_pfn_limit(void) +{ + return BIT(boot_cpu_data.x86_phys_bits - 1 - PAGE_SHIFT) - 1; +} + extern void early_cpu_init(void); extern void identify_boot_cpu(void); extern void identify_secondary_cpu(struct cpuinfo_x86 *); @@ -978,4 +983,16 @@ bool xen_set_default_idle(void); void stop_this_cpu(void *dummy); void df_debug(struct pt_regs *regs, long error_code); void microcode_check(void); + +enum l1tf_mitigations { + L1TF_MITIGATION_OFF, + L1TF_MITIGATION_FLUSH_NOWARN, + L1TF_MITIGATION_FLUSH, + L1TF_MITIGATION_FLUSH_NOSMT, + L1TF_MITIGATION_FULL, + L1TF_MITIGATION_FULL_FORCE +}; + +extern enum l1tf_mitigations l1tf_mitigation; + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index c1d2a9892352..453cf38a1c33 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -123,13 +123,17 @@ static inline int topology_max_smt_threads(void) } int topology_update_package_map(unsigned int apicid, unsigned int cpu); -extern int topology_phys_to_logical_pkg(unsigned int pkg); +int topology_phys_to_logical_pkg(unsigned int pkg); +bool topology_is_primary_thread(unsigned int cpu); +bool topology_smt_supported(void); #else #define topology_max_packages() (1) static inline int topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; } static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } static inline int topology_max_smt_threads(void) { return 1; } +static inline bool topology_is_primary_thread(unsigned int cpu) { return true; } +static inline bool topology_smt_supported(void) { return false; } #endif static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 6aa8499e1f62..95f9107449bf 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -576,4 +576,15 @@ enum vm_instruction_error_number { VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28, }; +enum vmx_l1d_flush_state { + VMENTER_L1D_FLUSH_AUTO, + VMENTER_L1D_FLUSH_NEVER, + VMENTER_L1D_FLUSH_COND, + VMENTER_L1D_FLUSH_ALWAYS, + VMENTER_L1D_FLUSH_EPT_DISABLED, + VMENTER_L1D_FLUSH_NOT_REQUIRED, +}; + +extern enum vmx_l1d_flush_state l1tf_vmx_mitigation; + #endif diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 07fa222f0c52..87ff6235bbfe 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -56,6 +56,7 @@ #include <asm/hypervisor.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> +#include <asm/irq_regs.h> unsigned int num_processors; @@ -2192,6 +2193,21 @@ static int cpuid_to_apicid[] = { [0 ... NR_CPUS - 1] = -1, }; +/** + * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread + * @id: APIC ID to check + */ +bool apic_id_is_primary_thread(unsigned int apicid) +{ + u32 mask; + + if (smp_num_siblings == 1) + return true; + /* Isolate the SMT bit(s) in the APICID and check for 0 */ + mask = (1U << (fls(smp_num_siblings) - 1)) - 1; + return !(apicid & mask); +} + /* * Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids * and cpuid_to_apicid[] synchronized. diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3982f79d2377..ff0d14cd9e82 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -33,6 +33,7 @@ #include <linux/mm.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/init.h> #include <linux/delay.h> #include <linux/sched.h> diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index ce503c99f5c4..72a94401f9e0 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -12,6 +12,7 @@ */ #include <linux/mm.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/pci.h> #include <linux/dmar.h> #include <linux/hpet.h> diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 0954315842c0..9f148e3d45b4 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -11,6 +11,7 @@ * published by the Free Software Foundation. */ #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/compiler.h> diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index b732438c1a1e..22ab408177b2 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -313,6 +313,13 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c) c->cpu_core_id %= cus_per_node; } + +static void amd_get_topology_early(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_TOPOEXT)) + smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1; +} + /* * Fixup core topology information for * (1) AMD multi-node processors @@ -332,7 +339,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c) cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); node_id = ecx & 0xff; - smp_num_siblings = ((ebx >> 8) & 0xff) + 1; if (c->x86 == 0x15) c->cu_id = ebx & 0xff; @@ -611,6 +617,7 @@ clear_sev: static void early_init_amd(struct cpuinfo_x86 *c) { + u64 value; u32 dummy; early_init_amd_mc(c); @@ -689,6 +696,22 @@ static void early_init_amd(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_AMD_E400); early_detect_mem_encrypt(c); + + /* Re-enable TopologyExtensions if switched off by BIOS */ + if (c->x86 == 0x15 && + (c->x86_model >= 0x10 && c->x86_model <= 0x6f) && + !cpu_has(c, X86_FEATURE_TOPOEXT)) { + + if (msr_set_bit(0xc0011005, 54) > 0) { + rdmsrl(0xc0011005, value); + if (value & BIT_64(54)) { + set_cpu_cap(c, X86_FEATURE_TOPOEXT); + pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); + } + } + } + + amd_get_topology_early(c); } static void init_amd_k8(struct cpuinfo_x86 *c) @@ -780,19 +803,6 @@ static void init_amd_bd(struct cpuinfo_x86 *c) { u64 value; - /* re-enable TopologyExtensions if switched off by BIOS */ - if ((c->x86_model >= 0x10) && (c->x86_model <= 0x6f) && - !cpu_has(c, X86_FEATURE_TOPOEXT)) { - - if (msr_set_bit(0xc0011005, 54) > 0) { - rdmsrl(0xc0011005, value); - if (value & BIT_64(54)) { - set_cpu_cap(c, X86_FEATURE_TOPOEXT); - pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); - } - } - } - /* * The way access filter has a performance penalty on some workloads. * Disable it on the affected CPUs. @@ -856,16 +866,9 @@ static void init_amd(struct cpuinfo_x86 *c) cpu_detect_cache_sizes(c); - /* Multi core CPU? */ - if (c->extended_cpuid_level >= 0x80000008) { - amd_detect_cmp(c); - amd_get_topology(c); - srat_detect_node(c); - } - -#ifdef CONFIG_X86_32 - detect_ht(c); -#endif + amd_detect_cmp(c); + amd_get_topology(c); + srat_detect_node(c); init_amd_cacheinfo(c); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 405a9a61bb89..27830880e7a7 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -22,15 +22,18 @@ #include <asm/processor-flags.h> #include <asm/fpu/internal.h> #include <asm/msr.h> +#include <asm/vmx.h> #include <asm/paravirt.h> #include <asm/alternative.h> #include <asm/pgtable.h> #include <asm/set_memory.h> #include <asm/intel-family.h> +#include <asm/e820/api.h> #include <asm/hypervisor.h> static void __init spectre_v2_select_mitigation(void); static void __init ssb_select_mitigation(void); +static void __init l1tf_select_mitigation(void); /* * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any @@ -56,6 +59,12 @@ void __init check_bugs(void) { identify_boot_cpu(); + /* + * identify_boot_cpu() initialized SMT support information, let the + * core code know. + */ + cpu_smt_check_topology_early(); + if (!IS_ENABLED(CONFIG_SMP)) { pr_info("CPU: "); print_cpu_info(&boot_cpu_data); @@ -82,6 +91,8 @@ void __init check_bugs(void) */ ssb_select_mitigation(); + l1tf_select_mitigation(); + #ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP. @@ -646,8 +657,121 @@ void x86_spec_ctrl_setup_ap(void) x86_amd_ssb_disable(); } +#undef pr_fmt +#define pr_fmt(fmt) "L1TF: " fmt + +/* Default mitigation for L1TF-affected CPUs */ +enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH; +#if IS_ENABLED(CONFIG_KVM_INTEL) +EXPORT_SYMBOL_GPL(l1tf_mitigation); + +enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; +EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation); +#endif + +static void __init l1tf_select_mitigation(void) +{ + u64 half_pa; + + if (!boot_cpu_has_bug(X86_BUG_L1TF)) + return; + + switch (l1tf_mitigation) { + case L1TF_MITIGATION_OFF: + case L1TF_MITIGATION_FLUSH_NOWARN: + case L1TF_MITIGATION_FLUSH: + break; + case L1TF_MITIGATION_FLUSH_NOSMT: + case L1TF_MITIGATION_FULL: + cpu_smt_disable(false); + break; + case L1TF_MITIGATION_FULL_FORCE: + cpu_smt_disable(true); + break; + } + +#if CONFIG_PGTABLE_LEVELS == 2 + pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n"); + return; +#endif + + /* + * This is extremely unlikely to happen because almost all + * systems have far more MAX_PA/2 than RAM can be fit into + * DIMM slots. + */ + half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT; + if (e820__mapped_any(half_pa, ULLONG_MAX - half_pa, E820_TYPE_RAM)) { + pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n"); + return; + } + + setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV); +} + +static int __init l1tf_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_L1TF)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + l1tf_mitigation = L1TF_MITIGATION_OFF; + else if (!strcmp(str, "flush,nowarn")) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOWARN; + else if (!strcmp(str, "flush")) + l1tf_mitigation = L1TF_MITIGATION_FLUSH; + else if (!strcmp(str, "flush,nosmt")) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; + else if (!strcmp(str, "full")) + l1tf_mitigation = L1TF_MITIGATION_FULL; + else if (!strcmp(str, "full,force")) + l1tf_mitigation = L1TF_MITIGATION_FULL_FORCE; + + return 0; +} +early_param("l1tf", l1tf_cmdline); + +#undef pr_fmt + #ifdef CONFIG_SYSFS +#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion" + +#if IS_ENABLED(CONFIG_KVM_INTEL) +static const char *l1tf_vmx_states[] = { + [VMENTER_L1D_FLUSH_AUTO] = "auto", + [VMENTER_L1D_FLUSH_NEVER] = "vulnerable", + [VMENTER_L1D_FLUSH_COND] = "conditional cache flushes", + [VMENTER_L1D_FLUSH_ALWAYS] = "cache flushes", + [VMENTER_L1D_FLUSH_EPT_DISABLED] = "EPT disabled", + [VMENTER_L1D_FLUSH_NOT_REQUIRED] = "flush not necessary" +}; + +static ssize_t l1tf_show_state(char *buf) +{ + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) + return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); + + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED || + (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER && + cpu_smt_control == CPU_SMT_ENABLED)) + return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG, + l1tf_vmx_states[l1tf_vmx_mitigation]); + + return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG, + l1tf_vmx_states[l1tf_vmx_mitigation], + cpu_smt_control == CPU_SMT_ENABLED ? "vulnerable" : "disabled"); +} +#else +static ssize_t l1tf_show_state(char *buf) +{ + return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); +} +#endif + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -676,6 +800,10 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_SPEC_STORE_BYPASS: return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); + case X86_BUG_L1TF: + if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV)) + return l1tf_show_state(buf); + break; default: break; } @@ -702,4 +830,9 @@ ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute * { return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS); } + +ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_L1TF); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ba6b8bb1c036..f3234010847c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -661,33 +661,36 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c) tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]); } -void detect_ht(struct cpuinfo_x86 *c) +int detect_ht_early(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP u32 eax, ebx, ecx, edx; - int index_msb, core_bits; - static bool printed; if (!cpu_has(c, X86_FEATURE_HT)) - return; + return -1; if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) - goto out; + return -1; if (cpu_has(c, X86_FEATURE_XTOPOLOGY)) - return; + return -1; cpuid(1, &eax, &ebx, &ecx, &edx); smp_num_siblings = (ebx & 0xff0000) >> 16; - - if (smp_num_siblings == 1) { + if (smp_num_siblings == 1) pr_info_once("CPU0: Hyper-Threading is disabled\n"); - goto out; - } +#endif + return 0; +} - if (smp_num_siblings <= 1) - goto out; +void detect_ht(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + int index_msb, core_bits; + + if (detect_ht_early(c) < 0) + return; index_msb = get_count_order(smp_num_siblings); c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); @@ -700,15 +703,6 @@ void detect_ht(struct cpuinfo_x86 *c) c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) & ((1 << core_bits) - 1); - -out: - if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) { - pr_info("CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - pr_info("CPU: Processor Core ID: %d\n", - c->cpu_core_id); - printed = 1; - } #endif } @@ -987,6 +981,21 @@ static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = { {} }; +static const __initconst struct x86_cpu_id cpu_no_l1tf[] = { + /* in addition to cpu_no_speculation */ + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, + {} +}; + static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 ia32_cap = 0; @@ -1016,6 +1025,11 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) return; setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + + if (x86_match_cpu(cpu_no_l1tf)) + return; + + setup_force_cpu_bug(X86_BUG_L1TF); } /* diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 38216f678fc3..e59c0ea82a33 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -55,7 +55,9 @@ extern void init_intel_cacheinfo(struct cpuinfo_x86 *c); extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); extern void detect_num_cpu_cores(struct cpuinfo_x86 *c); +extern int detect_extended_topology_early(struct cpuinfo_x86 *c); extern int detect_extended_topology(struct cpuinfo_x86 *c); +extern int detect_ht_early(struct cpuinfo_x86 *c); extern void detect_ht(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index c050cd6066af..401e8c133108 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -301,6 +301,13 @@ static void early_init_intel(struct cpuinfo_x86 *c) } check_mpx_erratum(c); + + /* + * Get the number of SMT siblings early from the extended topology + * leaf, if available. Otherwise try the legacy SMT detection. + */ + if (detect_extended_topology_early(c) < 0) + detect_ht_early(c); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 08286269fd24..b9bc8a1a584e 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -509,12 +509,20 @@ static struct platform_device *microcode_pdev; static int check_online_cpus(void) { - if (num_online_cpus() == num_present_cpus()) - return 0; + unsigned int cpu; - pr_err("Not all CPUs online, aborting microcode update.\n"); + /* + * Make sure all CPUs are online. It's fine for SMT to be disabled if + * all the primary threads are still online. + */ + for_each_present_cpu(cpu) { + if (topology_is_primary_thread(cpu) && !cpu_online(cpu)) { + pr_err("Not all CPUs online, aborting microcode update.\n"); + return -EINVAL; + } + } - return -EINVAL; + return 0; } static atomic_t late_cpus_in; diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 81c0afb39d0a..71ca064e3794 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -22,18 +22,10 @@ #define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) #define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) -/* - * Check for extended topology enumeration cpuid leaf 0xb and if it - * exists, use it for populating initial_apicid and cpu topology - * detection. - */ -int detect_extended_topology(struct cpuinfo_x86 *c) +int detect_extended_topology_early(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP - unsigned int eax, ebx, ecx, edx, sub_index; - unsigned int ht_mask_width, core_plus_mask_width; - unsigned int core_select_mask, core_level_siblings; - static bool printed; + unsigned int eax, ebx, ecx, edx; if (c->cpuid_level < 0xb) return -1; @@ -52,10 +44,30 @@ int detect_extended_topology(struct cpuinfo_x86 *c) * initial apic id, which also represents 32-bit extended x2apic id. */ c->initial_apicid = edx; + smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); +#endif + return 0; +} + +/* + * Check for extended topology enumeration cpuid leaf 0xb and if it + * exists, use it for populating initial_apicid and cpu topology + * detection. + */ +int detect_extended_topology(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned int eax, ebx, ecx, edx, sub_index; + unsigned int ht_mask_width, core_plus_mask_width; + unsigned int core_select_mask, core_level_siblings; + + if (detect_extended_topology_early(c) < 0) + return -1; /* * Populate HT related information from sub-leaf level 0. */ + cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); @@ -86,15 +98,6 @@ int detect_extended_topology(struct cpuinfo_x86 *c) c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); c->x86_max_cores = (core_level_siblings / smp_num_siblings); - - if (!printed) { - pr_info("CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - if (c->x86_max_cores > 1) - pr_info("CPU: Processor Core ID: %d\n", - c->cpu_core_id); - printed = 1; - } #endif return 0; } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index f92a6593de1e..2ea85b32421a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -10,6 +10,7 @@ #include <asm/fpu/signal.h> #include <asm/fpu/types.h> #include <asm/traps.h> +#include <asm/irq_regs.h> #include <linux/hardirq.h> #include <linux/pkeys.h> diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 346b24883911..b0acb22e5a46 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1,6 +1,7 @@ #include <linux/clocksource.h> #include <linux/clockchips.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/export.h> #include <linux/delay.h> #include <linux/errno.h> diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 86c4439f9d74..519649ddf100 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -5,6 +5,7 @@ #include <linux/sched.h> #include <linux/ioport.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/timex.h> #include <linux/random.h> #include <linux/init.h> diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 74383a3780dc..01adea278a71 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -8,6 +8,7 @@ #include <asm/traps.h> #include <asm/proto.h> #include <asm/desc.h> +#include <asm/hw_irq.h> struct idt_data { unsigned int vector; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 328d027d829d..59b5f2ea7c2f 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -10,6 +10,7 @@ #include <linux/ftrace.h> #include <linux/delay.h> #include <linux/export.h> +#include <linux/irq.h> #include <asm/apic.h> #include <asm/io_apic.h> diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index c1bdbd3d3232..95600a99ae93 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -11,6 +11,7 @@ #include <linux/seq_file.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/kernel_stat.h> #include <linux/notifier.h> #include <linux/cpu.h> diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index d86e344f5b3d..0469cd078db1 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -11,6 +11,7 @@ #include <linux/kernel_stat.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/seq_file.h> #include <linux/delay.h> #include <linux/ftrace.h> diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 772196c1b8c4..a0693b71cfc1 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -5,6 +5,7 @@ #include <linux/sched.h> #include <linux/ioport.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/timex.h> #include <linux/random.h> #include <linux/kprobes.h> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index d2edd7e6c294..1e6764648af3 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -27,6 +27,7 @@ #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/mm.h> +#include <linux/slab.h> #include <asm/hypervisor.h> #include <asm/mem_encrypt.h> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5d32c55aeb8b..8fe740e22030 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -823,6 +823,12 @@ void __init setup_arch(char **cmdline_p) memblock_reserve(__pa_symbol(_text), (unsigned long)__bss_stop - (unsigned long)_text); + /* + * Make sure page 0 is always reserved because on systems with + * L1TF its contents can be leaked to user processes. + */ + memblock_reserve(0, PAGE_SIZE); + early_reserve_initrd(); /* diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 5c574dff4c1a..04adc8d60aed 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -261,6 +261,7 @@ __visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); + kvm_set_cpu_l1tf_flush_l1d(); if (trace_resched_ipi_enabled()) { /* diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index db9656e13ea0..f02ecaf97904 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -80,6 +80,7 @@ #include <asm/intel-family.h> #include <asm/cpu_device_id.h> #include <asm/spec-ctrl.h> +#include <asm/hw_irq.h> /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); @@ -271,6 +272,23 @@ static void notrace start_secondary(void *unused) } /** + * topology_is_primary_thread - Check whether CPU is the primary SMT thread + * @cpu: CPU to check + */ +bool topology_is_primary_thread(unsigned int cpu) +{ + return apic_id_is_primary_thread(per_cpu(x86_cpu_to_apicid, cpu)); +} + +/** + * topology_smt_supported - Check whether SMT is supported by the CPUs + */ +bool topology_smt_supported(void) +{ + return smp_num_siblings > 1; +} + +/** * topology_phys_to_logical_pkg - Map a physical package id to a logical * * Returns logical package id or -1 if not found diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 774ebafa97c4..be01328eb755 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -12,6 +12,7 @@ #include <linux/clockchips.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/i8253.h> #include <linux/time.h> #include <linux/export.h> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6b8f11521c41..a44e568363a4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3840,6 +3840,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, { int r = 1; + vcpu->arch.l1tf_flush_l1d = true; switch (vcpu->arch.apf.host_apf_reason) { default: trace_kvm_page_fault(fault_address, error_code); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5d8e317c2b04..46b428c0990e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -188,6 +188,150 @@ module_param(ple_window_max, uint, 0444); extern const ulong vmx_return; +static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); +static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); +static DEFINE_MUTEX(vmx_l1d_flush_mutex); + +/* Storage for pre module init parameter parsing */ +static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; + +static const struct { + const char *option; + enum vmx_l1d_flush_state cmd; +} vmentry_l1d_param[] = { + {"auto", VMENTER_L1D_FLUSH_AUTO}, + {"never", VMENTER_L1D_FLUSH_NEVER}, + {"cond", VMENTER_L1D_FLUSH_COND}, + {"always", VMENTER_L1D_FLUSH_ALWAYS}, +}; + +#define L1D_CACHE_ORDER 4 +static void *vmx_l1d_flush_pages; + +static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) +{ + struct page *page; + unsigned int i; + + if (!enable_ept) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; + return 0; + } + + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { + u64 msr; + + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); + if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; + return 0; + } + } + + /* If set to auto use the default l1tf mitigation method */ + if (l1tf == VMENTER_L1D_FLUSH_AUTO) { + switch (l1tf_mitigation) { + case L1TF_MITIGATION_OFF: + l1tf = VMENTER_L1D_FLUSH_NEVER; + break; + case L1TF_MITIGATION_FLUSH_NOWARN: + case L1TF_MITIGATION_FLUSH: + case L1TF_MITIGATION_FLUSH_NOSMT: + l1tf = VMENTER_L1D_FLUSH_COND; + break; + case L1TF_MITIGATION_FULL: + case L1TF_MITIGATION_FULL_FORCE: + l1tf = VMENTER_L1D_FLUSH_ALWAYS; + break; + } + } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { + l1tf = VMENTER_L1D_FLUSH_ALWAYS; + } + + if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && + !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { + page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); + if (!page) + return -ENOMEM; + vmx_l1d_flush_pages = page_address(page); + + /* + * Initialize each page with a different pattern in + * order to protect against KSM in the nested + * virtualization case. + */ + for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { + memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, + PAGE_SIZE); + } + } + + l1tf_vmx_mitigation = l1tf; + + if (l1tf != VMENTER_L1D_FLUSH_NEVER) + static_branch_enable(&vmx_l1d_should_flush); + else + static_branch_disable(&vmx_l1d_should_flush); + + if (l1tf == VMENTER_L1D_FLUSH_COND) + static_branch_enable(&vmx_l1d_flush_cond); + else + static_branch_disable(&vmx_l1d_flush_cond); + return 0; +} + +static int vmentry_l1d_flush_parse(const char *s) +{ + unsigned int i; + + if (s) { + for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { + if (sysfs_streq(s, vmentry_l1d_param[i].option)) + return vmentry_l1d_param[i].cmd; + } + } + return -EINVAL; +} + +static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) +{ + int l1tf, ret; + + if (!boot_cpu_has(X86_BUG_L1TF)) + return 0; + + l1tf = vmentry_l1d_flush_parse(s); + if (l1tf < 0) + return l1tf; + + /* + * Has vmx_init() run already? If not then this is the pre init + * parameter parsing. In that case just store the value and let + * vmx_init() do the proper setup after enable_ept has been + * established. + */ + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { + vmentry_l1d_flush_param = l1tf; + return 0; + } + + mutex_lock(&vmx_l1d_flush_mutex); + ret = vmx_setup_l1d_flush(l1tf); + mutex_unlock(&vmx_l1d_flush_mutex); + return ret; +} + +static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) +{ + return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); +} + +static const struct kernel_param_ops vmentry_l1d_flush_ops = { + .set = vmentry_l1d_flush_set, + .get = vmentry_l1d_flush_get, +}; +module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); + struct kvm_vmx { struct kvm kvm; @@ -757,6 +901,11 @@ static inline int pi_test_sn(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } +struct vmx_msrs { + unsigned int nr; + struct vmx_msr_entry val[NR_AUTOLOAD_MSRS]; +}; + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; @@ -790,9 +939,8 @@ struct vcpu_vmx { struct loaded_vmcs *loaded_vmcs; bool __launched; /* temporary, used in vmx_vcpu_run */ struct msr_autoload { - unsigned nr; - struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; - struct vmx_msr_entry host[NR_AUTOLOAD_MSRS]; + struct vmx_msrs guest; + struct vmx_msrs host; } msr_autoload; struct { int loaded; @@ -2377,9 +2525,20 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, vm_exit_controls_clearbit(vmx, exit); } +static int find_msr(struct vmx_msrs *m, unsigned int msr) +{ + unsigned int i; + + for (i = 0; i < m->nr; ++i) { + if (m->val[i].index == msr) + return i; + } + return -ENOENT; +} + static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) { - unsigned i; + int i; struct msr_autoload *m = &vmx->msr_autoload; switch (msr) { @@ -2400,18 +2559,21 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) } break; } + i = find_msr(&m->guest, msr); + if (i < 0) + goto skip_guest; + --m->guest.nr; + m->guest.val[i] = m->guest.val[m->guest.nr]; + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); - for (i = 0; i < m->nr; ++i) - if (m->guest[i].index == msr) - break; - - if (i == m->nr) +skip_guest: + i = find_msr(&m->host, msr); + if (i < 0) return; - --m->nr; - m->guest[i] = m->guest[m->nr]; - m->host[i] = m->host[m->nr]; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); + + --m->host.nr; + m->host.val[i] = m->host.val[m->host.nr]; + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); } static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, @@ -2426,9 +2588,9 @@ static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, } static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, - u64 guest_val, u64 host_val) + u64 guest_val, u64 host_val, bool entry_only) { - unsigned i; + int i, j = 0; struct msr_autoload *m = &vmx->msr_autoload; switch (msr) { @@ -2463,24 +2625,31 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, wrmsrl(MSR_IA32_PEBS_ENABLE, 0); } - for (i = 0; i < m->nr; ++i) - if (m->guest[i].index == msr) - break; + i = find_msr(&m->guest, msr); + if (!entry_only) + j = find_msr(&m->host, msr); - if (i == NR_AUTOLOAD_MSRS) { + if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) { printk_once(KERN_WARNING "Not enough msr switch entries. " "Can't add msr %x\n", msr); return; - } else if (i == m->nr) { - ++m->nr; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); } + if (i < 0) { + i = m->guest.nr++; + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); + } + m->guest.val[i].index = msr; + m->guest.val[i].value = guest_val; + + if (entry_only) + return; - m->guest[i].index = msr; - m->guest[i].value = guest_val; - m->host[i].index = msr; - m->host[i].value = host_val; + if (j < 0) { + j = m->host.nr++; + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); + } + m->host.val[j].index = msr; + m->host.val[j].value = host_val; } static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) @@ -2524,7 +2693,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) guest_efer &= ~EFER_LME; if (guest_efer != host_efer) add_atomic_switch_msr(vmx, MSR_EFER, - guest_efer, host_efer); + guest_efer, host_efer, false); return false; } else { guest_efer &= ~ignore_bits; @@ -3987,7 +4156,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.ia32_xss = data; if (vcpu->arch.ia32_xss != host_xss) add_atomic_switch_msr(vmx, MSR_IA32_XSS, - vcpu->arch.ia32_xss, host_xss); + vcpu->arch.ia32_xss, host_xss, false); else clear_atomic_switch_msr(vmx, MSR_IA32_XSS); break; @@ -6274,9 +6443,9 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); - vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); - vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); @@ -6296,8 +6465,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) ++vmx->nmsrs; } - if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities); + vmx->arch_capabilities = kvm_get_arch_capabilities(); vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); @@ -9548,6 +9716,79 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) } } +/* + * Software based L1D cache flush which is used when microcode providing + * the cache control MSR is not loaded. + * + * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to + * flush it is required to read in 64 KiB because the replacement algorithm + * is not exactly LRU. This could be sized at runtime via topology + * information but as all relevant affected CPUs have 32KiB L1D cache size + * there is no point in doing so. + */ +#define L1D_CACHE_ORDER 4 +static void *vmx_l1d_flush_pages; + +static void vmx_l1d_flush(struct kvm_vcpu *vcpu) +{ + int size = PAGE_SIZE << L1D_CACHE_ORDER; + + /* + * This code is only executed when the the flush mode is 'cond' or + * 'always' + */ + if (static_branch_likely(&vmx_l1d_flush_cond)) { + bool flush_l1d; + + /* + * Clear the per-vcpu flush bit, it gets set again + * either from vcpu_run() or from one of the unsafe + * VMEXIT handlers. + */ + flush_l1d = vcpu->arch.l1tf_flush_l1d; + vcpu->arch.l1tf_flush_l1d = false; + + /* + * Clear the per-cpu flush bit, it gets set again from + * the interrupt handlers. + */ + flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); + kvm_clear_cpu_l1tf_flush_l1d(); + + if (!flush_l1d) + return; + } + + vcpu->stat.l1d_flush++; + + if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { + wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + return; + } + + asm volatile( + /* First ensure the pages are in the TLB */ + "xorl %%eax, %%eax\n" + ".Lpopulate_tlb:\n\t" + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" + "addl $4096, %%eax\n\t" + "cmpl %%eax, %[size]\n\t" + "jne .Lpopulate_tlb\n\t" + "xorl %%eax, %%eax\n\t" + "cpuid\n\t" + /* Now fill the cache */ + "xorl %%eax, %%eax\n" + ".Lfill_cache:\n" + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" + "addl $64, %%eax\n\t" + "cmpl %%eax, %[size]\n\t" + "jne .Lfill_cache\n\t" + "lfence\n" + :: [flush_pages] "r" (vmx_l1d_flush_pages), + [size] "r" (size) + : "eax", "ebx", "ecx", "edx"); +} + static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -9949,7 +10190,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) clear_atomic_switch_msr(vmx, msrs[i].msr); else add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, - msrs[i].host); + msrs[i].host, false); } static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) @@ -10044,6 +10285,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) evmcs_rsp = static_branch_unlikely(&enable_evmcs) ? (unsigned long)¤t_evmcs->host_rsp : 0; + if (static_branch_unlikely(&vmx_l1d_should_flush)) + vmx_l1d_flush(vcpu); + asm( /* Store host registers */ "push %%" _ASM_DX "; push %%" _ASM_BP ";" @@ -10403,10 +10647,37 @@ free_vcpu: return ERR_PTR(err); } +#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" +#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" + static int vmx_vm_init(struct kvm *kvm) { if (!ple_gap) kvm->arch.pause_in_guest = true; + + if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { + switch (l1tf_mitigation) { + case L1TF_MITIGATION_OFF: + case L1TF_MITIGATION_FLUSH_NOWARN: + /* 'I explicitly don't care' is set */ + break; + case L1TF_MITIGATION_FLUSH: + case L1TF_MITIGATION_FLUSH_NOSMT: + case L1TF_MITIGATION_FULL: + /* + * Warn upon starting the first VM in a potentially + * insecure environment. + */ + if (cpu_smt_control == CPU_SMT_ENABLED) + pr_warn_once(L1TF_MSG_SMT); + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) + pr_warn_once(L1TF_MSG_L1D); + break; + case L1TF_MITIGATION_FULL_FORCE: + /* Flush is enforced */ + break; + } + } return 0; } @@ -11260,10 +11531,10 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * Set the MSR load/store lists to match L0's settings. */ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); - vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); - vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); set_cr4_guest_host_mask(vmx); @@ -11899,6 +12170,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return ret; } + /* Hide L1D cache contents from the nested guest. */ + vmx->vcpu.arch.l1tf_flush_l1d = true; + /* * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken * by event injection, halt vcpu. @@ -12419,8 +12693,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmx_segment_cache_clear(vmx); /* Update any VMCS fields that might have changed while L2 ran */ - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); if (vmx->hv_deadline_tsc == -1) vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, @@ -13137,6 +13411,51 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .enable_smi_window = enable_smi_window, }; +static void vmx_cleanup_l1d_flush(void) +{ + if (vmx_l1d_flush_pages) { + free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); + vmx_l1d_flush_pages = NULL; + } + /* Restore state so sysfs ignores VMX */ + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; +} + +static void vmx_exit(void) +{ +#ifdef CONFIG_KEXEC_CORE + RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); + synchronize_rcu(); +#endif + + kvm_exit(); + +#if IS_ENABLED(CONFIG_HYPERV) + if (static_branch_unlikely(&enable_evmcs)) { + int cpu; + struct hv_vp_assist_page *vp_ap; + /* + * Reset everything to support using non-enlightened VMCS + * access later (e.g. when we reload the module with + * enlightened_vmcs=0) + */ + for_each_online_cpu(cpu) { + vp_ap = hv_get_vp_assist_page(cpu); + + if (!vp_ap) + continue; + + vp_ap->current_nested_vmcs = 0; + vp_ap->enlighten_vmentry = 0; + } + + static_branch_disable(&enable_evmcs); + } +#endif + vmx_cleanup_l1d_flush(); +} +module_exit(vmx_exit); + static int __init vmx_init(void) { int r; @@ -13171,10 +13490,25 @@ static int __init vmx_init(void) #endif r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); + __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) return r; + /* + * Must be called after kvm_init() so enable_ept is properly set + * up. Hand the parameter mitigation value in which was stored in + * the pre module init parser. If no parameter was given, it will + * contain 'auto' which will be turned into the default 'cond' + * mitigation mode. + */ + if (boot_cpu_has(X86_BUG_L1TF)) { + r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); + if (r) { + vmx_exit(); + return r; + } + } + #ifdef CONFIG_KEXEC_CORE rcu_assign_pointer(crash_vmclear_loaded_vmcss, crash_vmclear_local_loaded_vmcss); @@ -13183,39 +13517,4 @@ static int __init vmx_init(void) return 0; } - -static void __exit vmx_exit(void) -{ -#ifdef CONFIG_KEXEC_CORE - RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); - synchronize_rcu(); -#endif - - kvm_exit(); - -#if IS_ENABLED(CONFIG_HYPERV) - if (static_branch_unlikely(&enable_evmcs)) { - int cpu; - struct hv_vp_assist_page *vp_ap; - /* - * Reset everything to support using non-enlightened VMCS - * access later (e.g. when we reload the module with - * enlightened_vmcs=0) - */ - for_each_online_cpu(cpu) { - vp_ap = hv_get_vp_assist_page(cpu); - - if (!vp_ap) - continue; - - vp_ap->current_nested_vmcs = 0; - vp_ap->enlighten_vmentry = 0; - } - - static_branch_disable(&enable_evmcs); - } -#endif -} - -module_init(vmx_init) -module_exit(vmx_exit) +module_init(vmx_init); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2b812b3c5088..a5caa5e5480c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -195,6 +195,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "irq_injections", VCPU_STAT(irq_injections) }, { "nmi_injections", VCPU_STAT(nmi_injections) }, { "req_event", VCPU_STAT(req_event) }, + { "l1d_flush", VCPU_STAT(l1d_flush) }, { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, { "mmu_pte_write", VM_STAT(mmu_pte_write) }, { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, @@ -1102,11 +1103,35 @@ static u32 msr_based_features[] = { static unsigned int num_msr_based_features; +u64 kvm_get_arch_capabilities(void) +{ + u64 data; + + rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data); + + /* + * If we're doing cache flushes (either "always" or "cond") + * we will do one whenever the guest does a vmlaunch/vmresume. + * If an outer hypervisor is doing the cache flush for us + * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that + * capability to the guest too, and if EPT is disabled we're not + * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will + * require a nested hypervisor to do a flush of its own. + */ + if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) + data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; + + return data; +} +EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities); + static int kvm_get_msr_feature(struct kvm_msr_entry *msr) { switch (msr->index) { - case MSR_IA32_UCODE_REV: case MSR_IA32_ARCH_CAPABILITIES: + msr->data = kvm_get_arch_capabilities(); + break; + case MSR_IA32_UCODE_REV: rdmsrl_safe(msr->index, &msr->data); break; default: @@ -4876,6 +4901,9 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { + /* kvm_write_guest_virt_system can pull in tons of pages. */ + vcpu->arch.l1tf_flush_l1d = true; + return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, PFERR_WRITE_MASK, exception); } @@ -6052,6 +6080,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, bool writeback = true; bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; + vcpu->arch.l1tf_flush_l1d = true; + /* * Clear write_fault_to_shadow_pgtable here to ensure it is * never reused. @@ -7581,6 +7611,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) struct kvm *kvm = vcpu->kvm; vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + vcpu->arch.l1tf_flush_l1d = true; for (;;) { if (kvm_vcpu_running(vcpu)) { @@ -8700,6 +8731,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) { + vcpu->arch.l1tf_flush_l1d = true; kvm_x86_ops->sched_in(vcpu, cpu); } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 74b157ac078d..156ed8154af8 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -4,6 +4,8 @@ #include <linux/swap.h> #include <linux/memblock.h> #include <linux/bootmem.h> /* for max_low_pfn */ +#include <linux/swapfile.h> +#include <linux/swapops.h> #include <asm/set_memory.h> #include <asm/e820/api.h> @@ -911,3 +913,24 @@ void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) __cachemode2pte_tbl[cache] = __cm_idx2pte(entry); __pte2cachemode_tbl[entry] = cache; } + +unsigned long max_swapfile_size(void) +{ + unsigned long pages; + + pages = generic_max_swapfile_size(); + + if (boot_cpu_has_bug(X86_BUG_L1TF)) { + /* Limit the swap file size to MAX_PA/2 for L1TF workaround */ + unsigned long l1tf_limit = l1tf_pfn_limit() + 1; + /* + * We encode swap offsets also with 3 bits below those for pfn + * which makes the usable limit higher. + */ +#if CONFIG_PGTABLE_LEVELS > 2 + l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT; +#endif + pages = min_t(unsigned long, l1tf_limit, pages); + } + return pages; +} diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 7c8686709636..79eb55ce69a9 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -126,24 +126,29 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr) static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old) { + pmd_t new_pmd; pmdval_t v = pmd_val(*pmd); if (clear) { - *old = v & _PAGE_PRESENT; - v &= ~_PAGE_PRESENT; - } else /* presume this has been called with clear==true previously */ - v |= *old; - set_pmd(pmd, __pmd(v)); + *old = v; + new_pmd = pmd_mknotpresent(*pmd); + } else { + /* Presume this has been called with clear==true previously */ + new_pmd = __pmd(*old); + } + set_pmd(pmd, new_pmd); } static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old) { pteval_t v = pte_val(*pte); if (clear) { - *old = v & _PAGE_PRESENT; - v &= ~_PAGE_PRESENT; - } else /* presume this has been called with clear==true previously */ - v |= *old; - set_pte_atomic(pte, __pte(v)); + *old = v; + /* Nothing should care about address */ + pte_clear(&init_mm, 0, pte); + } else { + /* Presume this has been called with clear==true previously */ + set_pte_atomic(pte, __pte(*old)); + } } static int clear_page_presence(struct kmmio_fault_page *f, bool clear) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 48c591251600..f40ab8185d94 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -240,3 +240,24 @@ int valid_mmap_phys_addr_range(unsigned long pfn, size_t count) return phys_addr_valid(addr + count - 1); } + +/* + * Only allow root to set high MMIO mappings to PROT_NONE. + * This prevents an unpriv. user to set them to PROT_NONE and invert + * them, then pointing to valid memory for L1TF speculation. + * + * Note: for locked down kernels may want to disable the root override. + */ +bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) +{ + if (!boot_cpu_has_bug(X86_BUG_L1TF)) + return true; + if (!__pte_needs_invert(pgprot_val(prot))) + return true; + /* If it's real memory always allow */ + if (pfn_valid(pfn)) + return true; + if (pfn > l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN)) + return false; + return true; +} diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 0a74996a1149..8d6c34fe49be 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1015,8 +1015,8 @@ static long populate_pmd(struct cpa_data *cpa, pmd = pmd_offset(pud, start); - set_pmd(pmd, __pmd(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | - massage_pgprot(pmd_pgprot))); + set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn, + canon_pgprot(pmd_pgprot)))); start += PMD_SIZE; cpa->pfn += PMD_SIZE >> PAGE_SHIFT; @@ -1088,8 +1088,8 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d, * Map everything starting from the Gb boundary, possibly with 1G pages */ while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) { - set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | - massage_pgprot(pud_pgprot))); + set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn, + canon_pgprot(pud_pgprot)))); start += PUD_SIZE; cpa->pfn += PUD_SIZE >> PAGE_SHIFT; diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index d58b4aba9510..31341ae7309f 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -45,6 +45,7 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/desc.h> +#include <asm/sections.h> #undef pr_fmt #define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c index 4f5fa65a1011..2acd6be13375 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c @@ -18,6 +18,7 @@ #include <asm/intel-mid.h> #include <asm/intel_scu_ipc.h> #include <asm/io_apic.h> +#include <asm/hw_irq.h> #define TANGIER_EXT_TIMER0_MSI 12 diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index e26dfad507c8..a4130b84d1ff 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1285,6 +1285,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs) struct msg_desc msgdesc; ack_APIC_irq(); + kvm_set_cpu_l1tf_flush_l1d(); time_start = get_cycles(); bcp = &per_cpu(bau_control, smp_processor_id()); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 3b5318505c69..2eeddd814653 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -3,6 +3,7 @@ #endif #include <linux/cpu.h> #include <linux/kexec.h> +#include <linux/slab.h> #include <xen/features.h> #include <xen/page.h> |