diff options
Diffstat (limited to 'arch/x86')
50 files changed, 661 insertions, 290 deletions
diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index eaa843a52907..a480356e0ed8 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -25,20 +25,6 @@ ENTRY(get_sev_encryption_bit) push %ebx push %ecx push %edx - push %edi - - /* - * RIP-relative addressing is needed to access the encryption bit - * variable. Since we are running in 32-bit mode we need this call/pop - * sequence to get the proper relative addressing. - */ - call 1f -1: popl %edi - subl $1b, %edi - - movl enc_bit(%edi), %eax - cmpl $0, %eax - jge .Lsev_exit /* Check if running under a hypervisor */ movl $1, %eax @@ -69,15 +55,12 @@ ENTRY(get_sev_encryption_bit) movl %ebx, %eax andl $0x3f, %eax /* Return the encryption bit location */ - movl %eax, enc_bit(%edi) jmp .Lsev_exit .Lno_sev: xor %eax, %eax - movl %eax, enc_bit(%edi) .Lsev_exit: - pop %edi pop %edx pop %ecx pop %ebx @@ -113,8 +96,6 @@ ENTRY(set_sev_encryption_mask) ENDPROC(set_sev_encryption_mask) .data -enc_bit: - .int 0xffffffff #ifdef CONFIG_AMD_MEM_ENCRYPT .balign 8 diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index acd11b3bf639..2a356b948720 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -379,7 +379,6 @@ static int __init crypto_aegis128_aesni_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || !boot_cpu_has(X86_FEATURE_AES) || - !boot_cpu_has(X86_FEATURE_OSXSAVE) || !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c index 2071c3d1ae07..dbe8bb980da1 100644 --- a/arch/x86/crypto/aegis128l-aesni-glue.c +++ b/arch/x86/crypto/aegis128l-aesni-glue.c @@ -379,7 +379,6 @@ static int __init crypto_aegis128l_aesni_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || !boot_cpu_has(X86_FEATURE_AES) || - !boot_cpu_has(X86_FEATURE_OSXSAVE) || !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c index b5f2a8fd5a71..8bebda2de92f 100644 --- a/arch/x86/crypto/aegis256-aesni-glue.c +++ b/arch/x86/crypto/aegis256-aesni-glue.c @@ -379,7 +379,6 @@ static int __init crypto_aegis256_aesni_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || !boot_cpu_has(X86_FEATURE_AES) || - !boot_cpu_has(X86_FEATURE_OSXSAVE) || !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c index 95cf857d2cbb..f40244eaf14d 100644 --- a/arch/x86/crypto/morus1280-sse2-glue.c +++ b/arch/x86/crypto/morus1280-sse2-glue.c @@ -40,7 +40,6 @@ MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350); static int __init crypto_morus1280_sse2_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || - !boot_cpu_has(X86_FEATURE_OSXSAVE) || !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c index 615fb7bc9a32..9afaf8f8565a 100644 --- a/arch/x86/crypto/morus640-sse2-glue.c +++ b/arch/x86/crypto/morus640-sse2-glue.c @@ -40,7 +40,6 @@ MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400); static int __init crypto_morus640_sse2_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || - !boot_cpu_has(X86_FEATURE_OSXSAVE) || !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index f3e006bed9a7..c88ed39582a1 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1272,4 +1272,8 @@ void intel_pmu_lbr_init_knl(void) x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = snb_lbr_sel_map; + + /* Knights Landing does have MISPREDICT bit */ + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_LIP) + x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS; } diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 5b0f613428c2..2c43e3055948 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -95,8 +95,8 @@ static void hv_apic_eoi_write(u32 reg, u32 val) */ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector) { - struct ipi_arg_ex **arg; - struct ipi_arg_ex *ipi_arg; + struct hv_send_ipi_ex **arg; + struct hv_send_ipi_ex *ipi_arg; unsigned long flags; int nr_bank = 0; int ret = 1; @@ -105,7 +105,7 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector) return false; local_irq_save(flags); - arg = (struct ipi_arg_ex **)this_cpu_ptr(hyperv_pcpu_input_arg); + arg = (struct hv_send_ipi_ex **)this_cpu_ptr(hyperv_pcpu_input_arg); ipi_arg = *arg; if (unlikely(!ipi_arg)) @@ -135,7 +135,7 @@ ipi_mask_ex_done: static bool __send_ipi_mask(const struct cpumask *mask, int vector) { int cur_cpu, vcpu; - struct ipi_arg_non_ex ipi_arg; + struct hv_send_ipi ipi_arg; int ret = 1; trace_hyperv_send_ipi_mask(mask, vector); diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index b143717b92b3..ce84388e540c 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -80,11 +80,11 @@ static __always_inline void arch_atomic_sub(int i, atomic_t *v) * true if the result is zero, or false for all * other cases. */ -#define arch_atomic_sub_and_test arch_atomic_sub_and_test static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", e); } +#define arch_atomic_sub_and_test arch_atomic_sub_and_test /** * arch_atomic_inc - increment atomic variable @@ -92,12 +92,12 @@ static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) * * Atomically increments @v by 1. */ -#define arch_atomic_inc arch_atomic_inc static __always_inline void arch_atomic_inc(atomic_t *v) { asm volatile(LOCK_PREFIX "incl %0" : "+m" (v->counter)); } +#define arch_atomic_inc arch_atomic_inc /** * arch_atomic_dec - decrement atomic variable @@ -105,12 +105,12 @@ static __always_inline void arch_atomic_inc(atomic_t *v) * * Atomically decrements @v by 1. */ -#define arch_atomic_dec arch_atomic_dec static __always_inline void arch_atomic_dec(atomic_t *v) { asm volatile(LOCK_PREFIX "decl %0" : "+m" (v->counter)); } +#define arch_atomic_dec arch_atomic_dec /** * arch_atomic_dec_and_test - decrement and test @@ -120,11 +120,11 @@ static __always_inline void arch_atomic_dec(atomic_t *v) * returns true if the result is 0, or false for all other * cases. */ -#define arch_atomic_dec_and_test arch_atomic_dec_and_test static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", e); } +#define arch_atomic_dec_and_test arch_atomic_dec_and_test /** * arch_atomic_inc_and_test - increment and test @@ -134,11 +134,11 @@ static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) * and returns true if the result is zero, or false for all * other cases. */ -#define arch_atomic_inc_and_test arch_atomic_inc_and_test static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", e); } +#define arch_atomic_inc_and_test arch_atomic_inc_and_test /** * arch_atomic_add_negative - add and test if negative @@ -149,11 +149,11 @@ static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ -#define arch_atomic_add_negative arch_atomic_add_negative static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", s); } +#define arch_atomic_add_negative arch_atomic_add_negative /** * arch_atomic_add_return - add integer and return diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index ef959f02d070..6a5b0ec460da 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -205,12 +205,12 @@ static inline long long arch_atomic64_sub(long long i, atomic64_t *v) * * Atomically increments @v by 1. */ -#define arch_atomic64_inc arch_atomic64_inc static inline void arch_atomic64_inc(atomic64_t *v) { __alternative_atomic64(inc, inc_return, /* no output */, "S" (v) : "memory", "eax", "ecx", "edx"); } +#define arch_atomic64_inc arch_atomic64_inc /** * arch_atomic64_dec - decrement atomic64 variable @@ -218,12 +218,12 @@ static inline void arch_atomic64_inc(atomic64_t *v) * * Atomically decrements @v by 1. */ -#define arch_atomic64_dec arch_atomic64_dec static inline void arch_atomic64_dec(atomic64_t *v) { __alternative_atomic64(dec, dec_return, /* no output */, "S" (v) : "memory", "eax", "ecx", "edx"); } +#define arch_atomic64_dec arch_atomic64_dec /** * arch_atomic64_add_unless - add unless the number is a given value @@ -245,7 +245,6 @@ static inline int arch_atomic64_add_unless(atomic64_t *v, long long a, return (int)a; } -#define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero static inline int arch_atomic64_inc_not_zero(atomic64_t *v) { int r; @@ -253,8 +252,8 @@ static inline int arch_atomic64_inc_not_zero(atomic64_t *v) "S" (v) : "ecx", "edx", "memory"); return r; } +#define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero -#define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive static inline long long arch_atomic64_dec_if_positive(atomic64_t *v) { long long r; @@ -262,6 +261,7 @@ static inline long long arch_atomic64_dec_if_positive(atomic64_t *v) "S" (v) : "ecx", "memory"); return r; } +#define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive #undef alternative_atomic64 #undef __alternative_atomic64 diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 4343d9b4f30e..5f851d92eecd 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -71,11 +71,11 @@ static inline void arch_atomic64_sub(long i, atomic64_t *v) * true if the result is zero, or false for all * other cases. */ -#define arch_atomic64_sub_and_test arch_atomic64_sub_and_test static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", e); } +#define arch_atomic64_sub_and_test arch_atomic64_sub_and_test /** * arch_atomic64_inc - increment atomic64 variable @@ -83,13 +83,13 @@ static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v) * * Atomically increments @v by 1. */ -#define arch_atomic64_inc arch_atomic64_inc static __always_inline void arch_atomic64_inc(atomic64_t *v) { asm volatile(LOCK_PREFIX "incq %0" : "=m" (v->counter) : "m" (v->counter)); } +#define arch_atomic64_inc arch_atomic64_inc /** * arch_atomic64_dec - decrement atomic64 variable @@ -97,13 +97,13 @@ static __always_inline void arch_atomic64_inc(atomic64_t *v) * * Atomically decrements @v by 1. */ -#define arch_atomic64_dec arch_atomic64_dec static __always_inline void arch_atomic64_dec(atomic64_t *v) { asm volatile(LOCK_PREFIX "decq %0" : "=m" (v->counter) : "m" (v->counter)); } +#define arch_atomic64_dec arch_atomic64_dec /** * arch_atomic64_dec_and_test - decrement and test @@ -113,11 +113,11 @@ static __always_inline void arch_atomic64_dec(atomic64_t *v) * returns true if the result is 0, or false for all other * cases. */ -#define arch_atomic64_dec_and_test arch_atomic64_dec_and_test static inline bool arch_atomic64_dec_and_test(atomic64_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", e); } +#define arch_atomic64_dec_and_test arch_atomic64_dec_and_test /** * arch_atomic64_inc_and_test - increment and test @@ -127,11 +127,11 @@ static inline bool arch_atomic64_dec_and_test(atomic64_t *v) * and returns true if the result is zero, or false for all * other cases. */ -#define arch_atomic64_inc_and_test arch_atomic64_inc_and_test static inline bool arch_atomic64_inc_and_test(atomic64_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", e); } +#define arch_atomic64_inc_and_test arch_atomic64_inc_and_test /** * arch_atomic64_add_negative - add and test if negative @@ -142,11 +142,11 @@ static inline bool arch_atomic64_inc_and_test(atomic64_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ -#define arch_atomic64_add_negative arch_atomic64_add_negative static inline bool arch_atomic64_add_negative(long i, atomic64_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, "er", i, "%0", s); } +#define arch_atomic64_add_negative arch_atomic64_add_negative /** * arch_atomic64_add_return - add and return diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index e203169931c7..6390bd8c141b 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -14,6 +14,16 @@ #ifndef _ASM_X86_FIXMAP_H #define _ASM_X86_FIXMAP_H +/* + * Exposed to assembly code for setting up initial page tables. Cannot be + * calculated in assembly code (fixmap entries are an enum), but is sanity + * checked in the actual fixmap C code to make sure that the fixmap is + * covered fully. + */ +#define FIXMAP_PMD_NUM 2 +/* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */ +#define FIXMAP_PMD_TOP 507 + #ifndef __ASSEMBLY__ #include <linux/kernel.h> #include <asm/acpi.h> diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index e977b6b3a538..00e01d215f74 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -726,19 +726,21 @@ struct hv_enlightened_vmcs { #define HV_STIMER_AUTOENABLE (1ULL << 3) #define HV_STIMER_SINT(config) (__u8)(((config) >> 16) & 0x0F) -struct ipi_arg_non_ex { - u32 vector; - u32 reserved; - u64 cpu_mask; -}; - struct hv_vpset { u64 format; u64 valid_bank_mask; u64 bank_contents[]; }; -struct ipi_arg_ex { +/* HvCallSendSyntheticClusterIpi hypercall */ +struct hv_send_ipi { + u32 vector; + u32 reserved; + u64 cpu_mask; +}; + +/* HvCallSendSyntheticClusterIpiEx hypercall */ +struct hv_send_ipi_ex { u32 vector; u32 reserved; struct hv_vpset vp_set; diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index 395c9631e000..75f1e35e7c15 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -22,10 +22,20 @@ enum die_val { DIE_NMIUNKNOWN, }; +enum show_regs_mode { + SHOW_REGS_SHORT, + /* + * For when userspace crashed, but we don't think it's our fault, and + * therefore don't print kernel registers. + */ + SHOW_REGS_USER, + SHOW_REGS_ALL +}; + extern void die(const char *, struct pt_regs *,long); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_stack_regs(struct pt_regs *regs); -extern void __show_regs(struct pt_regs *regs, int all); +extern void __show_regs(struct pt_regs *regs, enum show_regs_mode); extern void show_iret_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 00ddb0c9e612..09b2e3e2cf1b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -869,6 +869,8 @@ struct kvm_arch { bool x2apic_format; bool x2apic_broadcast_quirk_disabled; + + bool guest_can_read_msr_platform_info; }; struct kvm_vm_stat { @@ -1022,6 +1024,7 @@ struct kvm_x86_ops { void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); + bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); @@ -1055,6 +1058,7 @@ struct kvm_x86_ops { bool (*umip_emulated)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); + void (*request_immediate_exit)(struct kvm_vcpu *vcpu); void (*sched_in)(struct kvm_vcpu *kvm, int cpu); @@ -1237,19 +1241,12 @@ enum emulation_result { #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_SKIP (1 << 2) -#define EMULTYPE_RETRY (1 << 3) -#define EMULTYPE_NO_REEXECUTE (1 << 4) -#define EMULTYPE_NO_UD_ON_FAIL (1 << 5) -#define EMULTYPE_VMWARE (1 << 6) -int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, - int emulation_type, void *insn, int insn_len); - -static inline int emulate_instruction(struct kvm_vcpu *vcpu, - int emulation_type) -{ - return x86_emulate_instruction(vcpu, 0, - emulation_type | EMULTYPE_NO_REEXECUTE, NULL, 0); -} +#define EMULTYPE_ALLOW_RETRY (1 << 3) +#define EMULTYPE_NO_UD_ON_FAIL (1 << 4) +#define EMULTYPE_VMWARE (1 << 5) +int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type); +int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, + void *insn, int insn_len); void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); @@ -1450,7 +1447,6 @@ asmlinkage void kvm_spurious_fault(void); ____kvm_handle_fault_on_reboot(insn, "") #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); @@ -1463,7 +1459,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, - unsigned long ipi_bitmap_high, int min, + unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit); u64 kvm_get_arch_capabilities(void); @@ -1490,6 +1486,7 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); +void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); int kvm_is_in_guest(void); diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index c0643831706e..616f8e637bc3 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -48,10 +48,13 @@ int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size); /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void); +void __init mem_encrypt_free_decrypted_mem(void); bool sme_active(void); bool sev_active(void); +#define __bss_decrypted __attribute__((__section__(".bss..decrypted"))) + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define sme_me_mask 0ULL @@ -77,6 +80,8 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; static inline int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; } +#define __bss_decrypted + #endif /* CONFIG_AMD_MEM_ENCRYPT */ /* @@ -88,6 +93,8 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; #define __sme_pa(x) (__pa(x) | sme_me_mask) #define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask) +extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[]; + #endif /* __ASSEMBLY__ */ #endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 24c6cf5f16b7..60d0f9015317 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -19,9 +19,6 @@ static inline void native_set_pte(pte_t *ptep , pte_t pte) static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION - pmd.pud.p4d.pgd = pti_set_user_pgtbl(&pmdp->pud.p4d.pgd, pmd.pud.p4d.pgd); -#endif *pmdp = pmd; } @@ -61,9 +58,6 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp) #ifdef CONFIG_SMP static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION - pti_set_user_pgtbl(&xp->pud.p4d.pgd, __pgd(0)); -#endif return __pmd(xchg((pmdval_t *)xp, 0)); } #else @@ -73,9 +67,6 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) #ifdef CONFIG_SMP static inline pud_t native_pudp_get_and_clear(pud_t *xp) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION - pti_set_user_pgtbl(&xp->p4d.pgd, __pgd(0)); -#endif return __pud(xchg((pudval_t *)xp, 0)); } #else diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e4ffa565a69f..690c0307afed 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1195,7 +1195,7 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, return xchg(pmdp, pmd); } else { pmd_t old = *pmdp; - *pmdp = pmd; + WRITE_ONCE(*pmdp, pmd); return old; } } diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index f773d5e6c8cc..9c85b54bf03c 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -14,6 +14,7 @@ #include <asm/processor.h> #include <linux/bitops.h> #include <linux/threads.h> +#include <asm/fixmap.h> extern p4d_t level4_kernel_pgt[512]; extern p4d_t level4_ident_pgt[512]; @@ -22,7 +23,7 @@ extern pud_t level3_ident_pgt[512]; extern pmd_t level2_kernel_pgt[512]; extern pmd_t level2_fixmap_pgt[512]; extern pmd_t level2_ident_pgt[512]; -extern pte_t level1_fixmap_pgt[512]; +extern pte_t level1_fixmap_pgt[512 * FIXMAP_PMD_NUM]; extern pgd_t init_top_pgt[]; #define swapper_pg_dir init_top_pgt @@ -55,15 +56,15 @@ struct mm_struct; void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte); void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); -static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) +static inline void native_set_pte(pte_t *ptep, pte_t pte) { - *ptep = native_make_pte(0); + WRITE_ONCE(*ptep, pte); } -static inline void native_set_pte(pte_t *ptep, pte_t pte) +static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) { - *ptep = pte; + native_set_pte(ptep, native_make_pte(0)); } static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) @@ -73,7 +74,7 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) { - *pmdp = pmd; + WRITE_ONCE(*pmdp, pmd); } static inline void native_pmd_clear(pmd_t *pmd) @@ -109,7 +110,7 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) static inline void native_set_pud(pud_t *pudp, pud_t pud) { - *pudp = pud; + WRITE_ONCE(*pudp, pud); } static inline void native_pud_clear(pud_t *pud) @@ -137,13 +138,13 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) pgd_t pgd; if (pgtable_l5_enabled() || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) { - *p4dp = p4d; + WRITE_ONCE(*p4dp, p4d); return; } pgd = native_make_pgd(native_p4d_val(p4d)); pgd = pti_set_user_pgtbl((pgd_t *)p4dp, pgd); - *p4dp = native_make_p4d(native_pgd_val(pgd)); + WRITE_ONCE(*p4dp, native_make_p4d(native_pgd_val(pgd))); } static inline void native_p4d_clear(p4d_t *p4d) @@ -153,7 +154,7 @@ static inline void native_p4d_clear(p4d_t *p4d) static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { - *pgdp = pti_set_user_pgtbl(pgdp, pgd); + WRITE_ONCE(*pgdp, pti_set_user_pgtbl(pgdp, pgd)); } static inline void native_pgd_clear(pgd_t *pgd) diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 86299efa804a..fd23d5778ea1 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -377,6 +377,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) #define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) +#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_RUN_PENDING 0x00000002 diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 9f148e3d45b4..7654febd5102 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -413,7 +413,7 @@ static int activate_managed(struct irq_data *irqd) if (WARN_ON_ONCE(cpumask_empty(vector_searchmask))) { /* Something in the core code broke! Survive gracefully */ pr_err("Managed startup for irq %u, but no CPU\n", irqd->irq); - return EINVAL; + return -EINVAL; } ret = assign_managed_vector(irqd, vector_searchmask); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index ec00d1ff5098..f7151cd03cb0 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1640,6 +1640,7 @@ static int do_open(struct inode *inode, struct file *filp) return 0; } +#ifdef CONFIG_PROC_FS static int proc_apm_show(struct seq_file *m, void *v) { unsigned short bx; @@ -1719,6 +1720,7 @@ static int proc_apm_show(struct seq_file *m, void *v) units); return 0; } +#endif static int apm(void *unused) { diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 4e588f36228f..285eb3ec4200 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -382,6 +382,11 @@ static inline bool is_mbm_event(int e) e <= QOS_L3_MBM_LOCAL_EVENT_ID); } +struct rdt_parse_data { + struct rdtgroup *rdtgrp; + char *buf; +}; + /** * struct rdt_resource - attributes of an RDT resource * @rid: The index of the resource @@ -423,16 +428,19 @@ struct rdt_resource { struct rdt_cache cache; struct rdt_membw membw; const char *format_str; - int (*parse_ctrlval) (void *data, struct rdt_resource *r, - struct rdt_domain *d); + int (*parse_ctrlval)(struct rdt_parse_data *data, + struct rdt_resource *r, + struct rdt_domain *d); struct list_head evt_list; int num_rmid; unsigned int mon_scale; unsigned long fflags; }; -int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d); -int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d); +int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d); +int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d); extern struct mutex rdtgroup_mutex; @@ -536,6 +544,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); int update_domains(struct rdt_resource *r, int closid); +int closids_supported(void); void closid_free(int closid); int alloc_rmid(void); void free_rmid(u32 rmid); diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index af358ca05160..0f53049719cd 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -64,19 +64,19 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) return true; } -int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d) +int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d) { - unsigned long data; - char *buf = _buf; + unsigned long bw_val; if (d->have_new_ctrl) { rdt_last_cmd_printf("duplicate domain %d\n", d->id); return -EINVAL; } - if (!bw_validate(buf, &data, r)) + if (!bw_validate(data->buf, &bw_val, r)) return -EINVAL; - d->new_ctrl = data; + d->new_ctrl = bw_val; d->have_new_ctrl = true; return 0; @@ -123,18 +123,13 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) return true; } -struct rdt_cbm_parse_data { - struct rdtgroup *rdtgrp; - char *buf; -}; - /* * Read one cache bit mask (hex). Check that it is valid for the current * resource type. */ -int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d) +int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d) { - struct rdt_cbm_parse_data *data = _data; struct rdtgroup *rdtgrp = data->rdtgrp; u32 cbm_val; @@ -195,11 +190,17 @@ int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d) static int parse_line(char *line, struct rdt_resource *r, struct rdtgroup *rdtgrp) { - struct rdt_cbm_parse_data data; + struct rdt_parse_data data; char *dom = NULL, *id; struct rdt_domain *d; unsigned long dom_id; + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && + r->rid == RDT_RESOURCE_MBA) { + rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); + return -EINVAL; + } + next: if (!line || line[0] == '\0') return 0; diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index b799c00bef09..1b8e86a5d5e1 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -97,6 +97,12 @@ void rdt_last_cmd_printf(const char *fmt, ...) * limited as the number of resources grows. */ static int closid_free_map; +static int closid_free_map_len; + +int closids_supported(void) +{ + return closid_free_map_len; +} static void closid_init(void) { @@ -111,6 +117,7 @@ static void closid_init(void) /* CLOSID 0 is always reserved for the default group */ closid_free_map &= ~1; + closid_free_map_len = rdt_min_closid; } static int closid_alloc(void) @@ -802,7 +809,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, sw_shareable = 0; exclusive = 0; seq_printf(seq, "%d=", dom->id); - for (i = 0; i < r->num_closid; i++, ctrl++) { + for (i = 0; i < closids_supported(); i++, ctrl++) { if (!closid_allocated(i)) continue; mode = rdtgroup_mode_by_closid(i); @@ -989,7 +996,7 @@ bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, /* Check for overlap with other resource groups */ ctrl = d->ctrl_val; - for (i = 0; i < r->num_closid; i++, ctrl++) { + for (i = 0; i < closids_supported(); i++, ctrl++) { ctrl_b = (unsigned long *)ctrl; mode = rdtgroup_mode_by_closid(i); if (closid_allocated(i) && i != closid && @@ -1024,16 +1031,27 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) { int closid = rdtgrp->closid; struct rdt_resource *r; + bool has_cache = false; struct rdt_domain *d; for_each_alloc_enabled_rdt_resource(r) { + if (r->rid == RDT_RESOURCE_MBA) + continue; + has_cache = true; list_for_each_entry(d, &r->domains, list) { if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid], - rdtgrp->closid, false)) + rdtgrp->closid, false)) { + rdt_last_cmd_puts("schemata overlaps\n"); return false; + } } } + if (!has_cache) { + rdt_last_cmd_puts("cannot be exclusive without CAT/CDP\n"); + return false; + } + return true; } @@ -1085,7 +1103,6 @@ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, rdtgrp->mode = RDT_MODE_SHAREABLE; } else if (!strcmp(buf, "exclusive")) { if (!rdtgroup_mode_test_exclusive(rdtgrp)) { - rdt_last_cmd_printf("schemata overlaps\n"); ret = -EINVAL; goto out; } @@ -1155,8 +1172,8 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, struct rdt_resource *r; struct rdt_domain *d; unsigned int size; - bool sep = false; - u32 cbm; + bool sep; + u32 ctrl; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { @@ -1174,6 +1191,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, } for_each_alloc_enabled_rdt_resource(r) { + sep = false; seq_printf(s, "%*s:", max_name_width, r->name); list_for_each_entry(d, &r->domains, list) { if (sep) @@ -1181,8 +1199,13 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { size = 0; } else { - cbm = d->ctrl_val[rdtgrp->closid]; - size = rdtgroup_cbm_to_size(r, d, cbm); + ctrl = (!is_mba_sc(r) ? + d->ctrl_val[rdtgrp->closid] : + d->mbps_val[rdtgrp->closid]); + if (r->rid == RDT_RESOURCE_MBA) + size = ctrl; + else + size = rdtgroup_cbm_to_size(r, d, ctrl); } seq_printf(s, "%d=%u", d->id, size); sep = true; @@ -2336,12 +2359,18 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) u32 *ctrl; for_each_alloc_enabled_rdt_resource(r) { + /* + * Only initialize default allocations for CBM cache + * resources + */ + if (r->rid == RDT_RESOURCE_MBA) + continue; list_for_each_entry(d, &r->domains, list) { d->have_new_ctrl = false; d->new_ctrl = r->cache.shareable_bits; used_b = r->cache.shareable_bits; ctrl = d->ctrl_val; - for (i = 0; i < r->num_closid; i++, ctrl++) { + for (i = 0; i < closids_supported(); i++, ctrl++) { if (closid_allocated(i) && i != closid) { mode = rdtgroup_mode_by_closid(i); if (mode == RDT_MODE_PSEUDO_LOCKSETUP) @@ -2373,6 +2402,12 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) } for_each_alloc_enabled_rdt_resource(r) { + /* + * Only initialize default allocations for CBM cache + * resources + */ + if (r->rid == RDT_RESOURCE_MBA) + continue; ret = update_domains(r, rdtgrp->closid); if (ret < 0) { rdt_last_cmd_puts("failed to initialize allocations\n"); diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 0624957aa068..07b5fc00b188 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -504,6 +504,7 @@ static enum ucode_state apply_microcode_amd(int cpu) struct microcode_amd *mc_amd; struct ucode_cpu_info *uci; struct ucode_patch *p; + enum ucode_state ret; u32 rev, dummy; BUG_ON(raw_smp_processor_id() != cpu); @@ -521,9 +522,8 @@ static enum ucode_state apply_microcode_amd(int cpu) /* need to apply patch? */ if (rev >= mc_amd->hdr.patch_id) { - c->microcode = rev; - uci->cpu_sig.rev = rev; - return UCODE_OK; + ret = UCODE_OK; + goto out; } if (__apply_microcode_amd(mc_amd)) { @@ -531,13 +531,21 @@ static enum ucode_state apply_microcode_amd(int cpu) cpu, mc_amd->hdr.patch_id); return UCODE_ERROR; } - pr_info("CPU%d: new patch_level=0x%08x\n", cpu, - mc_amd->hdr.patch_id); - uci->cpu_sig.rev = mc_amd->hdr.patch_id; - c->microcode = mc_amd->hdr.patch_id; + rev = mc_amd->hdr.patch_id; + ret = UCODE_UPDATED; + + pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev); - return UCODE_UPDATED; +out: + uci->cpu_sig.rev = rev; + c->microcode = rev; + + /* Update boot_cpu_data's revision too, if we're on the BSP: */ + if (c->cpu_index == boot_cpu_data.cpu_index) + boot_cpu_data.microcode = rev; + + return ret; } static int install_equiv_cpu_table(const u8 *buf) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 97ccf4c3b45b..16936a24795c 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -795,6 +795,7 @@ static enum ucode_state apply_microcode_intel(int cpu) struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct cpuinfo_x86 *c = &cpu_data(cpu); struct microcode_intel *mc; + enum ucode_state ret; static int prev_rev; u32 rev; @@ -817,9 +818,8 @@ static enum ucode_state apply_microcode_intel(int cpu) */ rev = intel_get_microcode_revision(); if (rev >= mc->hdr.rev) { - uci->cpu_sig.rev = rev; - c->microcode = rev; - return UCODE_OK; + ret = UCODE_OK; + goto out; } /* @@ -848,10 +848,17 @@ static enum ucode_state apply_microcode_intel(int cpu) prev_rev = rev; } + ret = UCODE_UPDATED; + +out: uci->cpu_sig.rev = rev; - c->microcode = rev; + c->microcode = rev; + + /* Update boot_cpu_data's revision too, if we're on the BSP: */ + if (c->cpu_index == boot_cpu_data.cpu_index) + boot_cpu_data.microcode = rev; - return UCODE_UPDATED; + return ret; } static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index f56895106ccf..2b5886401e5f 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -146,7 +146,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, * they can be printed in the right context. */ if (!partial && on_stack(info, regs, sizeof(*regs))) { - __show_regs(regs, 0); + __show_regs(regs, SHOW_REGS_SHORT); } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET, IRET_FRAME_SIZE)) { @@ -344,7 +344,7 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr) oops_exit(); /* Executive summary in case the oops scrolled away */ - __show_regs(&exec_summary_regs, true); + __show_regs(&exec_summary_regs, SHOW_REGS_ALL); if (!signr) return; @@ -407,14 +407,9 @@ void die(const char *str, struct pt_regs *regs, long err) void show_regs(struct pt_regs *regs) { - bool all = true; - show_regs_print_info(KERN_DEFAULT); - if (IS_ENABLED(CONFIG_X86_32)) - all = !user_mode(regs); - - __show_regs(regs, all); + __show_regs(regs, user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL); /* * When in-kernel, we also print out the stack at the time of the fault.. diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c index f260e452e4f8..e8c8c5d78dbd 100644 --- a/arch/x86/kernel/eisa.c +++ b/arch/x86/kernel/eisa.c @@ -7,11 +7,17 @@ #include <linux/eisa.h> #include <linux/io.h> +#include <xen/xen.h> + static __init int eisa_bus_probe(void) { - void __iomem *p = ioremap(0x0FFFD9, 4); + void __iomem *p; + + if (xen_pv_domain() && !xen_initial_domain()) + return 0; - if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) + p = ioremap(0x0FFFD9, 4); + if (p && readl(p) == 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24)) EISA_bus = 1; iounmap(p); return 0; diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 8047379e575a..ddee1f0870c4 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -35,6 +35,7 @@ #include <asm/bootparam_utils.h> #include <asm/microcode.h> #include <asm/kasan.h> +#include <asm/fixmap.h> /* * Manage page tables very early on. @@ -112,6 +113,7 @@ static bool __head check_la57_support(unsigned long physaddr) unsigned long __head __startup_64(unsigned long physaddr, struct boot_params *bp) { + unsigned long vaddr, vaddr_end; unsigned long load_delta, *p; unsigned long pgtable_flags; pgdval_t *pgd; @@ -165,7 +167,8 @@ unsigned long __head __startup_64(unsigned long physaddr, pud[511] += load_delta; pmd = fixup_pointer(level2_fixmap_pgt, physaddr); - pmd[506] += load_delta; + for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--) + pmd[i] += load_delta; /* * Set up the identity mapping for the switchover. These @@ -235,6 +238,21 @@ unsigned long __head __startup_64(unsigned long physaddr, sme_encrypt_kernel(bp); /* + * Clear the memory encryption mask from the .bss..decrypted section. + * The bss section will be memset to zero later in the initialization so + * there is no need to zero it after changing the memory encryption + * attribute. + */ + if (mem_encrypt_active()) { + vaddr = (unsigned long)__start_bss_decrypted; + vaddr_end = (unsigned long)__end_bss_decrypted; + for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { + i = pmd_index(vaddr); + pmd[i] -= sme_get_me_mask(); + } + } + + /* * Return the SME encryption mask (if SME is active) to be used as a * modifier for the initial pgdir entry programmed into CR3. */ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 15ebc2fc166e..a3618cf04cf6 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -24,6 +24,7 @@ #include "../entry/calling.h" #include <asm/export.h> #include <asm/nospec-branch.h> +#include <asm/fixmap.h> #ifdef CONFIG_PARAVIRT #include <asm/asm-offsets.h> @@ -445,13 +446,20 @@ NEXT_PAGE(level2_kernel_pgt) KERNEL_IMAGE_SIZE/PMD_SIZE) NEXT_PAGE(level2_fixmap_pgt) - .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC - /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ - .fill 5,8,0 + .fill (512 - 4 - FIXMAP_PMD_NUM),8,0 + pgtno = 0 + .rept (FIXMAP_PMD_NUM) + .quad level1_fixmap_pgt + (pgtno << PAGE_SHIFT) - __START_KERNEL_map \ + + _PAGE_TABLE_NOENC; + pgtno = pgtno + 1 + .endr + /* 6 MB reserved space + a 2MB hole */ + .fill 4,8,0 NEXT_PAGE(level1_fixmap_pgt) + .rept (FIXMAP_PMD_NUM) .fill 512,8,0 + .endr #undef PMDS diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1e6764648af3..013fe3d21dbb 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -28,6 +28,7 @@ #include <linux/sched/clock.h> #include <linux/mm.h> #include <linux/slab.h> +#include <linux/set_memory.h> #include <asm/hypervisor.h> #include <asm/mem_encrypt.h> @@ -61,9 +62,10 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) static struct pvclock_vsyscall_time_info - hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE); -static struct pvclock_wall_clock wall_clock; + hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE); +static struct pvclock_wall_clock wall_clock __bss_decrypted; static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); +static struct pvclock_vsyscall_time_info *hvclock_mem; static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) { @@ -236,6 +238,45 @@ static void kvm_shutdown(void) native_machine_shutdown(); } +static void __init kvmclock_init_mem(void) +{ + unsigned long ncpus; + unsigned int order; + struct page *p; + int r; + + if (HVC_BOOT_ARRAY_SIZE >= num_possible_cpus()) + return; + + ncpus = num_possible_cpus() - HVC_BOOT_ARRAY_SIZE; + order = get_order(ncpus * sizeof(*hvclock_mem)); + + p = alloc_pages(GFP_KERNEL, order); + if (!p) { + pr_warn("%s: failed to alloc %d pages", __func__, (1U << order)); + return; + } + + hvclock_mem = page_address(p); + + /* + * hvclock is shared between the guest and the hypervisor, must + * be mapped decrypted. + */ + if (sev_active()) { + r = set_memory_decrypted((unsigned long) hvclock_mem, + 1UL << order); + if (r) { + __free_pages(p, order); + hvclock_mem = NULL; + pr_warn("kvmclock: set_memory_decrypted() failed. Disabling\n"); + return; + } + } + + memset(hvclock_mem, 0, PAGE_SIZE << order); +} + static int __init kvm_setup_vsyscall_timeinfo(void) { #ifdef CONFIG_X86_64 @@ -250,6 +291,9 @@ static int __init kvm_setup_vsyscall_timeinfo(void) kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; #endif + + kvmclock_init_mem(); + return 0; } early_initcall(kvm_setup_vsyscall_timeinfo); @@ -269,8 +313,10 @@ static int kvmclock_setup_percpu(unsigned int cpu) /* Use the static page for the first CPUs, allocate otherwise */ if (cpu < HVC_BOOT_ARRAY_SIZE) p = &hv_clock_boot[cpu]; + else if (hvclock_mem) + p = hvclock_mem + cpu - HVC_BOOT_ARRAY_SIZE; else - p = kzalloc(sizeof(*p), GFP_KERNEL); + return -ENOMEM; per_cpu(hv_clock_per_cpu, cpu) = p; return p ? 0 : -ENOMEM; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index afdb303285f8..8dc69d82567e 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -91,7 +91,7 @@ unsigned paravirt_patch_call(void *insnbuf, if (len < 5) { #ifdef CONFIG_RETPOLINE - WARN_ONCE("Failing to patch indirect CALL in %ps\n", (void *)addr); + WARN_ONCE(1, "Failing to patch indirect CALL in %ps\n", (void *)addr); #endif return len; /* call too long for patch site */ } @@ -111,7 +111,7 @@ unsigned paravirt_patch_jmp(void *insnbuf, const void *target, if (len < 5) { #ifdef CONFIG_RETPOLINE - WARN_ONCE("Failing to patch indirect JMP in %ps\n", (void *)addr); + WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr); #endif return len; /* call too long for patch site */ } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 2924fd447e61..5046a3c9dec2 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -59,7 +59,7 @@ #include <asm/intel_rdt_sched.h> #include <asm/proto.h> -void __show_regs(struct pt_regs *regs, int all) +void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; @@ -85,7 +85,7 @@ void __show_regs(struct pt_regs *regs, int all) printk(KERN_DEFAULT "DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n", (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss, regs->flags); - if (!all) + if (mode != SHOW_REGS_ALL) return; cr0 = read_cr0(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index a451bc374b9b..ea5ea850348d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -62,7 +62,7 @@ __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); /* Prints also some state that isn't saved in the pt_regs */ -void __show_regs(struct pt_regs *regs, int all) +void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; @@ -87,9 +87,17 @@ void __show_regs(struct pt_regs *regs, int all) printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", regs->r13, regs->r14, regs->r15); - if (!all) + if (mode == SHOW_REGS_SHORT) return; + if (mode == SHOW_REGS_USER) { + rdmsrl(MSR_FS_BASE, fs); + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + printk(KERN_DEFAULT "FS: %016lx GS: %016lx\n", + fs, shadowgs); + return; + } + asm("movl %%ds,%0" : "=r" (ds)); asm("movl %%cs,%0" : "=r" (cs)); asm("movl %%es,%0" : "=r" (es)); diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 12cbe2b88c0f..738bf42b0218 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -111,8 +111,10 @@ int arch_register_cpu(int num) /* * Currently CPU0 is only hotpluggable on Intel platforms. Other * vendors can add hotplug support later. + * Xen PV guests don't support CPU0 hotplug at all. */ - if (c->x86_vendor != X86_VENDOR_INTEL) + if (c->x86_vendor != X86_VENDOR_INTEL || + boot_cpu_has(X86_FEATURE_XENPV)) cpu0_hotpluggable = 0; /* diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 1463468ba9a0..6490f618e096 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1415,7 +1415,7 @@ static bool __init determine_cpu_tsc_frequencies(bool early) static unsigned long __init get_loops_per_jiffy(void) { - unsigned long lpj = tsc_khz * KHZ; + u64 lpj = (u64)tsc_khz * KHZ; do_div(lpj, HZ); return lpj; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 8bde0a419f86..5dd3317d761f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -65,6 +65,23 @@ jiffies_64 = jiffies; #define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); #define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE); +/* + * This section contains data which will be mapped as decrypted. Memory + * encryption operates on a page basis. Make this section PMD-aligned + * to avoid splitting the pages while mapping the section early. + * + * Note: We use a separate section so that only this section gets + * decrypted to avoid exposing more than we wish. + */ +#define BSS_DECRYPTED \ + . = ALIGN(PMD_SIZE); \ + __start_bss_decrypted = .; \ + *(.bss..decrypted); \ + . = ALIGN(PAGE_SIZE); \ + __start_bss_decrypted_unused = .; \ + . = ALIGN(PMD_SIZE); \ + __end_bss_decrypted = .; \ + #else #define X86_ALIGN_RODATA_BEGIN @@ -74,6 +91,7 @@ jiffies_64 = jiffies; #define ALIGN_ENTRY_TEXT_BEGIN #define ALIGN_ENTRY_TEXT_END +#define BSS_DECRYPTED #endif @@ -355,6 +373,7 @@ SECTIONS __bss_start = .; *(.bss..page_aligned) *(.bss) + BSS_DECRYPTED . = ALIGN(PAGE_SIZE); __bss_stop = .; } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0cefba28c864..fbb0e6df121b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -548,7 +548,7 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, } int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, - unsigned long ipi_bitmap_high, int min, + unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit) { int i; @@ -571,18 +571,31 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); + if (min > map->max_apic_id) + goto out; /* Bits above cluster_size are masked in the caller. */ - for_each_set_bit(i, &ipi_bitmap_low, BITS_PER_LONG) { - vcpu = map->phys_map[min + i]->vcpu; - count += kvm_apic_set_irq(vcpu, &irq, NULL); + for_each_set_bit(i, &ipi_bitmap_low, + min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) { + if (map->phys_map[min + i]) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, &irq, NULL); + } } min += cluster_size; - for_each_set_bit(i, &ipi_bitmap_high, BITS_PER_LONG) { - vcpu = map->phys_map[min + i]->vcpu; - count += kvm_apic_set_irq(vcpu, &irq, NULL); + + if (min > map->max_apic_id) + goto out; + + for_each_set_bit(i, &ipi_bitmap_high, + min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) { + if (map->phys_map[min + i]) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, &irq, NULL); + } } +out: rcu_read_unlock(); return count; } @@ -1331,9 +1344,8 @@ EXPORT_SYMBOL_GPL(kvm_lapic_reg_read); static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) { - return kvm_apic_hw_enabled(apic) && - addr >= apic->base_address && - addr < apic->base_address + LAPIC_MMIO_LENGTH; + return addr >= apic->base_address && + addr < apic->base_address + LAPIC_MMIO_LENGTH; } static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, @@ -1345,6 +1357,15 @@ static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, if (!apic_mmio_in_range(apic, address)) return -EOPNOTSUPP; + if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) { + if (!kvm_check_has_quirk(vcpu->kvm, + KVM_X86_QUIRK_LAPIC_MMIO_HOLE)) + return -EOPNOTSUPP; + + memset(data, 0xff, len); + return 0; + } + kvm_lapic_reg_read(apic, offset, len, data); return 0; @@ -1904,6 +1925,14 @@ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, if (!apic_mmio_in_range(apic, address)) return -EOPNOTSUPP; + if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) { + if (!kvm_check_has_quirk(vcpu->kvm, + KVM_X86_QUIRK_LAPIC_MMIO_HOLE)) + return -EOPNOTSUPP; + + return 0; + } + /* * APIC register must be aligned on 128-bits boundary. * 32/64/128 bits registers must be accessed thru 32 bits. diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a282321329b5..d7e9bce6ff61 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -899,7 +899,7 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { /* * Make sure the write to vcpu->mode is not reordered in front of - * reads to sptes. If it does, kvm_commit_zap_page() can see us + * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. */ smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); @@ -1853,11 +1853,6 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler); } -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) -{ - return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); -} - int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) { return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); @@ -5217,7 +5212,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, void *insn, int insn_len) { - int r, emulation_type = EMULTYPE_RETRY; + int r, emulation_type = 0; enum emulation_result er; bool direct = vcpu->arch.mmu.direct_map; @@ -5230,10 +5225,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { r = handle_mmio_page_fault(vcpu, cr2, direct); - if (r == RET_PF_EMULATE) { - emulation_type = 0; + if (r == RET_PF_EMULATE) goto emulate; - } } if (r == RET_PF_INVALID) { @@ -5260,8 +5253,19 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, return 1; } - if (mmio_info_in_cache(vcpu, cr2, direct)) - emulation_type = 0; + /* + * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still + * optimistically try to just unprotect the page and let the processor + * re-execute the instruction that caused the page fault. Do not allow + * retrying MMIO emulation, as it's not only pointless but could also + * cause us to enter an infinite loop because the processor will keep + * faulting on the non-existent MMIO address. Retrying an instruction + * from a nested guest is also pointless and dangerous as we are only + * explicitly shadowing L1's page tables, i.e. unprotecting something + * for L1 isn't going to magically fix whatever issue cause L2 to fail. + */ + if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu)) + emulation_type = EMULTYPE_ALLOW_RETRY; emulate: /* * On AMD platforms, under certain conditions insn_len may be zero on #NPF. @@ -5413,7 +5417,12 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) { MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); - kvm_init_mmu(vcpu, true); + /* + * kvm_mmu_setup() is called only on vCPU initialization. + * Therefore, no need to reset mmu roots as they are not yet + * initialized. + */ + kvm_init_mmu(vcpu, false); } static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6276140044d0..d96092b35936 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -776,7 +776,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) } if (!svm->next_rip) { - if (emulate_instruction(vcpu, EMULTYPE_SKIP) != + if (kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) != EMULATE_DONE) printk(KERN_DEBUG "%s: NOP\n", __func__); return; @@ -1226,8 +1226,7 @@ static __init int sev_hardware_setup(void) min_sev_asid = cpuid_edx(0x8000001F); /* Initialize SEV ASID bitmap */ - sev_asid_bitmap = kcalloc(BITS_TO_LONGS(max_sev_asid), - sizeof(unsigned long), GFP_KERNEL); + sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); if (!sev_asid_bitmap) return 1; @@ -1405,7 +1404,7 @@ static __exit void svm_hardware_unsetup(void) int cpu; if (svm_sev_enabled()) - kfree(sev_asid_bitmap); + bitmap_free(sev_asid_bitmap); for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); @@ -2715,7 +2714,7 @@ static int gp_interception(struct vcpu_svm *svm) WARN_ON_ONCE(!enable_vmware_backdoor); - er = emulate_instruction(vcpu, + er = kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); if (er == EMULATE_USER_EXIT) return 0; @@ -2819,7 +2818,7 @@ static int io_interception(struct vcpu_svm *svm) string = (io_info & SVM_IOIO_STR_MASK) != 0; in = (io_info & SVM_IOIO_TYPE_MASK) != 0; if (string) - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; @@ -3861,7 +3860,7 @@ static int iret_interception(struct vcpu_svm *svm) static int invlpg_interception(struct vcpu_svm *svm) { if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) - return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); return kvm_skip_emulated_instruction(&svm->vcpu); @@ -3869,13 +3868,13 @@ static int invlpg_interception(struct vcpu_svm *svm) static int emulate_on_interception(struct vcpu_svm *svm) { - return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; } static int rsm_interception(struct vcpu_svm *svm) { - return x86_emulate_instruction(&svm->vcpu, 0, 0, - rsm_ins_bytes, 2) == EMULATE_DONE; + return kvm_emulate_instruction_from_buffer(&svm->vcpu, + rsm_ins_bytes, 2) == EMULATE_DONE; } static int rdpmc_interception(struct vcpu_svm *svm) @@ -4700,7 +4699,7 @@ static int avic_unaccelerated_access_interception(struct vcpu_svm *svm) ret = avic_unaccel_trap_write(svm); } else { /* Handling Fault */ - ret = (emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE); + ret = (kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE); } return ret; @@ -6747,7 +6746,7 @@ e_free: static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) { unsigned long vaddr, vaddr_end, next_vaddr; - unsigned long dst_vaddr, dst_vaddr_end; + unsigned long dst_vaddr; struct page **src_p, **dst_p; struct kvm_sev_dbg debug; unsigned long n; @@ -6763,7 +6762,6 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) size = debug.len; vaddr_end = vaddr + size; dst_vaddr = debug.dst_uaddr; - dst_vaddr_end = dst_vaddr + size; for (; vaddr < vaddr_end; vaddr = next_vaddr) { int len, s_off, d_off; @@ -7150,6 +7148,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .check_intercept = svm_check_intercept, .handle_external_intr = svm_handle_external_intr, + .request_immediate_exit = __kvm_request_immediate_exit, + .sched_in = svm_sched_in, .pmu_ops = &amd_pmu_ops, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1d26f3c4985b..06412ba46aa3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -397,6 +397,7 @@ struct loaded_vmcs { int cpu; bool launched; bool nmi_known_unmasked; + bool hv_timer_armed; /* Support for vnmi-less CPUs */ int soft_vnmi_blocked; ktime_t entry_time; @@ -1019,6 +1020,8 @@ struct vcpu_vmx { int ple_window; bool ple_window_dirty; + bool req_immediate_exit; + /* Support for PML */ #define PML_ENTITY_NUM 512 struct page *pml_pg; @@ -2864,6 +2867,8 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) u16 fs_sel, gs_sel; int i; + vmx->req_immediate_exit = false; + if (vmx->loaded_cpu_state) return; @@ -5393,9 +5398,10 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) * To use VMXON (and later other VMX instructions), a guest * must first be able to turn on cr4.VMXE (see handle_vmon()). * So basically the check on whether to allow nested VMX - * is here. + * is here. We operate under the default treatment of SMM, + * so VMX cannot be enabled under SMM. */ - if (!nested_vmx_allowed(vcpu)) + if (!nested_vmx_allowed(vcpu) || is_smm(vcpu)) return 1; } @@ -6183,6 +6189,27 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) nested_mark_vmcs12_pages_dirty(vcpu); } +static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + void *vapic_page; + u32 vppr; + int rvi; + + if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || + !nested_cpu_has_vid(get_vmcs12(vcpu)) || + WARN_ON_ONCE(!vmx->nested.virtual_apic_page)) + return false; + + rvi = vmcs_read16(GUEST_INTR_STATUS) & 0xff; + + vapic_page = kmap(vmx->nested.virtual_apic_page); + vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); + kunmap(vmx->nested.virtual_apic_page); + + return ((rvi & 0xf0) > (vppr & 0xf0)); +} + static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, bool nested) { @@ -6983,7 +7010,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, * Cause the #SS fault with 0 error code in VM86 mode. */ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { - if (emulate_instruction(vcpu, 0) == EMULATE_DONE) { + if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) { if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; return kvm_vcpu_halt(vcpu); @@ -7054,7 +7081,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { WARN_ON_ONCE(!enable_vmware_backdoor); - er = emulate_instruction(vcpu, + er = kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); if (er == EMULATE_USER_EXIT) return 0; @@ -7157,7 +7184,7 @@ static int handle_io(struct kvm_vcpu *vcpu) ++vcpu->stat.io_exits; if (string) - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; @@ -7231,7 +7258,7 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) static int handle_desc(struct kvm_vcpu *vcpu) { WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; } static int handle_cr(struct kvm_vcpu *vcpu) @@ -7480,7 +7507,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu) static int handle_invd(struct kvm_vcpu *vcpu) { - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; } static int handle_invlpg(struct kvm_vcpu *vcpu) @@ -7547,7 +7574,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } } - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; } static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) @@ -7704,8 +7731,8 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) return kvm_skip_emulated_instruction(vcpu); else - return x86_emulate_instruction(vcpu, gpa, EMULTYPE_SKIP, - NULL, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) == + EMULATE_DONE; } return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); @@ -7748,7 +7775,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; - err = emulate_instruction(vcpu, 0); + err = kvm_emulate_instruction(vcpu, 0); if (err == EMULATE_USER_EXIT) { ++vcpu->stat.mmio_exits; @@ -7966,6 +7993,9 @@ static __init int hardware_setup(void) kvm_x86_ops->enable_log_dirty_pt_masked = NULL; } + if (!cpu_has_vmx_preemption_timer()) + kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit; + if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { u64 vmx_msr; @@ -9208,7 +9238,8 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) static int handle_preemption_timer(struct kvm_vcpu *vcpu) { - kvm_lapic_expired_hv_timer(vcpu); + if (!to_vmx(vcpu)->req_immediate_exit) + kvm_lapic_expired_hv_timer(vcpu); return 1; } @@ -10595,24 +10626,43 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host, false); } -static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) +static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val) +{ + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val); + if (!vmx->loaded_vmcs->hv_timer_armed) + vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + vmx->loaded_vmcs->hv_timer_armed = true; +} + +static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tscl; u32 delta_tsc; - if (vmx->hv_deadline_tsc == -1) + if (vmx->req_immediate_exit) { + vmx_arm_hv_timer(vmx, 0); return; + } - tscl = rdtsc(); - if (vmx->hv_deadline_tsc > tscl) - /* sure to be 32 bit only because checked on set_hv_timer */ - delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> - cpu_preemption_timer_multi); - else - delta_tsc = 0; + if (vmx->hv_deadline_tsc != -1) { + tscl = rdtsc(); + if (vmx->hv_deadline_tsc > tscl) + /* set_hv_timer ensures the delta fits in 32-bits */ + delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> + cpu_preemption_timer_multi); + else + delta_tsc = 0; + + vmx_arm_hv_timer(vmx, delta_tsc); + return; + } - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); + if (vmx->loaded_vmcs->hv_timer_armed) + vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + vmx->loaded_vmcs->hv_timer_armed = false; } static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) @@ -10672,7 +10722,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) atomic_switch_perf_msrs(vmx); - vmx_arm_hv_timer(vcpu); + vmx_update_hv_timer(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if @@ -11427,16 +11477,18 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vcpu->arch.virtual_tsc_khz == 0) - return; - - /* Make sure short timeouts reliably trigger an immediate vmexit. - * hrtimer_start does not guarantee this. */ - if (preemption_timeout <= 1) { + /* + * A timer value of zero is architecturally guaranteed to cause + * a VMExit prior to executing any instructions in the guest. + */ + if (preemption_timeout == 0) { vmx_preemption_timer_fn(&vmx->nested.preemption_timer); return; } + if (vcpu->arch.virtual_tsc_khz == 0) + return; + preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; preemption_timeout *= 1000000; do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); @@ -11646,11 +11698,15 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, * bits 15:8 should be zero in posted_intr_nv, * the descriptor address has been already checked * in nested_get_vmcs12_pages. + * + * bits 5:0 of posted_intr_desc_addr should be zero. */ if (nested_cpu_has_posted_intr(vmcs12) && (!nested_cpu_has_vid(vmcs12) || !nested_exit_intr_ack_set(vcpu) || - vmcs12->posted_intr_nv & 0xff00)) + (vmcs12->posted_intr_nv & 0xff00) || + (vmcs12->posted_intr_desc_addr & 0x3f) || + (!page_address_valid(vcpu, vmcs12->posted_intr_desc_addr)))) return -EINVAL; /* tpr shadow is needed by all apicv features. */ @@ -12076,11 +12132,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, exec_control = vmcs12->pin_based_vm_exec_control; - /* Preemption timer setting is only taken from vmcs01. */ - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + /* Preemption timer setting is computed directly in vmx_vcpu_run. */ exec_control |= vmcs_config.pin_based_exec_ctrl; - if (vmx->hv_deadline_tsc == -1) - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + vmx->loaded_vmcs->hv_timer_armed = false; /* Posted interrupts setting is only taken from vmcs12. */ if (nested_cpu_has_posted_intr(vmcs12)) { @@ -12318,6 +12373,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; @@ -12537,8 +12595,11 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); bool from_vmentry = !!exit_qual; u32 dummy_exit_qual; + u32 vmcs01_cpu_exec_ctrl; int r = 0; + vmcs01_cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + enter_guest_mode(vcpu); if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) @@ -12575,6 +12636,25 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) } /* + * If L1 had a pending IRQ/NMI until it executed + * VMLAUNCH/VMRESUME which wasn't delivered because it was + * disallowed (e.g. interrupts disabled), L0 needs to + * evaluate if this pending event should cause an exit from L2 + * to L1 or delivered directly to L2 (e.g. In case L1 don't + * intercept EXTERNAL_INTERRUPT). + * + * Usually this would be handled by L0 requesting a + * IRQ/NMI window by setting VMCS accordingly. However, + * this setting was done on VMCS01 and now VMCS02 is active + * instead. Thus, we force L0 to perform pending event + * evaluation by requesting a KVM_REQ_EVENT. + */ + if (vmcs01_cpu_exec_ctrl & + (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING)) { + kvm_make_request(KVM_REQ_EVENT, vcpu); + } + + /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet * returned as far as L1 is concerned. It will only return (and set @@ -12841,6 +12921,11 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) return 0; } +static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) +{ + to_vmx(vcpu)->req_immediate_exit = true; +} + static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) { ktime_t remaining = @@ -13231,12 +13316,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - if (vmx->hv_deadline_tsc == -1) - vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); - else - vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); + if (kvm_has_tsc_control) decache_tsc_multiplier(vmx); @@ -13440,18 +13520,12 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) return -ERANGE; vmx->hv_deadline_tsc = tscl + delta_tsc; - vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); - return delta_tsc == 0; } static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - vmx->hv_deadline_tsc = -1; - vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); + to_vmx(vcpu)->hv_deadline_tsc = -1; } #endif @@ -13932,6 +14006,14 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) return -EINVAL; + /* + * SMM temporarily disables VMX, so we cannot be in guest mode, + * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags + * must be zero. + */ + if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags) + return -EINVAL; + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) return -EINVAL; @@ -13988,9 +14070,6 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) return -EINVAL; - if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) - vmx->nested.nested_run_pending = 1; - vmx->nested.dirty_vmcs12 = true; ret = enter_vmx_non_root_mode(vcpu, NULL); if (ret) @@ -14078,6 +14157,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .apicv_post_state_restore = vmx_apicv_post_state_restore, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, + .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, .sync_pir_to_irr = vmx_sync_pir_to_irr, .deliver_posted_interrupt = vmx_deliver_posted_interrupt, @@ -14111,6 +14191,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .umip_emulated = vmx_umip_emulated, .check_nested_events = vmx_check_nested_events, + .request_immediate_exit = vmx_request_immediate_exit, .sched_in = vmx_sched_in, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 506bd2b4b8bb..edbf00ec56b3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -628,7 +628,7 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu) gfn_t gfn; int r; - if (is_long_mode(vcpu) || !is_pae(vcpu)) + if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu)) return false; if (!test_bit(VCPU_EXREG_PDPTR, @@ -2537,7 +2537,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_PLATFORM_INFO: if (!msr_info->host_initiated || - data & ~MSR_PLATFORM_INFO_CPUID_FAULT || (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) && cpuid_fault_enabled(vcpu))) return 1; @@ -2780,6 +2779,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.osvw.status; break; case MSR_PLATFORM_INFO: + if (!msr_info->host_initiated && + !vcpu->kvm->arch.guest_can_read_msr_platform_info) + return 1; msr_info->data = vcpu->arch.msr_platform_info; break; case MSR_MISC_FEATURES_ENABLES: @@ -2927,6 +2929,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: case KVM_CAP_GET_MSR_FEATURES: + case KVM_CAP_MSR_PLATFORM_INFO: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -4007,19 +4010,23 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); + r = -EFAULT; if (get_user(user_data_size, &user_kvm_nested_state->size)) - return -EFAULT; + break; r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state, user_data_size); if (r < 0) - return r; + break; if (r > user_data_size) { if (put_user(r, &user_kvm_nested_state->size)) - return -EFAULT; - return -E2BIG; + r = -EFAULT; + else + r = -E2BIG; + break; } + r = 0; break; } @@ -4031,19 +4038,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (!kvm_x86_ops->set_nested_state) break; + r = -EFAULT; if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state))) - return -EFAULT; + break; + r = -EINVAL; if (kvm_state.size < sizeof(kvm_state)) - return -EINVAL; + break; if (kvm_state.flags & ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE)) - return -EINVAL; + break; /* nested_run_pending implies guest_mode. */ if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING) - return -EINVAL; + break; r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); break; @@ -4350,6 +4359,10 @@ split_irqchip_unlock: kvm->arch.pause_in_guest = true; r = 0; break; + case KVM_CAP_MSR_PLATFORM_INFO: + kvm->arch.guest_can_read_msr_platform_info = cap->args[0]; + r = 0; + break; default: r = -EINVAL; break; @@ -4987,7 +5000,7 @@ int handle_ud(struct kvm_vcpu *vcpu) emul_type = 0; } - er = emulate_instruction(vcpu, emul_type); + er = kvm_emulate_instruction(vcpu, emul_type); if (er == EMULATE_USER_EXIT) return 0; if (er != EMULATE_DONE) @@ -5870,7 +5883,10 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, gpa_t gpa = cr2; kvm_pfn_t pfn; - if (emulation_type & EMULTYPE_NO_REEXECUTE) + if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) + return false; + + if (WARN_ON_ONCE(is_guest_mode(vcpu))) return false; if (!vcpu->arch.mmu.direct_map) { @@ -5958,7 +5974,10 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, */ vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; - if (!(emulation_type & EMULTYPE_RETRY)) + if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) + return false; + + if (WARN_ON_ONCE(is_guest_mode(vcpu))) return false; if (x86_page_table_writing_insn(ctxt)) @@ -6276,7 +6295,19 @@ restart: return r; } -EXPORT_SYMBOL_GPL(x86_emulate_instruction); + +int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type) +{ + return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); +} +EXPORT_SYMBOL_GPL(kvm_emulate_instruction); + +int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, + void *insn, int insn_len) +{ + return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len); +} +EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer); static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) @@ -7343,6 +7374,12 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page); +void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) +{ + smp_send_reschedule(vcpu->cpu); +} +EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit); + /* * Returns 1 to let vcpu_run() continue the guest execution loop without * exiting to the userspace. Otherwise, the value will be returned to the @@ -7547,7 +7584,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (req_immediate_exit) { kvm_make_request(KVM_REQ_EVENT, vcpu); - smp_send_reschedule(vcpu->cpu); + kvm_x86_ops->request_immediate_exit(vcpu); } trace_kvm_entry(vcpu->vcpu_id); @@ -7734,7 +7771,7 @@ static inline int complete_emulated_io(struct kvm_vcpu *vcpu) { int r; vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); + r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (r != EMULATE_DONE) return 0; @@ -7811,6 +7848,29 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) return 0; } +/* Swap (qemu) user FPU context for the guest FPU context. */ +static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); + /* PKRU is separately restored in kvm_x86_ops->run. */ + __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, + ~XFEATURE_MASK_PKRU); + preempt_enable(); + trace_kvm_fpu(1); +} + +/* When vcpu_run ends, restore user space FPU context. */ +static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); + copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); + preempt_enable(); + ++vcpu->stat.fpu_reload; + trace_kvm_fpu(0); +} + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; @@ -8159,7 +8219,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) kvm_update_cpuid(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); - if (!is_long_mode(vcpu) && is_pae(vcpu)) { + if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) { load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); mmu_reset_needed = 1; } @@ -8388,29 +8448,6 @@ static void fx_init(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= X86_CR0_ET; } -/* Swap (qemu) user FPU context for the guest FPU context. */ -void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) -{ - preempt_disable(); - copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); - /* PKRU is separately restored in kvm_x86_ops->run. */ - __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, - ~XFEATURE_MASK_PKRU); - preempt_enable(); - trace_kvm_fpu(1); -} - -/* When vcpu_run ends, restore user space FPU context. */ -void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) -{ - preempt_disable(); - copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); - copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); - preempt_enable(); - ++vcpu->stat.fpu_reload; - trace_kvm_fpu(0); -} - void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask; @@ -8834,6 +8871,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); pvclock_update_vm_gtod_copy(kvm); + kvm->arch.guest_can_read_msr_platform_info = true; + INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); @@ -9182,6 +9221,13 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, kvm_page_track_flush_slot(kvm, slot); } +static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) +{ + return (is_guest_mode(vcpu) && + kvm_x86_ops->guest_apic_has_interrupt && + kvm_x86_ops->guest_apic_has_interrupt(vcpu)); +} + static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) { if (!list_empty_careful(&vcpu->async_pf.done)) @@ -9206,7 +9252,8 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) return true; if (kvm_arch_interrupt_allowed(vcpu) && - kvm_cpu_has_interrupt(vcpu)) + (kvm_cpu_has_interrupt(vcpu) || + kvm_guest_apic_has_interrupt(vcpu))) return true; if (kvm_hv_has_stimer_pending(vcpu)) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 257f27620bc2..67b9568613f3 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -274,6 +274,8 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int page_num); bool kvm_vector_hashing_enabled(void); +int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, + int emulation_type, void *insn, int insn_len); #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 7a8fc26c1115..faca978ebf9d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -815,10 +815,14 @@ void free_kernel_image_pages(void *begin, void *end) set_memory_np_noalias(begin_ul, len_pages); } +void __weak mem_encrypt_free_decrypted_mem(void) { } + void __ref free_initmem(void) { e820__reallocate_tables(); + mem_encrypt_free_decrypted_mem(); + free_kernel_image_pages(&__init_begin, &__init_end); } diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index b2de398d1fd3..006f373f54ab 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -348,6 +348,30 @@ bool sev_active(void) EXPORT_SYMBOL(sev_active); /* Architecture __weak replacement functions */ +void __init mem_encrypt_free_decrypted_mem(void) +{ + unsigned long vaddr, vaddr_end, npages; + int r; + + vaddr = (unsigned long)__start_bss_decrypted_unused; + vaddr_end = (unsigned long)__end_bss_decrypted; + npages = (vaddr_end - vaddr) >> PAGE_SHIFT; + + /* + * The unused memory range was mapped decrypted, change the encryption + * attribute from decrypted to encrypted before freeing it. + */ + if (mem_encrypt_active()) { + r = set_memory_encrypted(vaddr, npages); + if (r) { + pr_warn("failed to free unused decrypted pages\n"); + return; + } + } + + free_init_pages("unused decrypted", vaddr, vaddr_end); +} + void __init mem_encrypt_init(void) { if (!sme_me_mask) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index e848a4811785..089e78c4effd 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -269,7 +269,7 @@ static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp) if (pgd_val(pgd) != 0) { pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); - *pgdp = native_make_pgd(0); + pgd_clear(pgdp); paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); pmd_free(mm, pmd); @@ -494,7 +494,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, int changed = !pte_same(*ptep, entry); if (changed && dirty) - *ptep = entry; + set_pte(ptep, entry); return changed; } @@ -509,7 +509,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, VM_BUG_ON(address & ~HPAGE_PMD_MASK); if (changed && dirty) { - *pmdp = entry; + set_pmd(pmdp, entry); /* * We had a write-protection fault here and changed the pmd * to to more permissive. No need to flush the TLB for that, @@ -529,7 +529,7 @@ int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, VM_BUG_ON(address & ~HPAGE_PUD_MASK); if (changed && dirty) { - *pudp = entry; + set_pud(pudp, entry); /* * We had a write-protection fault here and changed the pud * to to more permissive. No need to flush the TLB for that, @@ -637,6 +637,15 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) { unsigned long address = __fix_to_virt(idx); +#ifdef CONFIG_X86_64 + /* + * Ensure that the static initial page tables are covering the + * fixmap completely. + */ + BUILD_BUG_ON(__end_of_permanent_fixed_addresses > + (FIXMAP_PMD_NUM * PTRS_PER_PTE)); +#endif + if (idx >= __end_of_fixed_addresses) { BUG(); return; diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index 05ca14222463..9959657127f4 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -85,10 +85,9 @@ pgd_t * __init efi_call_phys_prolog(void) void __init efi_call_phys_epilog(pgd_t *save_pgd) { + load_fixmap_gdt(0); load_cr3(save_pgd); __flush_tlb_all(); - - load_fixmap_gdt(0); } void __init efi_runtime_update_mappings(void) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 2fe5c9b1816b..dd461c0167ef 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1907,7 +1907,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* L3_k[511] -> level2_fixmap_pgt */ convert_pfn_mfn(level3_kernel_pgt); - /* L3_k[511][506] -> level1_fixmap_pgt */ + /* L3_k[511][508-FIXMAP_PMD_NUM ... 507] -> level1_fixmap_pgt */ convert_pfn_mfn(level2_fixmap_pgt); /* We get [511][511] and have Xen's version of level2_kernel_pgt */ @@ -1952,7 +1952,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); - set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO); + + for (i = 0; i < FIXMAP_PMD_NUM; i++) { + set_page_prot(level1_fixmap_pgt + i * PTRS_PER_PTE, + PAGE_KERNEL_RO); + } /* Pin down new L4 */ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index 5f7d530fc679..0972184f3f19 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -479,7 +479,7 @@ static void xen_convert_regs(const struct xen_pmu_regs *xen_regs, irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id) { int err, ret = IRQ_NONE; - struct pt_regs regs; + struct pt_regs regs = {0}; const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); uint8_t xenpmu_flags = get_xenpmu_flags(); |