Diffstat (limited to 'arch/x86')
36 files changed, 336 insertions, 231 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a16c45460f1b..dde744682e63 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -232,6 +232,7 @@ config X86 select THREAD_INFO_IN_TASK select USER_STACKTRACE_SUPPORT select VIRT_TO_BUS + select HAVE_ARCH_KCSAN if X86_64 select X86_FEATURE_NAMES if PROC_FS select PROC_PID_ARCH_STATUS if PROC_FS imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI @@ -821,14 +822,6 @@ config PVH This option enables the PVH entry point for guest virtual machines as specified in the x86/HVM direct boot ABI. -config KVM_DEBUG_FS - bool "Enable debug information for KVM Guests in debugfs" - depends on KVM_GUEST && DEBUG_FS - ---help--- - This option enables collection of various statistics for KVM guest. - Statistics are displayed in debugfs filesystem. Enabling this option - may incur significant overhead. - config PARAVIRT_TIME_ACCOUNTING bool "Paravirtual steal time accounting" depends on PARAVIRT diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 4c5355684321..fe605205b4ce 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -9,7 +9,9 @@ # Changed by many, many contributors over the years. # +# Sanitizer runtimes are unavailable and cannot be linked for early boot code. KASAN_SANITIZE := n +KCSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y # Kernel does not boot with kcov instrumentation here. diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 5f7c262bcc99..7619742f91c9 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -17,7 +17,9 @@ # (see scripts/Makefile.lib size_append) # compressed vmlinux.bin.all + u32 size of vmlinux.bin.all +# Sanitizer runtimes are unavailable and cannot be linked for early boot code. KASAN_SANITIZE := n +KCSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 54e03ab26ff3..04e65f0698f6 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -10,8 +10,11 @@ ARCH_REL_TYPE_ABS += R_386_GLOB_DAT|R_386_JMP_SLOT|R_386_RELATIVE include $(srctree)/lib/vdso/Makefile KBUILD_CFLAGS += $(DISABLE_LTO) + +# Sanitizer runtimes are unavailable and cannot be linked here. KASAN_SANITIZE := n UBSAN_SANITIZE := n +KCSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. @@ -29,6 +32,9 @@ vobjs32-y += vdso32/vclock_gettime.o # files to link into kernel obj-y += vma.o +KASAN_SANITIZE_vma.o := y +UBSAN_SANITIZE_vma.o := y +KCSAN_SANITIZE_vma.o := y OBJECT_FILES_NON_STANDARD_vma.o := n # vDSO images to build diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 0367efdc5b7a..35460fef39b8 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -201,8 +201,12 @@ arch_test_and_change_bit(long nr, volatile unsigned long *addr) return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), *addr, c, "Ir", nr); } -static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr) +static __no_kcsan_or_inline bool constant_test_bit(long nr, const volatile unsigned long *addr) { + /* + * Because this is a plain access, we need to disable KCSAN here to + * avoid double instrumentation via instrumented bitops. 
+ */ return ((1UL << (nr & (BITS_PER_LONG-1))) & (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; } diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 8f1e94f29a16..a338a6deb950 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -89,6 +89,8 @@ #define INTEL_FAM6_COMETLAKE 0xA5 #define INTEL_FAM6_COMETLAKE_L 0xA6 +#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F + /* "Small Core" Processors (Atom) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1da5858501ca..f8998e97457f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1306,7 +1306,6 @@ struct kvm_arch_async_pf { extern u64 __read_mostly host_efer; extern struct kvm_x86_ops kvm_x86_ops; -extern struct kmem_cache *x86_fpu_cache; #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) @@ -1671,7 +1670,7 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm); void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, unsigned long *vcpu_bitmap); -void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, +bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h index 9a6dc9b4ec99..fb81fea99093 100644 --- a/arch/x86/include/asm/vdso/gettimeofday.h +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -271,6 +271,24 @@ static __always_inline const struct vdso_data *__arch_get_vdso_data(void) return __vdso_data; } +static inline bool arch_vdso_clocksource_ok(const struct vdso_data *vd) +{ + return true; +} +#define vdso_clocksource_ok arch_vdso_clocksource_ok + +/* + * Clocksource read value validation to handle PV and HyperV clocksources + * which can be invalidated asynchronously and indicate invalidation by + * returning U64_MAX, which can be effectively tested by checking for a + * negative value after casting it to s64. + */ +static inline bool arch_vdso_cycles_ok(u64 cycles) +{ + return (s64)cycles >= 0; +} +#define vdso_cycles_ok arch_vdso_cycles_ok + /* * x86 specific delta calculation. * diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8ef4369a4f06..e77261db2391 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -28,6 +28,10 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n KASAN_SANITIZE_stacktrace.o := n KASAN_SANITIZE_paravirt.o := n +# With some compiler versions the generated code results in boot hangs, caused +# by several compilation units. To be safe, disable all instrumentation. 
+KCSAN_SANITIZE := n + OBJECT_FILES_NON_STANDARD_test_nx.o := y OBJECT_FILES_NON_STANDARD_paravirt_patch.o := y diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 9244377ed454..e0e2f020ec02 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2051,7 +2051,7 @@ void __init init_apic_mappings(void) unsigned int new_apicid; if (apic_validate_deadline_timer()) - pr_debug("TSC deadline timer available\n"); + pr_info("TSC deadline timer available\n"); if (x2apic_mode) { boot_cpu_physical_apicid = read_apic_id(); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 7dc4ad68eb41..dba6a83bc349 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -13,6 +13,9 @@ endif KCOV_INSTRUMENT_common.o := n KCOV_INSTRUMENT_perf_event.o := n +# As above, instrumenting secondary CPU boot code causes boot hangs. +KCSAN_SANITIZE_common.o := n + # Make sure load_percpu_segment has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_common.o := $(nostackp) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index b6f887be440c..0b71970d2d3d 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -588,7 +588,9 @@ early_param("nospectre_v1", nospectre_v1_cmdline); static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; -static enum spectre_v2_user_mitigation spectre_v2_user __ro_after_init = +static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init = + SPECTRE_V2_USER_NONE; +static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init = SPECTRE_V2_USER_NONE; #ifdef CONFIG_RETPOLINE @@ -734,15 +736,6 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) break; } - /* - * At this point, an STIBP mode other than "off" has been set. - * If STIBP support is not being forced, check if STIBP always-on - * is preferred. - */ - if (mode != SPECTRE_V2_USER_STRICT && - boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) - mode = SPECTRE_V2_USER_STRICT_PREFERRED; - /* Initialize Indirect Branch Prediction Barrier */ if (boot_cpu_has(X86_FEATURE_IBPB)) { setup_force_cpu_cap(X86_FEATURE_USE_IBPB); @@ -765,23 +758,36 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", static_key_enabled(&switch_mm_always_ibpb) ? "always-on" : "conditional"); + + spectre_v2_user_ibpb = mode; } - /* If enhanced IBRS is enabled no STIBP required */ - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + /* + * If enhanced IBRS is enabled or SMT impossible, STIBP is not + * required. + */ + if (!smt_possible || spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) return; /* - * If SMT is not possible or STIBP is not available clear the STIBP - * mode. + * At this point, an STIBP mode other than "off" has been set. + * If STIBP support is not being forced, check if STIBP always-on + * is preferred. */ - if (!smt_possible || !boot_cpu_has(X86_FEATURE_STIBP)) + if (mode != SPECTRE_V2_USER_STRICT && + boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) + mode = SPECTRE_V2_USER_STRICT_PREFERRED; + + /* + * If STIBP is not available, clear the STIBP mode. 
+ */ + if (!boot_cpu_has(X86_FEATURE_STIBP)) mode = SPECTRE_V2_USER_NONE; + + spectre_v2_user_stibp = mode; + set_mode: - spectre_v2_user = mode; - /* Only print the STIBP mode when SMT possible */ - if (smt_possible) - pr_info("%s\n", spectre_v2_user_strings[mode]); + pr_info("%s\n", spectre_v2_user_strings[mode]); } static const char * const spectre_v2_strings[] = { @@ -1014,7 +1020,7 @@ void cpu_bugs_smt_update(void) { mutex_lock(&spec_ctrl_mutex); - switch (spectre_v2_user) { + switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: break; case SPECTRE_V2_USER_STRICT: @@ -1257,14 +1263,19 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) { switch (ctrl) { case PR_SPEC_ENABLE: - if (spectre_v2_user == SPECTRE_V2_USER_NONE) + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && + spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return 0; /* * Indirect branch speculation is always disabled in strict - * mode. + * mode. It can neither be enabled if it was force-disabled + * by a previous prctl call. + */ - if (spectre_v2_user == SPECTRE_V2_USER_STRICT || - spectre_v2_user == SPECTRE_V2_USER_STRICT_PREFERRED) + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED || + task_spec_ib_force_disable(task)) return -EPERM; task_clear_spec_ib_disable(task); task_update_spec_tif(task); @@ -1275,10 +1286,12 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) * Indirect branch speculation is always allowed when * mitigation is force disabled. */ - if (spectre_v2_user == SPECTRE_V2_USER_NONE) + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && + spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return -EPERM; - if (spectre_v2_user == SPECTRE_V2_USER_STRICT || - spectre_v2_user == SPECTRE_V2_USER_STRICT_PREFERRED) + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) return 0; task_set_spec_ib_disable(task); if (ctrl == PR_SPEC_FORCE_DISABLE) @@ -1309,7 +1322,8 @@ void arch_seccomp_spec_mitigate(struct task_struct *task) { if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP) ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE); - if (spectre_v2_user == SPECTRE_V2_USER_SECCOMP) + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) ib_prctl_set(task, PR_SPEC_FORCE_DISABLE); } #endif @@ -1340,22 +1354,24 @@ static int ib_prctl_get(struct task_struct *task) if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) return PR_SPEC_NOT_AFFECTED; - switch (spectre_v2_user) { - case SPECTRE_V2_USER_NONE: + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && + spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return PR_SPEC_ENABLE; - case SPECTRE_V2_USER_PRCTL: - case SPECTRE_V2_USER_SECCOMP: + else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) + return PR_SPEC_DISABLE; + else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || + spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) { if (task_spec_ib_force_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; if (task_spec_ib_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_DISABLE; return PR_SPEC_PRCTL | PR_SPEC_ENABLE; - case SPECTRE_V2_USER_STRICT: - 
case SPECTRE_V2_USER_STRICT_PREFERRED: - return PR_SPEC_DISABLE; - default: + } else return PR_SPEC_NOT_AFFECTED; - } } int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) @@ -1594,7 +1610,7 @@ static char *stibp_state(void) if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) return ""; - switch (spectre_v2_user) { + switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: return ", STIBP: disabled"; case SPECTRE_V2_USER_STRICT: diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 63926c94eb5f..c25a67a34bd3 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1142,9 +1142,12 @@ void switch_to_sld(unsigned long tifn) static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, 0), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, 1), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, 1), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, 1), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, 1), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, 1), {} }; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 4d13c57f370a..983cd53ed4c9 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -991,7 +991,15 @@ void __init e820__reserve_setup_data(void) while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - e820__range_update_kexec(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + + /* + * SETUP_EFI is supplied by kexec and does not need to be + * reserved. + */ + if (data->type != SETUP_EFI) + e820__range_update_kexec(pa_data, + sizeof(*data) + data->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); if (data->type == SETUP_INDIRECT && ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index d00f7c430e65..df63786e7bfa 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -21,7 +21,6 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/kprobes.h> -#include <linux/debugfs.h> #include <linux/nmi.h> #include <linux/swait.h> #include <asm/timer.h> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 8e3d0347b664..f362ce0d5ac0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -545,28 +545,20 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp, lockdep_assert_irqs_disabled(); - /* - * If TIF_SSBD is different, select the proper mitigation - * method. Note that if SSBD mitigation is disabled or permanentely - * enabled this branch can't be taken because nothing can set - * TIF_SSBD. - */ - if (tif_diff & _TIF_SSBD) { - if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) { + /* Handle change of TIF_SSBD depending on the mitigation method. 
*/ + if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) { + if (tif_diff & _TIF_SSBD) amd_set_ssb_virt_state(tifn); - } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) { + } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) { + if (tif_diff & _TIF_SSBD) amd_set_core_ssb_state(tifn); - } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - static_cpu_has(X86_FEATURE_AMD_SSBD)) { - msr |= ssbd_tif_to_spec_ctrl(tifn); - updmsr = true; - } + } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || + static_cpu_has(X86_FEATURE_AMD_SSBD)) { + updmsr |= !!(tif_diff & _TIF_SSBD); + msr |= ssbd_tif_to_spec_ctrl(tifn); } - /* - * Only evaluate TIF_SPEC_IB if conditional STIBP is enabled, - * otherwise avoid the MSR write. - */ + /* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled. */ if (IS_ENABLED(CONFIG_SMP) && static_branch_unlikely(&switch_to_cond_stibp)) { updmsr |= !!(tif_diff & _TIF_SPEC_IB); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index e040ba6be27b..0ec7ced727fe 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -197,6 +197,14 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"), }, }, + { /* Handle problems with rebooting on Apple MacBook6,1 */ + .callback = set_pci_reboot, + .ident = "Apple MacBook6,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBook6,1"), + }, + }, { /* Handle problems with rebooting on Apple MacBookPro5 */ .callback = set_pci_reboot, .ident = "Apple MacBookPro5", diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 371a6b348e44..e42faa792c07 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -25,10 +25,6 @@ #include <asm/hpet.h> #include <asm/time.h> -#ifdef CONFIG_X86_64 -__visible volatile unsigned long jiffies __cacheline_aligned_in_smp = INITIAL_JIFFIES; -#endif - unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index b4c6b6f35548..3bfc8dd8a43d 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -40,13 +40,13 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT) #ifdef CONFIG_X86_32 OUTPUT_ARCH(i386) ENTRY(phys_startup_32) -jiffies = jiffies_64; #else OUTPUT_ARCH(i386:x86-64) ENTRY(phys_startup_64) -jiffies_64 = jiffies; #endif +jiffies = jiffies_64; + #if defined(CONFIG_X86_64) /* * On 64-bit, align RODATA to 2MB so we retain large page mappings for diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 253b8e875ccd..8a294f9747aa 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -181,17 +181,14 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, r = -E2BIG; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) goto out; - r = -ENOMEM; if (cpuid->nent) { - cpuid_entries = - vmalloc(array_size(sizeof(struct kvm_cpuid_entry), - cpuid->nent)); - if (!cpuid_entries) - goto out; - r = -EFAULT; - if (copy_from_user(cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry))) + cpuid_entries = vmemdup_user(entries, + array_size(sizeof(struct kvm_cpuid_entry), + cpuid->nent)); + if (IS_ERR(cpuid_entries)) { + r = PTR_ERR(cpuid_entries); goto out; + } } for (i = 0; i < cpuid->nent; i++) { vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; @@ -211,8 +208,8 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, kvm_x86_ops.cpuid_update(vcpu); r = kvm_update_cpuid(vcpu); + kvfree(cpuid_entries); out: - 
vfree(cpuid_entries); return r; } @@ -325,7 +322,7 @@ void kvm_set_cpu_caps(void) ); kvm_cpu_cap_mask(CPUID_7_ECX, - F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) | + F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ @@ -334,6 +331,13 @@ void kvm_set_cpu_caps(void) if (cpuid_ecx(7) & F(LA57)) kvm_cpu_cap_set(X86_FEATURE_LA57); + /* + * PKU not yet implemented for shadow paging and requires OSPKE + * to be set on the host. Clear it if that is not the case + */ + if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) + kvm_cpu_cap_clear(X86_FEATURE_PKU); + kvm_cpu_cap_mask(CPUID_7_EDX, F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | @@ -426,7 +430,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cpu_caps); struct kvm_cpuid_array { struct kvm_cpuid_entry2 *entries; - const int maxnent; + int maxnent; int nent; }; @@ -870,7 +874,6 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_array array = { .nent = 0, - .maxnent = cpuid->nent, }; int r, i; @@ -887,6 +890,8 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, if (!array.entries) return -ENOMEM; + array.maxnent = cpuid->nent; + for (i = 0; i < ARRAY_SIZE(funcs); i++) { r = get_cpuid_func(&array, funcs[i], type); if (r) diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 018aebce33ff..7e818d64bb4d 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -43,22 +43,22 @@ static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n"); -void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry) { - debugfs_create_file("tsc-offset", 0444, vcpu->debugfs_dentry, vcpu, + debugfs_create_file("tsc-offset", 0444, debugfs_dentry, vcpu, &vcpu_tsc_offset_fops); if (lapic_in_kernel(vcpu)) debugfs_create_file("lapic_timer_advance_ns", 0444, - vcpu->debugfs_dentry, vcpu, + debugfs_dentry, vcpu, &vcpu_timer_advance_ns_fops); if (kvm_has_tsc_control) { debugfs_create_file("tsc-scaling-ratio", 0444, - vcpu->debugfs_dentry, vcpu, + debugfs_dentry, vcpu, &vcpu_tsc_scaling_fops); debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444, - vcpu->debugfs_dentry, vcpu, + debugfs_dentry, vcpu, &vcpu_tsc_scaling_frac_fops); } } diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index de5476f8683e..d0e2825ae617 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4800,8 +4800,12 @@ static const struct opcode twobyte_table[256] = { GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_10_0f_11), GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_10_0f_11), N, N, N, N, N, N, - D(ImplicitOps | ModRM | SrcMem | NoAccess), - N, N, N, N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess), + D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 4 * prefetch + 4 * reserved NOP */ + D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, + D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */ + D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */ + D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */ + D(ImplicitOps | ModRM | SrcMem | NoAccess), /* NOP + 7 * reserved NOP */ /* 0x20 - 0x2F */ DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, 
check_cr_read), DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read), diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index febca334c320..a6e218c6140d 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -462,7 +462,6 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu, if (channel == 3) { /* Read-Back Command. */ for (channel = 0; channel < 3; channel++) { - s = &pit_state->channels[channel]; if (val & (2 << channel)) { if (!(val & 0x20)) pit_latch_count(pit, channel); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 8a6db11dcb43..6bceafb19108 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -258,7 +258,7 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm) /* Only a few fields of int_ctl are written by the processor. */ mask = V_IRQ_MASK | V_TPR_MASK; if (!(svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) && - is_intercept(svm, SVM_EXIT_VINTR)) { + is_intercept(svm, INTERCEPT_VINTR)) { /* * In order to request an interrupt window, L0 is usurping * svm->vmcb->control.int_ctl and possibly setting V_IRQ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 7502cd65528f..8ccfa4197d9c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1378,6 +1378,8 @@ static void svm_clear_vintr(struct vcpu_svm *svm) /* Drop int_ctl fields related to VINTR injection. */ svm->vmcb->control.int_ctl &= mask; if (is_guest_mode(&svm->vcpu)) { + svm->nested.hsave->control.int_ctl &= mask; + WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) != (svm->nested.ctl.int_ctl & V_TPR_MASK)); svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & ~mask; @@ -1999,7 +2001,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value) */ if (vgif_enabled(svm)) clr_intercept(svm, INTERCEPT_STGI); - if (is_intercept(svm, SVM_EXIT_VINTR)) + if (is_intercept(svm, INTERCEPT_VINTR)) svm_clear_vintr(svm); enable_gif(svm); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 2e7238a57fc1..d1af20b050a8 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4624,19 +4624,24 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) } } -static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) +static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, + int *ret) { gva_t gva; struct x86_exception e; + int r; if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmcs_read32(VMX_INSTRUCTION_INFO), false, - sizeof(*vmpointer), &gva)) - return 1; + sizeof(*vmpointer), &gva)) { + *ret = 1; + return -EINVAL; + } - if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) { - kvm_inject_emulated_page_fault(vcpu, &e); - return 1; + r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); + if (r != X86EMUL_CONTINUE) { + *ret = vmx_handle_memory_failure(vcpu, r, &e); + return -EINVAL; } return 0; @@ -4764,8 +4769,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } - if (nested_vmx_get_vmptr(vcpu, &vmptr)) - return 1; + if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) + return ret; /* * SDM 3: 24.11.5 @@ -4838,12 +4843,13 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) u32 zero = 0; gpa_t vmptr; u64 evmcs_gpa; + int r; if (!nested_vmx_check_permission(vcpu)) return 1; - if (nested_vmx_get_vmptr(vcpu, &vmptr)) - return 1; + if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) + return r; if (!page_address_valid(vcpu, vmptr)) return nested_vmx_failValid(vcpu, @@ -4902,7 +4908,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) u64 
value; gva_t gva = 0; short offset; - int len; + int len, r; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -4943,10 +4949,9 @@ static int handle_vmread(struct kvm_vcpu *vcpu) instr_info, true, len, &gva)) return 1; /* _system ok, nested_vmx_check_permission has verified cpl=0 */ - if (kvm_write_guest_virt_system(vcpu, gva, &value, len, &e)) { - kvm_inject_emulated_page_fault(vcpu, &e); - return 1; - } + r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); + if (r != X86EMUL_CONTINUE) + return vmx_handle_memory_failure(vcpu, r, &e); } return nested_vmx_succeed(vcpu); @@ -4987,7 +4992,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) unsigned long field; short offset; gva_t gva; - int len; + int len, r; /* * The value to write might be 32 or 64 bits, depending on L1's long @@ -5017,10 +5022,9 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) if (get_vmx_mem_address(vcpu, exit_qualification, instr_info, false, len, &gva)) return 1; - if (kvm_read_guest_virt(vcpu, gva, &value, len, &e)) { - kvm_inject_emulated_page_fault(vcpu, &e); - return 1; - } + r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); + if (r != X86EMUL_CONTINUE) + return vmx_handle_memory_failure(vcpu, r, &e); } field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf)); @@ -5103,12 +5107,13 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); gpa_t vmptr; + int r; if (!nested_vmx_check_permission(vcpu)) return 1; - if (nested_vmx_get_vmptr(vcpu, &vmptr)) - return 1; + if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) + return r; if (!page_address_valid(vcpu, vmptr)) return nested_vmx_failValid(vcpu, @@ -5170,6 +5175,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; struct x86_exception e; gva_t gva; + int r; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -5181,11 +5187,11 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) true, sizeof(gpa_t), &gva)) return 1; /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ - if (kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, - sizeof(gpa_t), &e)) { - kvm_inject_emulated_page_fault(vcpu, &e); - return 1; - } + r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, + sizeof(gpa_t), &e); + if (r != X86EMUL_CONTINUE) + return vmx_handle_memory_failure(vcpu, r, &e); + return nested_vmx_succeed(vcpu); } @@ -5209,7 +5215,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) struct { u64 eptp, gpa; } operand; - int i; + int i, r; if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || @@ -5236,10 +5242,9 @@ static int handle_invept(struct kvm_vcpu *vcpu) if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmx_instruction_info, false, sizeof(operand), &gva)) return 1; - if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { - kvm_inject_emulated_page_fault(vcpu, &e); - return 1; - } + r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); + if (r != X86EMUL_CONTINUE) + return vmx_handle_memory_failure(vcpu, r, &e); /* * Nested EPT roots are always held through guest_mmu, @@ -5291,6 +5296,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) u64 gla; } operand; u16 vpid02; + int r; if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_VPID) || @@ -5318,10 +5324,10 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmx_instruction_info, false, sizeof(operand), &gva)) return 1; - if (kvm_read_guest_virt(vcpu, gva, &operand, 
sizeof(operand), &e)) { - kvm_inject_emulated_page_fault(vcpu, &e); - return 1; - } + r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); + if (r != X86EMUL_CONTINUE) + return vmx_handle_memory_failure(vcpu, r, &e); + if (operand.vpid >> 16) return nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); @@ -5666,7 +5672,7 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) { u32 intr_info; - switch (exit_reason) { + switch ((u16)exit_reason) { case EXIT_REASON_EXCEPTION_NMI: intr_info = vmx_get_intr_info(vcpu); if (is_nmi(intr_info)) @@ -5727,7 +5733,7 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); u32 intr_info; - switch (exit_reason) { + switch ((u16)exit_reason) { case EXIT_REASON_EXCEPTION_NMI: intr_info = vmx_get_intr_info(vcpu); if (is_nmi(intr_info)) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index d33d890b605f..bdcce65c7a1d 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -181,7 +181,7 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) ret = pmu->version > 1; break; case MSR_IA32_PERF_CAPABILITIES: - ret = guest_cpuid_has(vcpu, X86_FEATURE_PDCM); + ret = 1; break; default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2b5ba6063a2d..36c771728c8c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1600,6 +1600,32 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) return 1; } +/* + * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns + * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value + * indicates whether exit to userspace is needed. + */ +int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r, + struct x86_exception *e) +{ + if (r == X86EMUL_PROPAGATE_FAULT) { + kvm_inject_emulated_page_fault(vcpu, e); + return 1; + } + + /* + * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED + * while handling a VMX instruction KVM could've handled the request + * correctly by exiting to userspace and performing I/O but there + * doesn't seem to be a real use-case behind such requests, just return + * KVM_EXIT_INTERNAL_ERROR for now. + */ + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + + return 0; +} /* * Recognizes a pending MTF VM-exit and records the nested state for later @@ -5486,6 +5512,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) u64 pcid; u64 gla; } operand; + int r; if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -5508,10 +5535,9 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) sizeof(operand), &gva)) return 1; - if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { - kvm_inject_emulated_page_fault(vcpu, &e); - return 1; - } + r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); + if (r != X86EMUL_CONTINUE) + return vmx_handle_memory_failure(vcpu, r, &e); if (operand.pcid >> 12 != 0) { kvm_inject_gp(vcpu, 0); @@ -7282,10 +7308,6 @@ static __init void vmx_set_cpu_caps(void) if (vmx_pt_mode_is_host_guest()) kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); - /* PKU is not yet implemented for shadow paging. 
*/ - if (enable_ept && boot_cpu_has(X86_FEATURE_OSPKE)) - kvm_cpu_cap_check_and_set(X86_FEATURE_PKU); - if (vmx_umip_emulated()) kvm_cpu_cap_set(X86_FEATURE_UMIP); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 672c28f17e49..8a83b5edc820 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -355,6 +355,8 @@ struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr); void pt_update_intercept_for_msr(struct vcpu_vmx *vmx); void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); int vmx_find_msr_index(struct vmx_msrs *m, u32 msr); +int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r, + struct x86_exception *e); #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9e41b5135340..00c88c2f34e4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -239,8 +239,7 @@ u64 __read_mostly host_xcr0; u64 __read_mostly supported_xcr0; EXPORT_SYMBOL_GPL(supported_xcr0); -struct kmem_cache *x86_fpu_cache; -EXPORT_SYMBOL_GPL(x86_fpu_cache); +static struct kmem_cache *x86_fpu_cache; static struct kmem_cache *x86_emulator_cache; @@ -5647,13 +5646,6 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val, /* kvm_write_guest_virt_system can pull in tons of pages. */ vcpu->arch.l1tf_flush_l1d = true; - /* - * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED - * is returned, but our callers are not ready for that and they blindly - * call kvm_inject_page_fault. Ensure that they at least do not leak - * uninitialized kernel stack memory into cr2 and error code. - */ - memset(exception, 0, sizeof(*exception)); return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, PFERR_WRITE_MASK, exception); } @@ -7018,7 +7010,7 @@ restart: if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) { kvm_rip_write(vcpu, ctxt->eip); - if (r && ctxt->tf) + if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) r = kvm_vcpu_do_singlestep(vcpu); if (kvm_x86_ops.update_emulated_instruction) kvm_x86_ops.update_emulated_instruction(vcpu); @@ -8277,9 +8269,8 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap); } -int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end, - bool blockable) +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end) { unsigned long apic_address; @@ -8290,8 +8281,6 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); if (start <= apic_address && apic_address < end) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); - - return 0; } void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) @@ -9962,13 +9951,8 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) if (!slot || !slot->npages) return 0; - /* - * Stuff a non-canonical value to catch use-after-delete. This - * ends up being 0 on 32-bit KVM, but there's no better - * alternative. 
- */ - hva = (unsigned long)(0xdeadull << 48); old_npages = slot->npages; + hva = 0; } for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { @@ -10140,43 +10124,65 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, } static void kvm_mmu_slot_apply_flags(struct kvm *kvm, - struct kvm_memory_slot *new) + struct kvm_memory_slot *old, + struct kvm_memory_slot *new, + enum kvm_mr_change change) { - /* Still write protect RO slot */ - if (new->flags & KVM_MEM_READONLY) { - kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K); + /* + * Nothing to do for RO slots or CREATE/MOVE/DELETE of a slot. + * See comments below. + */ + if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY)) return; - } /* - * Call kvm_x86_ops dirty logging hooks when they are valid. - * - * kvm_x86_ops.slot_disable_log_dirty is called when: - * - * - KVM_MR_CREATE with dirty logging is disabled - * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag - * - * The reason is, in case of PML, we need to set D-bit for any slots - * with dirty logging disabled in order to eliminate unnecessary GPA - * logging in PML buffer (and potential PML buffer full VMEXIT). This - * guarantees leaving PML enabled during guest's lifetime won't have - * any additional overhead from PML when guest is running with dirty - * logging disabled for memory slots. + * Dirty logging tracks sptes in 4k granularity, meaning that large + * sptes have to be split. If live migration is successful, the guest + * in the source machine will be destroyed and large sptes will be + * created in the destination. However, if the guest continues to run + * in the source machine (for example if live migration fails), small + * sptes will remain around and cause bad performance. * - * kvm_x86_ops.slot_enable_log_dirty is called when switching new slot - * to dirty logging mode. + * Scan sptes if dirty logging has been stopped, dropping those + * which can be collapsed into a single large-page spte. Later + * page faults will create the large-page sptes. * - * If kvm_x86_ops dirty logging hooks are invalid, use write protect. + * There is no need to do this in any of the following cases: + * CREATE: No dirty mappings will already exist. + * MOVE/DELETE: The old mappings will already have been cleaned up by + * kvm_arch_flush_shadow_memslot() + */ + if ((old->flags & KVM_MEM_LOG_DIRTY_PAGES) && + !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) + kvm_mmu_zap_collapsible_sptes(kvm, new); + + /* + * Enable or disable dirty logging for the slot. * - * In case of write protect: + * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of the old + * slot have been zapped so no dirty logging updates are needed for + * the old slot. + * For KVM_MR_CREATE and KVM_MR_MOVE, once the new slot is visible + * any mappings that might be created in it will consume the + * properties of the new slot and do not need to be updated here. * - * Write protect all pages for dirty logging. + * When PML is enabled, the kvm_x86_ops dirty logging hooks are + * called to enable/disable dirty logging. * - * All the sptes including the large sptes which point to this - * slot are set to readonly. We can not create any new large - * spte on this slot until the end of the logging. + * When disabling dirty logging with PML enabled, the D-bit is set + * for sptes in the slot in order to prevent unnecessary GPA + * logging in the PML buffer (and potential PML buffer full VMEXIT). 
+ * This guarantees leaving PML enabled for the guest's lifetime + * won't have any additional overhead from PML when the guest is + * running with dirty logging disabled. * + * When enabling dirty logging, large sptes are write-protected + * so they can be split on first write. New large sptes cannot + * be created for this slot until the end of the logging. * See the comments in fast_page_fault(). + * For small sptes, nothing is done if the dirty log is in the + * initial-all-set state. Otherwise, depending on whether pml + * is enabled the D-bit or the W-bit will be cleared. */ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { if (kvm_x86_ops.slot_enable_log_dirty) { @@ -10213,39 +10219,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, kvm_mmu_calculate_default_mmu_pages(kvm)); /* - * Dirty logging tracks sptes in 4k granularity, meaning that large - * sptes have to be split. If live migration is successful, the guest - * in the source machine will be destroyed and large sptes will be - * created in the destination. However, if the guest continues to run - * in the source machine (for example if live migration fails), small - * sptes will remain around and cause bad performance. - * - * Scan sptes if dirty logging has been stopped, dropping those - * which can be collapsed into a single large-page spte. Later - * page faults will create the large-page sptes. - * - * There is no need to do this in any of the following cases: - * CREATE: No dirty mappings will already exist. - * MOVE/DELETE: The old mappings will already have been cleaned up by - * kvm_arch_flush_shadow_memslot() - */ - if (change == KVM_MR_FLAGS_ONLY && - (old->flags & KVM_MEM_LOG_DIRTY_PAGES) && - !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) - kvm_mmu_zap_collapsible_sptes(kvm, new); - - /* - * Set up write protection and/or dirty logging for the new slot. - * - * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have - * been zapped so no dirty logging staff is needed for old slot. For - * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the - * new and it's also covered when dealing with the new slot. - * * FIXME: const-ify all uses of struct kvm_memory_slot. */ - if (change != KVM_MR_DELETE) - kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new); + kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change); /* Free the arrays associated with the old memslot. */ if (change == KVM_MR_MOVE) @@ -10530,7 +10506,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) return kvm_arch_interrupt_allowed(vcpu); } -void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, +bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { struct x86_exception fault; @@ -10547,6 +10523,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, fault.address = work->arch.token; fault.async_page_fault = true; kvm_inject_page_fault(vcpu, &fault); + return true; } else { /* * It is not possible to deliver a paravirtualized asynchronous @@ -10557,6 +10534,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, * fault is retried, hopefully the page will be ready in the host. 
*/ kvm_make_request(KVM_REQ_APF_HALT, vcpu); + return false; } } @@ -10574,7 +10552,8 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, kvm_del_async_pf_gfn(vcpu, work->arch.gfn); trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa); - if (kvm_pv_async_pf_enabled(vcpu) && + if ((work->wakeup_all || work->notpresent_injected) && + kvm_pv_async_pf_enabled(vcpu) && !apf_put_user_ready(vcpu, work->arch.token)) { vcpu->arch.apf.pageready_pending = true; kvm_apic_set_irq(vcpu, &irq, NULL); diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 5246db42de45..6110bce7237b 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -6,10 +6,19 @@ # Produces uninteresting flaky coverage. KCOV_INSTRUMENT_delay.o := n +# KCSAN uses udelay for introducing watchpoint delay; avoid recursion. +KCSAN_SANITIZE_delay.o := n +ifdef CONFIG_KCSAN +# In case KCSAN+lockdep+ftrace are enabled, disable ftrace for delay.o to avoid +# lockdep -> [other libs] -> KCSAN -> udelay -> ftrace -> lockdep recursion. +CFLAGS_REMOVE_delay.o = $(CC_FLAGS_FTRACE) +endif + # Early boot use of cmdline; don't instrument it ifdef CONFIG_AMD_MEM_ENCRYPT KCOV_INSTRUMENT_cmdline.o := n KASAN_SANITIZE_cmdline.o := n +KCSAN_SANITIZE_cmdline.o := n ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_cmdline.o = -pg diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 98f7c6fa2eaa..f7fd0e868c9c 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -7,6 +7,10 @@ KCOV_INSTRUMENT_mem_encrypt_identity.o := n KASAN_SANITIZE_mem_encrypt.o := n KASAN_SANITIZE_mem_encrypt_identity.o := n +# Disable KCSAN entirely, because otherwise we get warnings that some functions +# reference __initdata sections. +KCSAN_SANITIZE := n + ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_mem_encrypt.o = -pg CFLAGS_REMOVE_mem_encrypt_identity.o = -pg diff --git a/arch/x86/purgatory/.gitignore b/arch/x86/purgatory/.gitignore new file mode 100644 index 000000000000..d2be1500671d --- /dev/null +++ b/arch/x86/purgatory/.gitignore @@ -0,0 +1 @@ +purgatory.chk diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index fb4ee5444379..b04e6e72a592 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -14,10 +14,18 @@ $(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE CFLAGS_sha256.o := -D__DISABLE_EXPORTS -LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib -targets += purgatory.ro - +# When linking purgatory.ro with -r unresolved symbols are not checked, +# also link a purgatory.chk binary without -r to check for unresolved symbols. +PURGATORY_LDFLAGS := -e purgatory_start -nostdlib -z nodefaultlib +LDFLAGS_purgatory.ro := -r $(PURGATORY_LDFLAGS) +LDFLAGS_purgatory.chk := $(PURGATORY_LDFLAGS) +targets += purgatory.ro purgatory.chk + +# Sanitizer, etc. runtimes are unavailable and cannot be linked here. +GCOV_PROFILE := n KASAN_SANITIZE := n +UBSAN_SANITIZE := n +KCSAN_SANITIZE := n KCOV_INSTRUMENT := n # These are adjustments to the compiler flags used for objects that @@ -25,7 +33,7 @@ KCOV_INSTRUMENT := n PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel PURGATORY_CFLAGS := -mcmodel=large -ffreestanding -fno-zero-initialized-in-bss -PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) +PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) -DDISABLE_BRANCH_PROFILING # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. 
That # in turn leaves some undefined symbols like __fentry__ in purgatory and not @@ -58,12 +66,15 @@ CFLAGS_string.o += $(PURGATORY_CFLAGS) $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE $(call if_changed,ld) +$(obj)/purgatory.chk: $(obj)/purgatory.ro FORCE + $(call if_changed,ld) + targets += kexec-purgatory.c quiet_cmd_bin2c = BIN2C $@ cmd_bin2c = $(objtree)/scripts/bin2c kexec_purgatory < $< > $@ -$(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE +$(obj)/kexec-purgatory.c: $(obj)/purgatory.ro $(obj)/purgatory.chk FORCE $(call if_changed,bin2c) obj-$(CONFIG_KEXEC_FILE) += kexec-purgatory.o diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile index 682c895753d9..6b1f3a4eeb44 100644 --- a/arch/x86/realmode/Makefile +++ b/arch/x86/realmode/Makefile @@ -6,7 +6,10 @@ # for more details. # # + +# Sanitizer runtimes are unavailable and cannot be linked here. KASAN_SANITIZE := n +KCSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y subdir- := rm diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index b11ec5d8f8ac..83f1b6a56449 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -6,7 +6,10 @@ # for more details. # # + +# Sanitizer runtimes are unavailable and cannot be linked here. KASAN_SANITIZE := n +KCSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. |